blob: 5ba99514d29691a6a9399be7025402d28a3c1389 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner45876a92020-02-12 22:32:34 +010044#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010045#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_initconfig.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020047#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinnerbcda8f12018-11-21 22:27:47 +010048#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020049#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040050#include "pycore_pylifecycle.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020051#include "pycore_pystate.h" // _PyInterpreterState_GET()
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000052#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070053#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000054
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000055#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000056#include <windows.h>
57#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000058
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
61/* #define INTERNED_STATS 1 */
62
63
Larry Hastings61272b72014-01-07 12:41:53 -080064/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090065class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080066[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090067/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
68
69/*[python input]
70class Py_UCS4_converter(CConverter):
71 type = 'Py_UCS4'
72 converter = 'convert_uc'
73
74 def converter_init(self):
75 if self.default is not unspecified:
76 self.c_default = ascii(self.default)
77 if len(self.c_default) > 4 or self.c_default[0] != "'":
78 self.c_default = hex(ord(self.default))
79
80[python start generated code]*/
81/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080082
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
Serhiy Storchaka05997252013-01-26 12:14:02 +020085NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000088
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Victor Stinner8faf8212011-12-08 22:14:11 +010096/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
97#define MAX_UNICODE 0x10ffff
98
/* Sanity check used inside assert() by the accessor macros of this file.
   Debug builds run the structural consistency checker (without the O(n)
   content scan); release builds only test the object type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200104
/* Unchecked access to the utf8 pointer of the compact header. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer of a ready string.  A compact ASCII string has no separate
   buffer: the address right after its PyASCIIObject header is returned
   instead of the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char *)((PyASCIIObject *)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field of the compact header. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Length of the UTF-8 form of a ready string.  For a compact ASCII string
   this is the length field of the PyASCIIObject header. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject *)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked access to the wstr pointer. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900123
/* Don't use the deprecated macro of unicodeobject.h: redefine it locally.

   Length of the wchar_t representation: the length field for compact ASCII
   strings, the wstr_length field of the compact header otherwise.

   The argument is parenthesized inside each cast so that an expression
   argument (e.g. pointer arithmetic) is cast as a whole rather than only
   its first operand. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject *)(op))->length : \
         ((PyCompactUnicodeObject *)(op))->wstr_length)
/* Field accessors.  _PyUnicode_KIND() and _PyUnicode_GET_LENGTH() assert
   _PyUnicode_CHECK(op) first; the others perform no check at all. */
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)

#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)

#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)

#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)

#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200146
/* Local replacement for PyUnicode_READY(): evaluates to 0 when the string
   is already ready, otherwise to the result of _PyUnicode_Ready(op).
   The object is validated with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? \
          0 : \
          _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200153
/* True if the utf8 pointer aliases the character data of the string.
   Must not be used on compact ASCII strings. */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer aliases the character data of the string.
   The macro only refers to its own argument, so it does not depend on a
   variable named "unicode" being in scope at the call site. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
161
/* True if the Unicode object owns a separately allocated UTF-8 block,
   i.e. it is not compact ASCII, utf8 is set and utf8 does not point at the
   character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    ((!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr block,
   i.e. wstr is set and either the string is not ready yet or wstr does not
   point at the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    ((_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
175
/* Generic helper macro converting a run of characters from one character
   type to another.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (pointers to "from_type"); to points to the output
   buffer of "to_type" elements.  Each argument is evaluated exactly once.

   Characters are copied four at a time for as long as full groups of four
   remain; a plain loop handles the tail. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_dst = (to_type *)(to);                             \
        const from_type *_src = (const from_type *)(begin);          \
        const from_type *_stop = (const from_type *)(end);           \
        Py_ssize_t _count = (_stop) - (_src);                        \
        const from_type *_block_stop =                               \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                   \
        while (_src < (_block_stop)) {                               \
            _dst[0] = (to_type) _src[0];                             \
            _dst[1] = (to_type) _src[1];                             \
            _dst[2] = (to_type) _src[2];                             \
            _dst[3] = (to_type) _src[3];                             \
            _src += 4; _dst += 4;                                    \
        }                                                            \
        while (_src < (_stop))                                       \
            *_dst++ = (to_type) *_src++;                             \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200199
/* Over-allocation factor: 2 stands for 50% and 4 for 25%. */
#ifdef MS_WINDOWS
   /* On Windows, over-allocating by 50% works best */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, over-allocating by 25% works best */
#  define OVERALLOCATE_FACTOR 4
#endif

/* bpo-40521: Interned strings are shared by all interpreters, so they are
   only enabled when subinterpreters are not isolated. */
#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
#  define INTERNED_STRINGS
#endif
212
/* Dictionary holding all interned unicode strings.  The references held by
   this dictionary are *not* counted in a string's ob_refcnt: when an
   interned string reaches a refcnt of 0, the string deallocation function
   deletes the reference from this dictionary.

   Another way to look at this is to say that the actual reference count of
   a string is: s->ob_refcnt + (s->state ? 2 : 0)
*/
#ifdef INTERNED_STRINGS
static PyObject *interned = NULL;
#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000224
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200225static struct _Py_unicode_state*
226get_unicode_state(void)
227{
228 PyInterpreterState *interp = _PyInterpreterState_GET();
229 return &interp->unicode;
230}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200231
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000232
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200233// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200234static inline PyObject* unicode_get_empty(void)
235{
236 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200237 // unicode_get_empty() must not be called before _PyUnicode_Init()
238 // or after _PyUnicode_Fini()
239 assert(state->empty != NULL);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200240 return state->empty;
241}
242
243static inline PyObject* unicode_new_empty(void)
244{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200245 PyObject *empty = unicode_get_empty();
246 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200247 return empty;
248}
249
/* Return a new reference to the empty string singleton from the enclosing
   function. */
#define _Py_RETURN_UNICODE_EMPTY()      \
    do {                                \
        return unicode_new_empty();     \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254
Victor Stinner59423e32018-11-26 13:40:01 +0100255static inline void
256unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
257 Py_ssize_t start, Py_ssize_t length)
258{
259 assert(0 <= start);
260 assert(kind != PyUnicode_WCHAR_KIND);
261 switch (kind) {
262 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100263 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100264 Py_UCS1 ch = (unsigned char)value;
265 Py_UCS1 *to = (Py_UCS1 *)data + start;
266 memset(to, ch, length);
267 break;
268 }
269 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100270 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100271 Py_UCS2 ch = (Py_UCS2)value;
272 Py_UCS2 *to = (Py_UCS2 *)data + start;
273 const Py_UCS2 *end = to + length;
274 for (; to < end; ++to) *to = ch;
275 break;
276 }
277 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100278 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100279 Py_UCS4 ch = value;
280 Py_UCS4 * to = (Py_UCS4 *)data + start;
281 const Py_UCS4 *end = to + length;
282 for (; to < end; ++to) *to = ch;
283 break;
284 }
285 default: Py_UNREACHABLE();
286 }
287}
288
289
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200290/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700291static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200292_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900293static inline void
294_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400295static PyObject *
296unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
297 const char *errors);
298static PyObject *
299unicode_decode_utf8(const char *s, Py_ssize_t size,
300 _Py_error_handler error_handler, const char *errors,
301 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200302
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200303/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200304static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200305
/* Fast detection of the most frequent whitespace characters: a lookup
   table indexed by ASCII code (0..127), 1 for whitespace, 0 otherwise.
   Entries that are not listed are zero-initialized. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
336
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200337/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200338static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200339static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100340static int unicode_modifiable(PyObject *unicode);
341
Victor Stinnerfe226c02011-10-03 03:52:20 +0200342
Alexander Belopolsky40018472011-02-26 01:02:56 +0000343static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100344_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200345static PyObject *
346_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
347static PyObject *
348_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
349
350static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000351unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000352 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100353 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000354 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
355
Alexander Belopolsky40018472011-02-26 01:02:56 +0000356static void
357raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300358 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100359 PyObject *unicode,
360 Py_ssize_t startpos, Py_ssize_t endpos,
361 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000362
/* Fast detection of line break characters: a lookup table indexed by ASCII
   code (0..127), 1 for a line break, 0 otherwise.  Entries that are not
   listed are zero-initialized. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
390
INADA Naoki3ae20562017-01-16 20:41:20 +0900391static int convert_uc(PyObject *obj, void *addr);
392
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300393#include "clinic/unicodeobject.c.h"
394
Victor Stinner3d4226a2018-08-29 22:21:32 +0200395_Py_error_handler
396_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200397{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200398 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200399 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200400 }
401 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200402 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200403 }
404 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200405 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200406 }
407 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200408 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200409 }
410 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200411 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200412 }
413 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200414 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200415 }
416 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200417 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200418 }
Victor Stinner50149202015-09-22 00:26:54 +0200419 return _Py_ERROR_OTHER;
420}
421
Victor Stinner709d23d2019-05-02 14:56:30 -0400422
423static _Py_error_handler
424get_error_handler_wide(const wchar_t *errors)
425{
426 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
427 return _Py_ERROR_STRICT;
428 }
429 if (wcscmp(errors, L"surrogateescape") == 0) {
430 return _Py_ERROR_SURROGATEESCAPE;
431 }
432 if (wcscmp(errors, L"replace") == 0) {
433 return _Py_ERROR_REPLACE;
434 }
435 if (wcscmp(errors, L"ignore") == 0) {
436 return _Py_ERROR_IGNORE;
437 }
438 if (wcscmp(errors, L"backslashreplace") == 0) {
439 return _Py_ERROR_BACKSLASHREPLACE;
440 }
441 if (wcscmp(errors, L"surrogatepass") == 0) {
442 return _Py_ERROR_SURROGATEPASS;
443 }
444 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
445 return _Py_ERROR_XMLCHARREFREPLACE;
446 }
447 return _Py_ERROR_OTHER;
448}
449
450
Victor Stinner22eb6892019-06-26 00:51:05 +0200451static inline int
452unicode_check_encoding_errors(const char *encoding, const char *errors)
453{
454 if (encoding == NULL && errors == NULL) {
455 return 0;
456 }
457
Victor Stinner81a7be32020-04-14 15:14:01 +0200458 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200459#ifndef Py_DEBUG
460 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200461 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200462 return 0;
463 }
464#else
465 /* Always check in debug mode */
466#endif
467
468 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
469 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200470 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200471 return 0;
472 }
473
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200474 /* Disable checks during Python finalization. For example, it allows to
475 call _PyObject_Dump() during finalization for debugging purpose. */
476 if (interp->finalizing) {
477 return 0;
478 }
479
Victor Stinner22eb6892019-06-26 00:51:05 +0200480 if (encoding != NULL) {
481 PyObject *handler = _PyCodec_Lookup(encoding);
482 if (handler == NULL) {
483 return -1;
484 }
485 Py_DECREF(handler);
486 }
487
488 if (errors != NULL) {
489 PyObject *handler = PyCodec_LookupError(errors);
490 if (handler == NULL) {
491 return -1;
492 }
493 Py_DECREF(handler);
494 }
495 return 0;
496}
497
498
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300499/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
500 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000501Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000502PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000503{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000504#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000505 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000506#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000507 /* This is actually an illegal character, so it should
508 not be passed to unichr. */
509 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000510#endif
511}
512
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200513int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100514_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200515{
Victor Stinner68762572019-10-07 18:42:01 +0200516#define CHECK(expr) \
517 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
518
Victor Stinner910337b2011-10-03 03:20:16 +0200519 PyASCIIObject *ascii;
520 unsigned int kind;
521
Victor Stinner68762572019-10-07 18:42:01 +0200522 assert(op != NULL);
523 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200524
525 ascii = (PyASCIIObject *)op;
526 kind = ascii->state.kind;
527
Victor Stinnera3b334d2011-10-03 13:53:37 +0200528 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200529 CHECK(kind == PyUnicode_1BYTE_KIND);
530 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200531 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200532 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200533 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200534 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200535
Victor Stinnera41463c2011-10-04 01:05:08 +0200536 if (ascii->state.compact == 1) {
537 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200538 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200539 || kind == PyUnicode_2BYTE_KIND
540 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200541 CHECK(ascii->state.ascii == 0);
542 CHECK(ascii->state.ready == 1);
543 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100544 }
545 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200546 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
547
548 data = unicode->data.any;
549 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200550 CHECK(ascii->length == 0);
551 CHECK(ascii->hash == -1);
552 CHECK(ascii->state.compact == 0);
553 CHECK(ascii->state.ascii == 0);
554 CHECK(ascii->state.ready == 0);
555 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
556 CHECK(ascii->wstr != NULL);
557 CHECK(data == NULL);
558 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200559 }
560 else {
Victor Stinner68762572019-10-07 18:42:01 +0200561 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200562 || kind == PyUnicode_2BYTE_KIND
563 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200564 CHECK(ascii->state.compact == 0);
565 CHECK(ascii->state.ready == 1);
566 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200567 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200568 CHECK(compact->utf8 == data);
569 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200570 }
571 else
Victor Stinner68762572019-10-07 18:42:01 +0200572 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200573 }
574 }
575 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200576 if (
577#if SIZEOF_WCHAR_T == 2
578 kind == PyUnicode_2BYTE_KIND
579#else
580 kind == PyUnicode_4BYTE_KIND
581#endif
582 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200583 {
Victor Stinner68762572019-10-07 18:42:01 +0200584 CHECK(ascii->wstr == data);
585 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200586 } else
Victor Stinner68762572019-10-07 18:42:01 +0200587 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200588 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200589
590 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200591 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200592 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200593 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200594 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200595
596 /* check that the best kind is used: O(n) operation */
597 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200598 Py_ssize_t i;
599 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300600 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200601 Py_UCS4 ch;
602
603 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200604 for (i=0; i < ascii->length; i++)
605 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200606 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200607 if (ch > maxchar)
608 maxchar = ch;
609 }
610 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100611 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200612 CHECK(maxchar >= 128);
613 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100614 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200615 else
Victor Stinner68762572019-10-07 18:42:01 +0200616 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200617 }
Victor Stinner77faf692011-11-20 18:56:05 +0100618 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200619 CHECK(maxchar >= 0x100);
620 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100621 }
622 else {
Victor Stinner68762572019-10-07 18:42:01 +0200623 CHECK(maxchar >= 0x10000);
624 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100625 }
Victor Stinner68762572019-10-07 18:42:01 +0200626 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200627 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400628 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200629
630#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400631}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200632
Victor Stinner910337b2011-10-03 03:20:16 +0200633
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100634static PyObject*
635unicode_result_wchar(PyObject *unicode)
636{
637#ifndef Py_DEBUG
638 Py_ssize_t len;
639
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100640 len = _PyUnicode_WSTR_LENGTH(unicode);
641 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100642 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200643 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100644 }
645
646 if (len == 1) {
647 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100648 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100649 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200650 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100651 }
652 }
653
654 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200655 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100656 return NULL;
657 }
658#else
Victor Stinneraa771272012-10-04 02:32:58 +0200659 assert(Py_REFCNT(unicode) == 1);
660
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100661 /* don't make the result ready in debug mode to ensure that the caller
662 makes the string ready before using it */
663 assert(_PyUnicode_CheckConsistency(unicode, 1));
664#endif
665 return unicode;
666}
667
668static PyObject*
669unicode_result_ready(PyObject *unicode)
670{
671 Py_ssize_t length;
672
673 length = PyUnicode_GET_LENGTH(unicode);
674 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200675 PyObject *empty = unicode_get_empty();
676 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100677 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200678 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100679 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200680 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100681 }
682
683 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200684 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200685 if (kind == PyUnicode_1BYTE_KIND) {
686 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
687 Py_UCS1 ch = data[0];
688 struct _Py_unicode_state *state = get_unicode_state();
689 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100690 if (latin1_char != NULL) {
691 if (unicode != latin1_char) {
692 Py_INCREF(latin1_char);
693 Py_DECREF(unicode);
694 }
695 return latin1_char;
696 }
697 else {
698 assert(_PyUnicode_CheckConsistency(unicode, 1));
699 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200700 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100701 return unicode;
702 }
703 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200704 else {
705 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
706 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100707 }
708
709 assert(_PyUnicode_CheckConsistency(unicode, 1));
710 return unicode;
711}
712
713static PyObject*
714unicode_result(PyObject *unicode)
715{
716 assert(_PyUnicode_CHECK(unicode));
717 if (PyUnicode_IS_READY(unicode))
718 return unicode_result_ready(unicode);
719 else
720 return unicode_result_wchar(unicode);
721}
722
Victor Stinnerc4b49542011-12-11 22:44:26 +0100723static PyObject*
724unicode_result_unchanged(PyObject *unicode)
725{
726 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500727 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100728 return NULL;
729 Py_INCREF(unicode);
730 return unicode;
731 }
732 else
733 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100734 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100735}
736
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200737/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
738 ASCII, Latin1, UTF-8, etc. */
739static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200740backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200741 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
742{
Victor Stinnerad771582015-10-09 12:38:53 +0200743 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200744 Py_UCS4 ch;
745 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300746 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200747
748 assert(PyUnicode_IS_READY(unicode));
749 kind = PyUnicode_KIND(unicode);
750 data = PyUnicode_DATA(unicode);
751
752 size = 0;
753 /* determine replacement size */
754 for (i = collstart; i < collend; ++i) {
755 Py_ssize_t incr;
756
757 ch = PyUnicode_READ(kind, data, i);
758 if (ch < 0x100)
759 incr = 2+2;
760 else if (ch < 0x10000)
761 incr = 2+4;
762 else {
763 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200764 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200765 }
766 if (size > PY_SSIZE_T_MAX - incr) {
767 PyErr_SetString(PyExc_OverflowError,
768 "encoded result is too long for a Python string");
769 return NULL;
770 }
771 size += incr;
772 }
773
Victor Stinnerad771582015-10-09 12:38:53 +0200774 str = _PyBytesWriter_Prepare(writer, str, size);
775 if (str == NULL)
776 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777
778 /* generate replacement */
779 for (i = collstart; i < collend; ++i) {
780 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200781 *str++ = '\\';
782 if (ch >= 0x00010000) {
783 *str++ = 'U';
784 *str++ = Py_hexdigits[(ch>>28)&0xf];
785 *str++ = Py_hexdigits[(ch>>24)&0xf];
786 *str++ = Py_hexdigits[(ch>>20)&0xf];
787 *str++ = Py_hexdigits[(ch>>16)&0xf];
788 *str++ = Py_hexdigits[(ch>>12)&0xf];
789 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200790 }
Victor Stinner797485e2015-10-09 03:17:30 +0200791 else if (ch >= 0x100) {
792 *str++ = 'u';
793 *str++ = Py_hexdigits[(ch>>12)&0xf];
794 *str++ = Py_hexdigits[(ch>>8)&0xf];
795 }
796 else
797 *str++ = 'x';
798 *str++ = Py_hexdigits[(ch>>4)&0xf];
799 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200800 }
801 return str;
802}
803
804/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
805 ASCII, Latin1, UTF-8, etc. */
806static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200807xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200808 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
809{
Victor Stinnerad771582015-10-09 12:38:53 +0200810 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200811 Py_UCS4 ch;
812 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300813 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200814
815 assert(PyUnicode_IS_READY(unicode));
816 kind = PyUnicode_KIND(unicode);
817 data = PyUnicode_DATA(unicode);
818
819 size = 0;
820 /* determine replacement size */
821 for (i = collstart; i < collend; ++i) {
822 Py_ssize_t incr;
823
824 ch = PyUnicode_READ(kind, data, i);
825 if (ch < 10)
826 incr = 2+1+1;
827 else if (ch < 100)
828 incr = 2+2+1;
829 else if (ch < 1000)
830 incr = 2+3+1;
831 else if (ch < 10000)
832 incr = 2+4+1;
833 else if (ch < 100000)
834 incr = 2+5+1;
835 else if (ch < 1000000)
836 incr = 2+6+1;
837 else {
838 assert(ch <= MAX_UNICODE);
839 incr = 2+7+1;
840 }
841 if (size > PY_SSIZE_T_MAX - incr) {
842 PyErr_SetString(PyExc_OverflowError,
843 "encoded result is too long for a Python string");
844 return NULL;
845 }
846 size += incr;
847 }
848
Victor Stinnerad771582015-10-09 12:38:53 +0200849 str = _PyBytesWriter_Prepare(writer, str, size);
850 if (str == NULL)
851 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200852
853 /* generate replacement */
854 for (i = collstart; i < collend; ++i) {
855 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
856 }
857 return str;
858}
859
Thomas Wouters477c8d52006-05-27 19:21:47 +0000860/* --- Bloom Filters ----------------------------------------------------- */
861
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant bits of each Unicode character (as many as are needed to
   index BLOOM_WIDTH bits, i.e. 5, 6 or 7) as the bit index. */
865
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200866/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000867
Antoine Pitrouf068f942010-01-13 14:19:12 +0000868#if LONG_BIT >= 128
869#define BLOOM_WIDTH 128
870#elif LONG_BIT >= 64
871#define BLOOM_WIDTH 64
872#elif LONG_BIT >= 32
873#define BLOOM_WIDTH 32
874#else
875#error "LONG_BIT is smaller than 32"
876#endif
877
Thomas Wouters477c8d52006-05-27 19:21:47 +0000878#define BLOOM_MASK unsigned long
879
Serhiy Storchaka05997252013-01-26 12:14:02 +0200880static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000881
Antoine Pitrouf068f942010-01-13 14:19:12 +0000882#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000883
Benjamin Peterson29060642009-01-31 22:14:21 +0000884#define BLOOM_LINEBREAK(ch) \
885 ((ch) < 128U ? ascii_linebreak[(ch)] : \
886 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000887
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700888static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300889make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000890{
Victor Stinnera85af502013-04-09 21:53:54 +0200891#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
892 do { \
893 TYPE *data = (TYPE *)PTR; \
894 TYPE *end = data + LEN; \
895 Py_UCS4 ch; \
896 for (; data != end; data++) { \
897 ch = *data; \
898 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
899 } \
900 break; \
901 } while (0)
902
Thomas Wouters477c8d52006-05-27 19:21:47 +0000903 /* calculate simple bloom-style bitmask for a given unicode string */
904
Antoine Pitrouf068f942010-01-13 14:19:12 +0000905 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000906
907 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200908 switch (kind) {
909 case PyUnicode_1BYTE_KIND:
910 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
911 break;
912 case PyUnicode_2BYTE_KIND:
913 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
914 break;
915 case PyUnicode_4BYTE_KIND:
916 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
917 break;
918 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700919 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200920 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000921 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200922
923#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000924}
925
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300926static int
927ensure_unicode(PyObject *obj)
928{
929 if (!PyUnicode_Check(obj)) {
930 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200931 "must be str, not %.100s",
932 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300933 return -1;
934 }
935 return PyUnicode_READY(obj);
936}
937
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200938/* Compilation of templated routines */
939
Victor Stinner90ed8a62020-06-24 00:34:07 +0200940#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200941
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200942#include "stringlib/asciilib.h"
943#include "stringlib/fastsearch.h"
944#include "stringlib/partition.h"
945#include "stringlib/split.h"
946#include "stringlib/count.h"
947#include "stringlib/find.h"
948#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200949#include "stringlib/undef.h"
950
951#include "stringlib/ucs1lib.h"
952#include "stringlib/fastsearch.h"
953#include "stringlib/partition.h"
954#include "stringlib/split.h"
955#include "stringlib/count.h"
956#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300957#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200958#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200959#include "stringlib/undef.h"
960
961#include "stringlib/ucs2lib.h"
962#include "stringlib/fastsearch.h"
963#include "stringlib/partition.h"
964#include "stringlib/split.h"
965#include "stringlib/count.h"
966#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300967#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200968#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200969#include "stringlib/undef.h"
970
971#include "stringlib/ucs4lib.h"
972#include "stringlib/fastsearch.h"
973#include "stringlib/partition.h"
974#include "stringlib/split.h"
975#include "stringlib/count.h"
976#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300977#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200978#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200979#include "stringlib/undef.h"
980
Inada Naoki2c4928d2020-06-17 20:09:44 +0900981_Py_COMP_DIAG_PUSH
982_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200983#include "stringlib/unicodedefs.h"
984#include "stringlib/fastsearch.h"
985#include "stringlib/count.h"
986#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100987#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900988_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200989
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200990#undef STRINGLIB_GET_EMPTY
991
Guido van Rossumd57fd912000-03-10 22:53:23 +0000992/* --- Unicode Object ----------------------------------------------------- */
993
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700994static inline Py_ssize_t
995findchar(const void *s, int kind,
996 Py_ssize_t size, Py_UCS4 ch,
997 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200998{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200999 switch (kind) {
1000 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001001 if ((Py_UCS1) ch != ch)
1002 return -1;
1003 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001004 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001005 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001006 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001007 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001008 if ((Py_UCS2) ch != ch)
1009 return -1;
1010 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001011 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001012 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001013 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001014 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001015 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001016 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001017 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001018 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001019 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001020 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001022}
1023
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters added by a resize, i.e. those from index old_length
   up to the current length, are overwritten with 0xff bytes; nothing is
   done when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* The kind value is used as the size of one character in bytes. */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1042
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043static PyObject*
1044resize_compact(PyObject *unicode, Py_ssize_t length)
1045{
1046 Py_ssize_t char_size;
1047 Py_ssize_t struct_size;
1048 Py_ssize_t new_size;
1049 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001050 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001051#ifdef Py_DEBUG
1052 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1053#endif
1054
Victor Stinner79891572012-05-03 13:43:07 +02001055 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001057 assert(PyUnicode_IS_COMPACT(unicode));
1058
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001059 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001060 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001061 struct_size = sizeof(PyASCIIObject);
1062 else
1063 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001064 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001065
Victor Stinnerfe226c02011-10-03 03:52:20 +02001066 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1067 PyErr_NoMemory();
1068 return NULL;
1069 }
1070 new_size = (struct_size + (length + 1) * char_size);
1071
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001072 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1073 PyObject_DEL(_PyUnicode_UTF8(unicode));
1074 _PyUnicode_UTF8(unicode) = NULL;
1075 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1076 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001077#ifdef Py_REF_DEBUG
1078 _Py_RefTotal--;
1079#endif
1080#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001081 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001082#endif
Victor Stinner84def372011-12-11 20:04:56 +01001083
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001084 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001085 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001086 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001087 PyErr_NoMemory();
1088 return NULL;
1089 }
Victor Stinner84def372011-12-11 20:04:56 +01001090 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001091 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001092
Victor Stinnerfe226c02011-10-03 03:52:20 +02001093 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001094 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001096 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001097 _PyUnicode_WSTR_LENGTH(unicode) = length;
1098 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001099 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1100 PyObject_DEL(_PyUnicode_WSTR(unicode));
1101 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001102 if (!PyUnicode_IS_ASCII(unicode))
1103 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001104 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001105#ifdef Py_DEBUG
1106 unicode_fill_invalid(unicode, old_length);
1107#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001108 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1109 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001110 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001111 return unicode;
1112}
1113
Alexander Belopolsky40018472011-02-26 01:02:56 +00001114static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001115resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116{
Victor Stinner95663112011-10-04 01:03:50 +02001117 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001118 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001120 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001121
Victor Stinnerfe226c02011-10-03 03:52:20 +02001122 if (PyUnicode_IS_READY(unicode)) {
1123 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001124 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001125 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001126#ifdef Py_DEBUG
1127 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1128#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001129
1130 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001131 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001132 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1133 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001134
1135 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1136 PyErr_NoMemory();
1137 return -1;
1138 }
1139 new_size = (length + 1) * char_size;
1140
Victor Stinner7a9105a2011-12-12 00:13:42 +01001141 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1142 {
1143 PyObject_DEL(_PyUnicode_UTF8(unicode));
1144 _PyUnicode_UTF8(unicode) = NULL;
1145 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1146 }
1147
Victor Stinnerfe226c02011-10-03 03:52:20 +02001148 data = (PyObject *)PyObject_REALLOC(data, new_size);
1149 if (data == NULL) {
1150 PyErr_NoMemory();
1151 return -1;
1152 }
1153 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001154 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001155 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001156 _PyUnicode_WSTR_LENGTH(unicode) = length;
1157 }
1158 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001159 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001160 _PyUnicode_UTF8_LENGTH(unicode) = length;
1161 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001162 _PyUnicode_LENGTH(unicode) = length;
1163 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001164#ifdef Py_DEBUG
1165 unicode_fill_invalid(unicode, old_length);
1166#endif
Victor Stinner95663112011-10-04 01:03:50 +02001167 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001168 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001169 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001170 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001171 }
Victor Stinner95663112011-10-04 01:03:50 +02001172 assert(_PyUnicode_WSTR(unicode) != NULL);
1173
1174 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001175 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001176 PyErr_NoMemory();
1177 return -1;
1178 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001179 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001180 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001181 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001182 if (!wstr) {
1183 PyErr_NoMemory();
1184 return -1;
1185 }
1186 _PyUnicode_WSTR(unicode) = wstr;
1187 _PyUnicode_WSTR(unicode)[length] = 0;
1188 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001189 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190 return 0;
1191}
1192
Victor Stinnerfe226c02011-10-03 03:52:20 +02001193static PyObject*
1194resize_copy(PyObject *unicode, Py_ssize_t length)
1195{
1196 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001197 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001198 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001199
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001200 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201
1202 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1203 if (copy == NULL)
1204 return NULL;
1205
1206 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001207 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001209 }
1210 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001211 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001212
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001213 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001214 if (w == NULL)
1215 return NULL;
1216 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1217 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001218 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001219 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001220 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001221 }
1222}
1223
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001225 Ux0000 terminated; some code (e.g. new_identifier)
1226 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227
1228 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001229 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230
1231*/
1232
Alexander Belopolsky40018472011-02-26 01:02:56 +00001233static PyUnicodeObject *
1234_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001236 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001237 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238
Thomas Wouters477c8d52006-05-27 19:21:47 +00001239 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001240 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001241 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 }
1243
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001244 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001245 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001246 return (PyUnicodeObject *)PyErr_NoMemory();
1247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248 if (length < 0) {
1249 PyErr_SetString(PyExc_SystemError,
1250 "Negative size passed to _PyUnicode_New");
1251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001252 }
1253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001254 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1255 if (unicode == NULL)
1256 return NULL;
1257 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001258
1259 _PyUnicode_WSTR_LENGTH(unicode) = length;
1260 _PyUnicode_HASH(unicode) = -1;
1261 _PyUnicode_STATE(unicode).interned = 0;
1262 _PyUnicode_STATE(unicode).kind = 0;
1263 _PyUnicode_STATE(unicode).compact = 0;
1264 _PyUnicode_STATE(unicode).ready = 0;
1265 _PyUnicode_STATE(unicode).ascii = 0;
1266 _PyUnicode_DATA_ANY(unicode) = NULL;
1267 _PyUnicode_LENGTH(unicode) = 0;
1268 _PyUnicode_UTF8(unicode) = NULL;
1269 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001271 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1272 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001273 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001274 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001275 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001277
Jeremy Hyltond8082792003-09-16 19:41:39 +00001278 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001279 * the caller fails before initializing str -- unicode_resize()
1280 * reads str[0], and the Keep-Alive optimization can keep memory
1281 * allocated for str alive across a call to unicode_dealloc(unicode).
1282 * We don't want unicode_resize to read uninitialized memory in
1283 * that case.
1284 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001285 _PyUnicode_WSTR(unicode)[0] = 0;
1286 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001287
Victor Stinner7931d9a2011-11-04 00:22:48 +01001288 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289 return unicode;
1290}
1291
Victor Stinnerf42dc442011-10-02 23:33:16 +02001292static const char*
1293unicode_kind_name(PyObject *unicode)
1294{
Victor Stinner42dfd712011-10-03 14:41:45 +02001295 /* don't check consistency: unicode_kind_name() is called from
1296 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001297 if (!PyUnicode_IS_COMPACT(unicode))
1298 {
1299 if (!PyUnicode_IS_READY(unicode))
1300 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001301 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001302 {
1303 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001304 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001305 return "legacy ascii";
1306 else
1307 return "legacy latin1";
1308 case PyUnicode_2BYTE_KIND:
1309 return "legacy UCS2";
1310 case PyUnicode_4BYTE_KIND:
1311 return "legacy UCS4";
1312 default:
1313 return "<legacy invalid kind>";
1314 }
1315 }
1316 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001317 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001318 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001319 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001320 return "ascii";
1321 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001322 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001323 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001324 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001325 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001326 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001327 default:
1328 return "<invalid compact kind>";
1329 }
1330}
1331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001334const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001335 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001336 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337}
1338
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001339const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001340 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341 return _PyUnicode_COMPACT_DATA(unicode);
1342}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001343const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001344 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001345 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1347 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1348 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1349 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1350 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1351 return PyUnicode_DATA(unicode);
1352}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001353
1354void
1355_PyUnicode_Dump(PyObject *op)
1356{
1357 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001358 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1359 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001360 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001361
Victor Stinnera849a4b2011-10-03 12:12:11 +02001362 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001363 {
1364 if (ascii->state.ascii)
1365 data = (ascii + 1);
1366 else
1367 data = (compact + 1);
1368 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001369 else
1370 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001371 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001372
Victor Stinnera849a4b2011-10-03 12:12:11 +02001373 if (ascii->wstr == data)
1374 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001375 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001376
Victor Stinnera3b334d2011-10-03 13:53:37 +02001377 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001378 printf(" (%zu), ", compact->wstr_length);
1379 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001380 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001381 }
1382 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001383 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001384 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001385}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386#endif
1387
1388PyObject *
1389PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1390{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001391 /* Optimization for empty strings */
1392 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001393 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001394 }
1395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 PyObject *obj;
1397 PyCompactUnicodeObject *unicode;
1398 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001399 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001400 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 Py_ssize_t char_size;
1402 Py_ssize_t struct_size;
1403
Victor Stinner9e9d6892011-10-04 01:02:02 +02001404 is_ascii = 0;
1405 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 struct_size = sizeof(PyCompactUnicodeObject);
1407 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001408 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 char_size = 1;
1410 is_ascii = 1;
1411 struct_size = sizeof(PyASCIIObject);
1412 }
1413 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001414 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 char_size = 1;
1416 }
1417 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001418 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 char_size = 2;
1420 if (sizeof(wchar_t) == 2)
1421 is_sharing = 1;
1422 }
1423 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001424 if (maxchar > MAX_UNICODE) {
1425 PyErr_SetString(PyExc_SystemError,
1426 "invalid maximum character passed to PyUnicode_New");
1427 return NULL;
1428 }
Victor Stinner8f825062012-04-27 13:55:39 +02001429 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 char_size = 4;
1431 if (sizeof(wchar_t) == 4)
1432 is_sharing = 1;
1433 }
1434
1435 /* Ensure we won't overflow the size. */
1436 if (size < 0) {
1437 PyErr_SetString(PyExc_SystemError,
1438 "Negative size passed to PyUnicode_New");
1439 return NULL;
1440 }
1441 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1442 return PyErr_NoMemory();
1443
1444 /* Duplicated allocation code from _PyObject_New() instead of a call to
1445 * PyObject_New() so we are able to allocate space for the object and
1446 * it's data buffer.
1447 */
1448 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001449 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001451 }
1452 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453
1454 unicode = (PyCompactUnicodeObject *)obj;
1455 if (is_ascii)
1456 data = ((PyASCIIObject*)obj) + 1;
1457 else
1458 data = unicode + 1;
1459 _PyUnicode_LENGTH(unicode) = size;
1460 _PyUnicode_HASH(unicode) = -1;
1461 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001462 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 _PyUnicode_STATE(unicode).compact = 1;
1464 _PyUnicode_STATE(unicode).ready = 1;
1465 _PyUnicode_STATE(unicode).ascii = is_ascii;
1466 if (is_ascii) {
1467 ((char*)data)[size] = 0;
1468 _PyUnicode_WSTR(unicode) = NULL;
1469 }
Victor Stinner8f825062012-04-27 13:55:39 +02001470 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 ((char*)data)[size] = 0;
1472 _PyUnicode_WSTR(unicode) = NULL;
1473 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001475 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477 else {
1478 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001479 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001480 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001482 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 ((Py_UCS4*)data)[size] = 0;
1484 if (is_sharing) {
1485 _PyUnicode_WSTR_LENGTH(unicode) = size;
1486 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1487 }
1488 else {
1489 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1490 _PyUnicode_WSTR(unicode) = NULL;
1491 }
1492 }
Victor Stinner8f825062012-04-27 13:55:39 +02001493#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001494 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001495#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001496 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 return obj;
1498}
1499
1500#if SIZEOF_WCHAR_T == 2
1501/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1502 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001503 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504
1505 This function assumes that unicode can hold one more code point than wstr
1506 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001507static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001508unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001509 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001510{
1511 const wchar_t *iter;
1512 Py_UCS4 *ucs4_out;
1513
Victor Stinner910337b2011-10-03 03:20:16 +02001514 assert(unicode != NULL);
1515 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1517 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1518
1519 for (iter = begin; iter < end; ) {
1520 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1521 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001522 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1523 && (iter+1) < end
1524 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525 {
Victor Stinner551ac952011-11-29 22:58:13 +01001526 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527 iter += 2;
1528 }
1529 else {
1530 *ucs4_out++ = *iter;
1531 iter++;
1532 }
1533 }
1534 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1535 _PyUnicode_GET_LENGTH(unicode)));
1536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537}
1538#endif
1539
Victor Stinnercd9950f2011-10-02 00:34:53 +02001540static int
Victor Stinner488fa492011-12-12 00:01:39 +01001541unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001542{
Victor Stinner488fa492011-12-12 00:01:39 +01001543 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001544 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001545 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001546 return -1;
1547 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001548 return 0;
1549}
1550
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001551static int
1552_copy_characters(PyObject *to, Py_ssize_t to_start,
1553 PyObject *from, Py_ssize_t from_start,
1554 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001555{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001556 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001557 const void *from_data;
1558 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559
Victor Stinneree4544c2012-05-09 22:24:08 +02001560 assert(0 <= how_many);
1561 assert(0 <= from_start);
1562 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001564 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001565 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566
Victor Stinnerd3f08822012-05-29 12:57:52 +02001567 assert(PyUnicode_Check(to));
1568 assert(PyUnicode_IS_READY(to));
1569 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1570
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001571 if (how_many == 0)
1572 return 0;
1573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001575 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001576 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001577 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001578
Victor Stinnerf1852262012-06-16 16:38:26 +02001579#ifdef Py_DEBUG
1580 if (!check_maxchar
1581 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1582 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001583 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001584 Py_UCS4 ch;
1585 Py_ssize_t i;
1586 for (i=0; i < how_many; i++) {
1587 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1588 assert(ch <= to_maxchar);
1589 }
1590 }
1591#endif
1592
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001593 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001594 if (check_maxchar
1595 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1596 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001597 /* Writing Latin-1 characters into an ASCII string requires to
1598 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001599 Py_UCS4 max_char;
1600 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001601 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001602 if (max_char >= 128)
1603 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001604 }
Christian Heimesf051e432016-09-13 20:22:02 +02001605 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001606 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001607 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001609 else if (from_kind == PyUnicode_1BYTE_KIND
1610 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001611 {
1612 _PyUnicode_CONVERT_BYTES(
1613 Py_UCS1, Py_UCS2,
1614 PyUnicode_1BYTE_DATA(from) + from_start,
1615 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1616 PyUnicode_2BYTE_DATA(to) + to_start
1617 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001618 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001619 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001620 && to_kind == PyUnicode_4BYTE_KIND)
1621 {
1622 _PyUnicode_CONVERT_BYTES(
1623 Py_UCS1, Py_UCS4,
1624 PyUnicode_1BYTE_DATA(from) + from_start,
1625 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1626 PyUnicode_4BYTE_DATA(to) + to_start
1627 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001628 }
1629 else if (from_kind == PyUnicode_2BYTE_KIND
1630 && to_kind == PyUnicode_4BYTE_KIND)
1631 {
1632 _PyUnicode_CONVERT_BYTES(
1633 Py_UCS2, Py_UCS4,
1634 PyUnicode_2BYTE_DATA(from) + from_start,
1635 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1636 PyUnicode_4BYTE_DATA(to) + to_start
1637 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001638 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001639 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001640 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1641
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001642 if (!check_maxchar) {
1643 if (from_kind == PyUnicode_2BYTE_KIND
1644 && to_kind == PyUnicode_1BYTE_KIND)
1645 {
1646 _PyUnicode_CONVERT_BYTES(
1647 Py_UCS2, Py_UCS1,
1648 PyUnicode_2BYTE_DATA(from) + from_start,
1649 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1650 PyUnicode_1BYTE_DATA(to) + to_start
1651 );
1652 }
1653 else if (from_kind == PyUnicode_4BYTE_KIND
1654 && to_kind == PyUnicode_1BYTE_KIND)
1655 {
1656 _PyUnicode_CONVERT_BYTES(
1657 Py_UCS4, Py_UCS1,
1658 PyUnicode_4BYTE_DATA(from) + from_start,
1659 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1660 PyUnicode_1BYTE_DATA(to) + to_start
1661 );
1662 }
1663 else if (from_kind == PyUnicode_4BYTE_KIND
1664 && to_kind == PyUnicode_2BYTE_KIND)
1665 {
1666 _PyUnicode_CONVERT_BYTES(
1667 Py_UCS4, Py_UCS2,
1668 PyUnicode_4BYTE_DATA(from) + from_start,
1669 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1670 PyUnicode_2BYTE_DATA(to) + to_start
1671 );
1672 }
1673 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001674 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001675 }
1676 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001677 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001678 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001679 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001680 Py_ssize_t i;
1681
Victor Stinnera0702ab2011-09-29 14:14:38 +02001682 for (i=0; i < how_many; i++) {
1683 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001684 if (ch > to_maxchar)
1685 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001686 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1687 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001688 }
1689 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001690 return 0;
1691}
1692
Victor Stinnerd3f08822012-05-29 12:57:52 +02001693void
1694_PyUnicode_FastCopyCharacters(
1695 PyObject *to, Py_ssize_t to_start,
1696 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697{
1698 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1699}
1700
1701Py_ssize_t
1702PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1703 PyObject *from, Py_ssize_t from_start,
1704 Py_ssize_t how_many)
1705{
1706 int err;
1707
1708 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1709 PyErr_BadInternalCall();
1710 return -1;
1711 }
1712
Benjamin Petersonbac79492012-01-14 13:34:47 -05001713 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001714 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001715 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001716 return -1;
1717
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001718 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001719 PyErr_SetString(PyExc_IndexError, "string index out of range");
1720 return -1;
1721 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001722 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001723 PyErr_SetString(PyExc_IndexError, "string index out of range");
1724 return -1;
1725 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001726 if (how_many < 0) {
1727 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1728 return -1;
1729 }
1730 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001731 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1732 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001733 "Cannot write %zi characters at %zi "
1734 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001735 how_many, to_start, PyUnicode_GET_LENGTH(to));
1736 return -1;
1737 }
1738
1739 if (how_many == 0)
1740 return 0;
1741
Victor Stinner488fa492011-12-12 00:01:39 +01001742 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001743 return -1;
1744
1745 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1746 if (err) {
1747 PyErr_Format(PyExc_SystemError,
1748 "Cannot copy %s characters "
1749 "into a string of %s characters",
1750 unicode_kind_name(from),
1751 unicode_kind_name(to));
1752 return -1;
1753 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001754 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755}
1756
Victor Stinner17222162011-09-28 22:15:37 +02001757/* Find the maximum code point and count the number of surrogate pairs so a
1758 correct string length can be computed before converting a string to UCS4.
1759 This function counts single surrogates as a character and not as a pair.
1760
1761 Return 0 on success, or -1 on error. */
1762static int
1763find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1764 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765{
1766 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001767 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768
Victor Stinnerc53be962011-10-02 21:33:54 +02001769 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 *num_surrogates = 0;
1771 *maxchar = 0;
1772
1773 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001775 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1776 && (iter+1) < end
1777 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1778 {
1779 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1780 ++(*num_surrogates);
1781 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 }
1783 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001785 {
1786 ch = *iter;
1787 iter++;
1788 }
1789 if (ch > *maxchar) {
1790 *maxchar = ch;
1791 if (*maxchar > MAX_UNICODE) {
1792 PyErr_Format(PyExc_ValueError,
1793 "character U+%x is not in range [U+0000; U+10ffff]",
1794 ch);
1795 return -1;
1796 }
1797 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 }
1799 return 0;
1800}
1801
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001802int
1803_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001804{
1805 wchar_t *end;
1806 Py_UCS4 maxchar = 0;
1807 Py_ssize_t num_surrogates;
1808#if SIZEOF_WCHAR_T == 2
1809 Py_ssize_t length_wo_surrogates;
1810#endif
1811
Georg Brandl7597add2011-10-05 16:36:47 +02001812 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001813 strings were created using _PyObject_New() and where no canonical
1814 representation (the str field) has been set yet aka strings
1815 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001816 assert(_PyUnicode_CHECK(unicode));
1817 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001819 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001820 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001821 /* Actually, it should neither be interned nor be anything else: */
1822 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001825 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001826 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828
1829 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001830 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1831 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 PyErr_NoMemory();
1833 return -1;
1834 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001835 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836 _PyUnicode_WSTR(unicode), end,
1837 PyUnicode_1BYTE_DATA(unicode));
1838 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1839 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1840 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1841 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001842 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001843 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001844 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 }
1846 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001847 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001848 _PyUnicode_UTF8(unicode) = NULL;
1849 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850 }
1851 PyObject_FREE(_PyUnicode_WSTR(unicode));
1852 _PyUnicode_WSTR(unicode) = NULL;
1853 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1854 }
1855 /* In this case we might have to convert down from 4-byte native
1856 wchar_t to 2-byte unicode. */
1857 else if (maxchar < 65536) {
1858 assert(num_surrogates == 0 &&
1859 "FindMaxCharAndNumSurrogatePairs() messed up");
1860
Victor Stinner506f5922011-09-28 22:34:18 +02001861#if SIZEOF_WCHAR_T == 2
1862 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001863 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001864 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1865 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1866 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001867 _PyUnicode_UTF8(unicode) = NULL;
1868 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001869#else
1870 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001871 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001872 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001873 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001874 PyErr_NoMemory();
1875 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 }
Victor Stinner506f5922011-09-28 22:34:18 +02001877 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1878 _PyUnicode_WSTR(unicode), end,
1879 PyUnicode_2BYTE_DATA(unicode));
1880 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1881 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1882 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001883 _PyUnicode_UTF8(unicode) = NULL;
1884 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001885 PyObject_FREE(_PyUnicode_WSTR(unicode));
1886 _PyUnicode_WSTR(unicode) = NULL;
1887 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1888#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001889 }
1890 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1891 else {
1892#if SIZEOF_WCHAR_T == 2
1893 /* in case the native representation is 2-bytes, we need to allocate a
1894 new normalized 4-byte version. */
1895 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001896 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1897 PyErr_NoMemory();
1898 return -1;
1899 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001900 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1901 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 PyErr_NoMemory();
1903 return -1;
1904 }
1905 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1906 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001907 _PyUnicode_UTF8(unicode) = NULL;
1908 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001909 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1910 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001911 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 PyObject_FREE(_PyUnicode_WSTR(unicode));
1913 _PyUnicode_WSTR(unicode) = NULL;
1914 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1915#else
1916 assert(num_surrogates == 0);
1917
Victor Stinnerc3c74152011-10-02 20:39:55 +02001918 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001920 _PyUnicode_UTF8(unicode) = NULL;
1921 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1923#endif
1924 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1925 }
1926 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001927 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001928 return 0;
1929}
1930
Alexander Belopolsky40018472011-02-26 01:02:56 +00001931static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001932unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001933{
Walter Dörwald16807132007-05-25 13:52:07 +00001934 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001935 case SSTATE_NOT_INTERNED:
1936 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001937
Benjamin Peterson29060642009-01-31 22:14:21 +00001938 case SSTATE_INTERNED_MORTAL:
1939 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001940 Py_SET_REFCNT(unicode, 3);
Victor Stinner607b1022020-05-05 18:50:30 +02001941#ifdef INTERNED_STRINGS
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001942 if (PyDict_DelItem(interned, unicode) != 0) {
1943 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1944 NULL);
1945 }
Victor Stinner607b1022020-05-05 18:50:30 +02001946#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001947 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001948
Benjamin Peterson29060642009-01-31 22:14:21 +00001949 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001950 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1951 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001952
Benjamin Peterson29060642009-01-31 22:14:21 +00001953 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001954 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001955 }
1956
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001957 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001959 }
1960 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001961 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001962 }
1963 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001964 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001967 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968}
1969
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the cached objects kept
   in the unicode state (the empty string, or the cached one-character string
   for a code point below 256), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *interp_state = get_unicode_state();
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == interp_state->empty) {
        return 1;
    }

    /* Only a one-character string that is not in the wchar_t representation
       can be an entry of the latin1 cache. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }

    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && interp_state->latin1[first] == unicode) ? 1 : 0;
}
#endif
1989
Alexander Belopolsky40018472011-02-26 01:02:56 +00001990static int
Victor Stinner488fa492011-12-12 00:01:39 +01001991unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001992{
Victor Stinner488fa492011-12-12 00:01:39 +01001993 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001994 if (Py_REFCNT(unicode) != 1)
1995 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001996 if (_PyUnicode_HASH(unicode) != -1)
1997 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001998 if (PyUnicode_CHECK_INTERNED(unicode))
1999 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002000 if (!PyUnicode_CheckExact(unicode))
2001 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002002#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002003 /* singleton refcount is greater than 1 */
2004 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002005#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002006 return 1;
2007}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002008
Victor Stinnerfe226c02011-10-03 03:52:20 +02002009static int
2010unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2011{
2012 PyObject *unicode;
2013 Py_ssize_t old_length;
2014
2015 assert(p_unicode != NULL);
2016 unicode = *p_unicode;
2017
2018 assert(unicode != NULL);
2019 assert(PyUnicode_Check(unicode));
2020 assert(0 <= length);
2021
Victor Stinner910337b2011-10-03 03:20:16 +02002022 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002023 old_length = PyUnicode_WSTR_LENGTH(unicode);
2024 else
2025 old_length = PyUnicode_GET_LENGTH(unicode);
2026 if (old_length == length)
2027 return 0;
2028
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002029 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002030 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002031 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002032 return 0;
2033 }
2034
Victor Stinner488fa492011-12-12 00:01:39 +01002035 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002036 PyObject *copy = resize_copy(unicode, length);
2037 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002038 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002039 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002040 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002041 }
2042
Victor Stinnerfe226c02011-10-03 03:52:20 +02002043 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002044 PyObject *new_unicode = resize_compact(unicode, length);
2045 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002046 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002047 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002048 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002049 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002050 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002051}
2052
Alexander Belopolsky40018472011-02-26 01:02:56 +00002053int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002054PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002055{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002056 PyObject *unicode;
2057 if (p_unicode == NULL) {
2058 PyErr_BadInternalCall();
2059 return -1;
2060 }
2061 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002062 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002063 {
2064 PyErr_BadInternalCall();
2065 return -1;
2066 }
2067 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002068}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002069
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002070/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002071
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002072 WARNING: The function doesn't copy the terminating null character and
2073 doesn't check the maximum character (may write a latin1 character in an
2074 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002075static void
2076unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2077 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002078{
2079 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002080 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002081 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002082
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002083 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002084 switch (kind) {
2085 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002086#ifdef Py_DEBUG
2087 if (PyUnicode_IS_ASCII(unicode)) {
2088 Py_UCS4 maxchar = ucs1lib_find_max_char(
2089 (const Py_UCS1*)str,
2090 (const Py_UCS1*)str + len);
2091 assert(maxchar < 128);
2092 }
2093#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002094 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002095 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002096 }
2097 case PyUnicode_2BYTE_KIND: {
2098 Py_UCS2 *start = (Py_UCS2 *)data + index;
2099 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002100
Victor Stinner184252a2012-06-16 02:57:41 +02002101 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002102 *ucs2 = (Py_UCS2)*str;
2103
2104 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002105 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002106 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002107 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002108 Py_UCS4 *start = (Py_UCS4 *)data + index;
2109 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002110
Victor Stinner184252a2012-06-16 02:57:41 +02002111 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002112 *ucs4 = (Py_UCS4)*str;
2113
2114 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002115 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002116 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002117 default:
2118 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002119 }
2120}
2121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002123get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002124{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002125 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002126
Victor Stinner2f9ada92020-06-24 02:22:21 +02002127 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002128 if (unicode) {
2129 Py_INCREF(unicode);
2130 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131 }
Victor Stinner607b1022020-05-05 18:50:30 +02002132
2133 unicode = PyUnicode_New(1, ch);
2134 if (!unicode) {
2135 return NULL;
2136 }
2137
2138 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2139 assert(_PyUnicode_CheckConsistency(unicode, 1));
2140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002141 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002142 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002143 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144}
2145
Victor Stinner985a82a2014-01-03 12:53:47 +01002146static PyObject*
2147unicode_char(Py_UCS4 ch)
2148{
2149 PyObject *unicode;
2150
2151 assert(ch <= MAX_UNICODE);
2152
Victor Stinner2f9ada92020-06-24 02:22:21 +02002153 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002154 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002155 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002156
Victor Stinner985a82a2014-01-03 12:53:47 +01002157 unicode = PyUnicode_New(1, ch);
2158 if (unicode == NULL)
2159 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002160
2161 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2162 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002163 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002164 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002165 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2166 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2167 }
2168 assert(_PyUnicode_CheckConsistency(unicode, 1));
2169 return unicode;
2170}
2171
Alexander Belopolsky40018472011-02-26 01:02:56 +00002172PyObject *
2173PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002175 if (u == NULL)
2176 return (PyObject*)_PyUnicode_New(size);
2177
2178 if (size < 0) {
2179 PyErr_BadInternalCall();
2180 return NULL;
2181 }
2182
2183 return PyUnicode_FromWideChar(u, size);
2184}
2185
2186PyObject *
2187PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2188{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002189 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 Py_UCS4 maxchar = 0;
2191 Py_ssize_t num_surrogates;
2192
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002193 if (u == NULL && size != 0) {
2194 PyErr_BadInternalCall();
2195 return NULL;
2196 }
2197
2198 if (size == -1) {
2199 size = wcslen(u);
2200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002202 /* If the Unicode data is known at construction time, we can apply
2203 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002206 if (size == 0)
2207 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 /* Single character Unicode objects in the Latin-1 range are
2210 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002211 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 return get_latin1_char((unsigned char)*u);
2213
2214 /* If not empty and not single character, copy the Unicode data
2215 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002216 if (find_maxchar_surrogates(u, u + size,
2217 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 return NULL;
2219
Victor Stinner8faf8212011-12-08 22:14:11 +01002220 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 if (!unicode)
2222 return NULL;
2223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 switch (PyUnicode_KIND(unicode)) {
2225 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002226 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2228 break;
2229 case PyUnicode_2BYTE_KIND:
2230#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002231 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002233 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2235#endif
2236 break;
2237 case PyUnicode_4BYTE_KIND:
2238#if SIZEOF_WCHAR_T == 2
2239 /* This is the only case which has to process surrogates, thus
2240 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002241 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242#else
2243 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002244 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245#endif
2246 break;
2247 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002248 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002251 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002252}
2253
Alexander Belopolsky40018472011-02-26 01:02:56 +00002254PyObject *
2255PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002256{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002257 if (size < 0) {
2258 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002259 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002260 return NULL;
2261 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002262 if (u != NULL)
2263 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2264 else
2265 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002266}
2267
Alexander Belopolsky40018472011-02-26 01:02:56 +00002268PyObject *
2269PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002270{
2271 size_t size = strlen(u);
2272 if (size > PY_SSIZE_T_MAX) {
2273 PyErr_SetString(PyExc_OverflowError, "input too long");
2274 return NULL;
2275 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002276 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002277}
2278
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002279PyObject *
2280_PyUnicode_FromId(_Py_Identifier *id)
2281{
Victor Stinner297257f2020-06-02 14:39:45 +02002282 if (id->object) {
2283 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002284 }
Victor Stinner297257f2020-06-02 14:39:45 +02002285
2286 PyObject *obj;
2287 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2288 strlen(id->string),
2289 NULL, NULL);
2290 if (!obj) {
2291 return NULL;
2292 }
2293 PyUnicode_InternInPlace(&obj);
2294
2295 assert(!id->next);
2296 id->object = obj;
2297 id->next = static_strings;
2298 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002299 return id->object;
2300}
2301
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002302static void
2303unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002304{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002305 _Py_Identifier *tmp, *s = static_strings;
2306 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002307 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002308 tmp = s->next;
2309 s->next = NULL;
2310 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002311 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002312 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002313}
2314
Benjamin Peterson0df54292012-03-26 14:50:32 -04002315/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002316
Victor Stinnerd3f08822012-05-29 12:57:52 +02002317PyObject*
2318_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002319{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002320 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002321 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002322 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002323#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002324 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002325#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002326 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002327 }
Victor Stinner785938e2011-12-11 20:09:03 +01002328 unicode = PyUnicode_New(size, 127);
2329 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002330 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002331 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2332 assert(_PyUnicode_CheckConsistency(unicode, 1));
2333 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002334}
2335
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002336static Py_UCS4
2337kind_maxchar_limit(unsigned int kind)
2338{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002339 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002340 case PyUnicode_1BYTE_KIND:
2341 return 0x80;
2342 case PyUnicode_2BYTE_KIND:
2343 return 0x100;
2344 case PyUnicode_4BYTE_KIND:
2345 return 0x10000;
2346 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002347 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002348 }
2349}
2350
Victor Stinner702c7342011-10-05 13:50:52 +02002351static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002352_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002353{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002355 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002356
Victor Stinner2f9ada92020-06-24 02:22:21 +02002357 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002358 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002359 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002360 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002361 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002362 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002363 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002364
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002365 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002366 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002367 if (!res)
2368 return NULL;
2369 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002370 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002372}
2373
Victor Stinnere57b1c02011-09-28 22:20:48 +02002374static PyObject*
2375_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376{
2377 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002378 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002379
Serhiy Storchaka678db842013-01-26 12:16:36 +02002380 if (size == 0)
2381 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002382 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002383 if (size == 1)
2384 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002385
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002386 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002387 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002388 if (!res)
2389 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002390 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002392 else {
2393 _PyUnicode_CONVERT_BYTES(
2394 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2395 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002396 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002397 return res;
2398}
2399
Victor Stinnere57b1c02011-09-28 22:20:48 +02002400static PyObject*
2401_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402{
2403 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002404 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002405
Serhiy Storchaka678db842013-01-26 12:16:36 +02002406 if (size == 0)
2407 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002408 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002409 if (size == 1)
2410 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002411
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002412 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002413 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 if (!res)
2415 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002416 if (max_char < 256)
2417 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2418 PyUnicode_1BYTE_DATA(res));
2419 else if (max_char < 0x10000)
2420 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2421 PyUnicode_2BYTE_DATA(res));
2422 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002424 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002425 return res;
2426}
2427
2428PyObject*
2429PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2430{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002431 if (size < 0) {
2432 PyErr_SetString(PyExc_ValueError, "size must be positive");
2433 return NULL;
2434 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002435 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002436 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002437 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002439 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002440 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002441 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002442 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002443 PyErr_SetString(PyExc_SystemError, "invalid kind");
2444 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446}
2447
Victor Stinnerece58de2012-04-23 23:36:38 +02002448Py_UCS4
2449_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2450{
2451 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002452 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002453
2454 assert(PyUnicode_IS_READY(unicode));
2455 assert(0 <= start);
2456 assert(end <= PyUnicode_GET_LENGTH(unicode));
2457 assert(start <= end);
2458
2459 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2460 return PyUnicode_MAX_CHAR_VALUE(unicode);
2461
2462 if (start == end)
2463 return 127;
2464
Victor Stinner94d558b2012-04-27 22:26:58 +02002465 if (PyUnicode_IS_ASCII(unicode))
2466 return 127;
2467
Victor Stinnerece58de2012-04-23 23:36:38 +02002468 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002469 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002470 endptr = (char *)startptr + end * kind;
2471 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002472 switch(kind) {
2473 case PyUnicode_1BYTE_KIND:
2474 return ucs1lib_find_max_char(startptr, endptr);
2475 case PyUnicode_2BYTE_KIND:
2476 return ucs2lib_find_max_char(startptr, endptr);
2477 case PyUnicode_4BYTE_KIND:
2478 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002479 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002480 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002481 }
2482}
2483
Victor Stinner25a4b292011-10-06 12:31:55 +02002484/* Ensure that a string uses the most efficient storage, if it is not the
2485 case: create a new string with of the right kind. Write NULL into *p_unicode
2486 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002487static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002488unicode_adjust_maxchar(PyObject **p_unicode)
2489{
2490 PyObject *unicode, *copy;
2491 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002492 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002493 unsigned int kind;
2494
2495 assert(p_unicode != NULL);
2496 unicode = *p_unicode;
2497 assert(PyUnicode_IS_READY(unicode));
2498 if (PyUnicode_IS_ASCII(unicode))
2499 return;
2500
2501 len = PyUnicode_GET_LENGTH(unicode);
2502 kind = PyUnicode_KIND(unicode);
2503 if (kind == PyUnicode_1BYTE_KIND) {
2504 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002505 max_char = ucs1lib_find_max_char(u, u + len);
2506 if (max_char >= 128)
2507 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002508 }
2509 else if (kind == PyUnicode_2BYTE_KIND) {
2510 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002511 max_char = ucs2lib_find_max_char(u, u + len);
2512 if (max_char >= 256)
2513 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002514 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002515 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002516 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002517 max_char = ucs4lib_find_max_char(u, u + len);
2518 if (max_char >= 0x10000)
2519 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002520 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002521 else
2522 Py_UNREACHABLE();
2523
Victor Stinner25a4b292011-10-06 12:31:55 +02002524 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002525 if (copy != NULL)
2526 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002527 Py_DECREF(unicode);
2528 *p_unicode = copy;
2529}
2530
Victor Stinner034f6cf2011-09-30 02:26:44 +02002531PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002532_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002533{
Victor Stinner87af4f22011-11-21 23:03:47 +01002534 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002535 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002536
Victor Stinner034f6cf2011-09-30 02:26:44 +02002537 if (!PyUnicode_Check(unicode)) {
2538 PyErr_BadInternalCall();
2539 return NULL;
2540 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002541 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002542 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002543
Victor Stinner87af4f22011-11-21 23:03:47 +01002544 length = PyUnicode_GET_LENGTH(unicode);
2545 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002546 if (!copy)
2547 return NULL;
2548 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2549
Christian Heimesf051e432016-09-13 20:22:02 +02002550 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002551 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002552 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002553 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002554}
2555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556
Victor Stinnerbc603d12011-10-02 01:00:40 +02002557/* Widen Unicode objects to larger buffers. Don't write terminating null
2558 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002559
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002560static void*
2561unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002563 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002564
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002565 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002566 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002567 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002568 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002569 if (!result)
2570 return PyErr_NoMemory();
2571 assert(skind == PyUnicode_1BYTE_KIND);
2572 _PyUnicode_CONVERT_BYTES(
2573 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002574 (const Py_UCS1 *)data,
2575 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002576 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002578 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002579 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002580 if (!result)
2581 return PyErr_NoMemory();
2582 if (skind == PyUnicode_2BYTE_KIND) {
2583 _PyUnicode_CONVERT_BYTES(
2584 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002585 (const Py_UCS2 *)data,
2586 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002587 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002588 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002589 else {
2590 assert(skind == PyUnicode_1BYTE_KIND);
2591 _PyUnicode_CONVERT_BYTES(
2592 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002593 (const Py_UCS1 *)data,
2594 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002595 result);
2596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002598 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002599 Py_UNREACHABLE();
2600 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602}
2603
2604static Py_UCS4*
2605as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2606 int copy_null)
2607{
2608 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002609 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 Py_ssize_t len, targetlen;
2611 if (PyUnicode_READY(string) == -1)
2612 return NULL;
2613 kind = PyUnicode_KIND(string);
2614 data = PyUnicode_DATA(string);
2615 len = PyUnicode_GET_LENGTH(string);
2616 targetlen = len;
2617 if (copy_null)
2618 targetlen++;
2619 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002620 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621 if (!target) {
2622 PyErr_NoMemory();
2623 return NULL;
2624 }
2625 }
2626 else {
2627 if (targetsize < targetlen) {
2628 PyErr_Format(PyExc_SystemError,
2629 "string is longer than the buffer");
2630 if (copy_null && 0 < targetsize)
2631 target[0] = 0;
2632 return NULL;
2633 }
2634 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002635 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002636 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002637 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002639 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002640 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002641 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2642 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002643 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002644 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002645 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002646 else {
2647 Py_UNREACHABLE();
2648 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002649 if (copy_null)
2650 target[len] = 0;
2651 return target;
2652}
2653
2654Py_UCS4*
2655PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2656 int copy_null)
2657{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002658 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002659 PyErr_BadInternalCall();
2660 return NULL;
2661 }
2662 return as_ucs4(string, target, targetsize, copy_null);
2663}
2664
2665Py_UCS4*
2666PyUnicode_AsUCS4Copy(PyObject *string)
2667{
2668 return as_ucs4(string, NULL, 0, 1);
2669}
2670
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2674#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002675
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002676static int
2677unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2678 Py_ssize_t width, Py_ssize_t precision)
2679{
2680 Py_ssize_t length, fill, arglen;
2681 Py_UCS4 maxchar;
2682
2683 if (PyUnicode_READY(str) == -1)
2684 return -1;
2685
2686 length = PyUnicode_GET_LENGTH(str);
2687 if ((precision == -1 || precision >= length)
2688 && width <= length)
2689 return _PyUnicodeWriter_WriteStr(writer, str);
2690
2691 if (precision != -1)
2692 length = Py_MIN(precision, length);
2693
2694 arglen = Py_MAX(length, width);
2695 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2696 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2697 else
2698 maxchar = writer->maxchar;
2699
2700 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2701 return -1;
2702
2703 if (width > length) {
2704 fill = width - length;
2705 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2706 return -1;
2707 writer->pos += fill;
2708 }
2709
2710 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2711 str, 0, length);
2712 writer->pos += length;
2713 return 0;
2714}
2715
2716static int
Victor Stinner998b8062018-09-12 00:23:25 +02002717unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002718 Py_ssize_t width, Py_ssize_t precision)
2719{
2720 /* UTF-8 */
2721 Py_ssize_t length;
2722 PyObject *unicode;
2723 int res;
2724
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002725 if (precision == -1) {
2726 length = strlen(str);
2727 }
2728 else {
2729 length = 0;
2730 while (length < precision && str[length]) {
2731 length++;
2732 }
2733 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002734 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2735 if (unicode == NULL)
2736 return -1;
2737
2738 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2739 Py_DECREF(unicode);
2740 return res;
2741}
2742
Victor Stinner96865452011-03-01 23:44:09 +00002743static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002744unicode_fromformat_arg(_PyUnicodeWriter *writer,
2745 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002746{
Victor Stinnere215d962012-10-06 23:03:36 +02002747 const char *p;
2748 Py_ssize_t len;
2749 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002750 Py_ssize_t width;
2751 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002752 int longflag;
2753 int longlongflag;
2754 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002755 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002756
2757 p = f;
2758 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002759 zeropad = 0;
2760 if (*f == '0') {
2761 zeropad = 1;
2762 f++;
2763 }
Victor Stinner96865452011-03-01 23:44:09 +00002764
2765 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002766 width = -1;
2767 if (Py_ISDIGIT((unsigned)*f)) {
2768 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002769 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002770 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002771 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002772 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002773 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002774 return NULL;
2775 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002776 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002777 f++;
2778 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002779 }
2780 precision = -1;
2781 if (*f == '.') {
2782 f++;
2783 if (Py_ISDIGIT((unsigned)*f)) {
2784 precision = (*f - '0');
2785 f++;
2786 while (Py_ISDIGIT((unsigned)*f)) {
2787 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2788 PyErr_SetString(PyExc_ValueError,
2789 "precision too big");
2790 return NULL;
2791 }
2792 precision = (precision * 10) + (*f - '0');
2793 f++;
2794 }
2795 }
Victor Stinner96865452011-03-01 23:44:09 +00002796 if (*f == '%') {
2797 /* "%.3%s" => f points to "3" */
2798 f--;
2799 }
2800 }
2801 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002802 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002803 f--;
2804 }
Victor Stinner96865452011-03-01 23:44:09 +00002805
2806 /* Handle %ld, %lu, %lld and %llu. */
2807 longflag = 0;
2808 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002809 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002810 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002811 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002812 longflag = 1;
2813 ++f;
2814 }
Victor Stinner96865452011-03-01 23:44:09 +00002815 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002816 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002817 longlongflag = 1;
2818 f += 2;
2819 }
Victor Stinner96865452011-03-01 23:44:09 +00002820 }
2821 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002822 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002823 size_tflag = 1;
2824 ++f;
2825 }
Victor Stinnere215d962012-10-06 23:03:36 +02002826
2827 if (f[1] == '\0')
2828 writer->overallocate = 0;
2829
2830 switch (*f) {
2831 case 'c':
2832 {
2833 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002834 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002835 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002836 "character argument not in range(0x110000)");
2837 return NULL;
2838 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002839 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002840 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002841 break;
2842 }
2843
2844 case 'i':
2845 case 'd':
2846 case 'u':
2847 case 'x':
2848 {
2849 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002850 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002851 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002852
2853 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002854 if (longflag) {
2855 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2856 }
2857 else if (longlongflag) {
2858 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2859 }
2860 else if (size_tflag) {
2861 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2862 }
2863 else {
2864 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2865 }
Victor Stinnere215d962012-10-06 23:03:36 +02002866 }
2867 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002868 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002869 }
2870 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002871 if (longflag) {
2872 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2873 }
2874 else if (longlongflag) {
2875 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2876 }
2877 else if (size_tflag) {
2878 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2879 }
2880 else {
2881 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2882 }
Victor Stinnere215d962012-10-06 23:03:36 +02002883 }
2884 assert(len >= 0);
2885
Victor Stinnere215d962012-10-06 23:03:36 +02002886 if (precision < len)
2887 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002888
2889 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002890 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2891 return NULL;
2892
Victor Stinnere215d962012-10-06 23:03:36 +02002893 if (width > precision) {
2894 Py_UCS4 fillchar;
2895 fill = width - precision;
2896 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002897 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2898 return NULL;
2899 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002900 }
Victor Stinner15a11362012-10-06 23:48:20 +02002901 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002902 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002903 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2904 return NULL;
2905 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002906 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002907
Victor Stinner4a587072013-11-19 12:54:53 +01002908 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2909 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002910 break;
2911 }
2912
2913 case 'p':
2914 {
2915 char number[MAX_LONG_LONG_CHARS];
2916
2917 len = sprintf(number, "%p", va_arg(*vargs, void*));
2918 assert(len >= 0);
2919
2920 /* %p is ill-defined: ensure leading 0x. */
2921 if (number[1] == 'X')
2922 number[1] = 'x';
2923 else if (number[1] != 'x') {
2924 memmove(number + 2, number,
2925 strlen(number) + 1);
2926 number[0] = '0';
2927 number[1] = 'x';
2928 len += 2;
2929 }
2930
Victor Stinner4a587072013-11-19 12:54:53 +01002931 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002932 return NULL;
2933 break;
2934 }
2935
2936 case 's':
2937 {
2938 /* UTF-8 */
2939 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002940 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002941 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002942 break;
2943 }
2944
2945 case 'U':
2946 {
2947 PyObject *obj = va_arg(*vargs, PyObject *);
2948 assert(obj && _PyUnicode_CHECK(obj));
2949
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002950 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002951 return NULL;
2952 break;
2953 }
2954
2955 case 'V':
2956 {
2957 PyObject *obj = va_arg(*vargs, PyObject *);
2958 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002959 if (obj) {
2960 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002961 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002962 return NULL;
2963 }
2964 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002965 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002966 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002967 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002968 }
2969 break;
2970 }
2971
2972 case 'S':
2973 {
2974 PyObject *obj = va_arg(*vargs, PyObject *);
2975 PyObject *str;
2976 assert(obj);
2977 str = PyObject_Str(obj);
2978 if (!str)
2979 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002980 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002981 Py_DECREF(str);
2982 return NULL;
2983 }
2984 Py_DECREF(str);
2985 break;
2986 }
2987
2988 case 'R':
2989 {
2990 PyObject *obj = va_arg(*vargs, PyObject *);
2991 PyObject *repr;
2992 assert(obj);
2993 repr = PyObject_Repr(obj);
2994 if (!repr)
2995 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002996 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002997 Py_DECREF(repr);
2998 return NULL;
2999 }
3000 Py_DECREF(repr);
3001 break;
3002 }
3003
3004 case 'A':
3005 {
3006 PyObject *obj = va_arg(*vargs, PyObject *);
3007 PyObject *ascii;
3008 assert(obj);
3009 ascii = PyObject_ASCII(obj);
3010 if (!ascii)
3011 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003012 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003013 Py_DECREF(ascii);
3014 return NULL;
3015 }
3016 Py_DECREF(ascii);
3017 break;
3018 }
3019
3020 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003021 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003022 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003023 break;
3024
3025 default:
3026 /* if we stumble upon an unknown formatting code, copy the rest
3027 of the format string to the output string. (we cannot just
3028 skip the code, since there's no way to know what's in the
3029 argument list) */
3030 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003031 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003032 return NULL;
3033 f = p+len;
3034 return f;
3035 }
3036
3037 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003038 return f;
3039}
3040
Walter Dörwaldd2034312007-05-18 16:29:38 +00003041PyObject *
3042PyUnicode_FromFormatV(const char *format, va_list vargs)
3043{
Victor Stinnere215d962012-10-06 23:03:36 +02003044 va_list vargs2;
3045 const char *f;
3046 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003047
Victor Stinner8f674cc2013-04-17 23:02:17 +02003048 _PyUnicodeWriter_Init(&writer);
3049 writer.min_length = strlen(format) + 100;
3050 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003051
Benjamin Peterson0c212142016-09-20 20:39:33 -07003052 // Copy varags to be able to pass a reference to a subfunction.
3053 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003054
3055 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003056 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003057 f = unicode_fromformat_arg(&writer, f, &vargs2);
3058 if (f == NULL)
3059 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003060 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003061 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003062 const char *p;
3063 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003064
Victor Stinnere215d962012-10-06 23:03:36 +02003065 p = f;
3066 do
3067 {
3068 if ((unsigned char)*p > 127) {
3069 PyErr_Format(PyExc_ValueError,
3070 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3071 "string, got a non-ASCII byte: 0x%02x",
3072 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003073 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003074 }
3075 p++;
3076 }
3077 while (*p != '\0' && *p != '%');
3078 len = p - f;
3079
3080 if (*p == '\0')
3081 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003082
3083 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003084 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003085
3086 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003087 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003088 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003089 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003090 return _PyUnicodeWriter_Finish(&writer);
3091
3092 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003093 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003094 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003095 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003096}
3097
Walter Dörwaldd2034312007-05-18 16:29:38 +00003098PyObject *
3099PyUnicode_FromFormat(const char *format, ...)
3100{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003101 PyObject* ret;
3102 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003103
3104#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003105 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003106#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003107 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003108#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003109 ret = PyUnicode_FromFormatV(format, vargs);
3110 va_end(vargs);
3111 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003112}
3113
Serhiy Storchakac46db922018-10-23 22:58:24 +03003114static Py_ssize_t
3115unicode_get_widechar_size(PyObject *unicode)
3116{
3117 Py_ssize_t res;
3118
3119 assert(unicode != NULL);
3120 assert(_PyUnicode_CHECK(unicode));
3121
3122 if (_PyUnicode_WSTR(unicode) != NULL) {
3123 return PyUnicode_WSTR_LENGTH(unicode);
3124 }
3125 assert(PyUnicode_IS_READY(unicode));
3126
3127 res = _PyUnicode_LENGTH(unicode);
3128#if SIZEOF_WCHAR_T == 2
3129 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3130 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3131 const Py_UCS4 *end = s + res;
3132 for (; s < end; ++s) {
3133 if (*s > 0xFFFF) {
3134 ++res;
3135 }
3136 }
3137 }
3138#endif
3139 return res;
3140}
3141
3142static void
3143unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3144{
3145 const wchar_t *wstr;
3146
3147 assert(unicode != NULL);
3148 assert(_PyUnicode_CHECK(unicode));
3149
3150 wstr = _PyUnicode_WSTR(unicode);
3151 if (wstr != NULL) {
3152 memcpy(w, wstr, size * sizeof(wchar_t));
3153 return;
3154 }
3155 assert(PyUnicode_IS_READY(unicode));
3156
3157 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3158 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3159 for (; size--; ++s, ++w) {
3160 *w = *s;
3161 }
3162 }
3163 else {
3164#if SIZEOF_WCHAR_T == 4
3165 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3166 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3167 for (; size--; ++s, ++w) {
3168 *w = *s;
3169 }
3170#else
3171 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3172 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3173 for (; size--; ++s, ++w) {
3174 Py_UCS4 ch = *s;
3175 if (ch > 0xFFFF) {
3176 assert(ch <= MAX_UNICODE);
3177 /* encode surrogate pair in this case */
3178 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3179 if (!size--)
3180 break;
3181 *w = Py_UNICODE_LOW_SURROGATE(ch);
3182 }
3183 else {
3184 *w = ch;
3185 }
3186 }
3187#endif
3188 }
3189}
3190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003191#ifdef HAVE_WCHAR_H
3192
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003193/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003194
Victor Stinnerd88d9832011-09-06 02:00:05 +02003195 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003196 character) required to convert the unicode object. Ignore size argument.
3197
Victor Stinnerd88d9832011-09-06 02:00:05 +02003198 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003199 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003200 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003201Py_ssize_t
3202PyUnicode_AsWideChar(PyObject *unicode,
3203 wchar_t *w,
3204 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003205{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003206 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003207
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003208 if (unicode == NULL) {
3209 PyErr_BadInternalCall();
3210 return -1;
3211 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003212 if (!PyUnicode_Check(unicode)) {
3213 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003214 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003215 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003216
3217 res = unicode_get_widechar_size(unicode);
3218 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003219 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003220 }
3221
3222 if (size > res) {
3223 size = res + 1;
3224 }
3225 else {
3226 res = size;
3227 }
3228 unicode_copy_as_widechar(unicode, w, size);
3229 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003230}
3231
Victor Stinner137c34c2010-09-29 10:25:54 +00003232wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003233PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003234 Py_ssize_t *size)
3235{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003236 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003237 Py_ssize_t buflen;
3238
3239 if (unicode == NULL) {
3240 PyErr_BadInternalCall();
3241 return NULL;
3242 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003243 if (!PyUnicode_Check(unicode)) {
3244 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003245 return NULL;
3246 }
3247
Serhiy Storchakac46db922018-10-23 22:58:24 +03003248 buflen = unicode_get_widechar_size(unicode);
3249 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003250 if (buffer == NULL) {
3251 PyErr_NoMemory();
3252 return NULL;
3253 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003254 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3255 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003256 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003257 }
3258 else if (wcslen(buffer) != (size_t)buflen) {
3259 PyMem_FREE(buffer);
3260 PyErr_SetString(PyExc_ValueError,
3261 "embedded null character");
3262 return NULL;
3263 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003264 return buffer;
3265}
3266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003267#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268
Alexander Belopolsky40018472011-02-26 01:02:56 +00003269PyObject *
3270PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003271{
Victor Stinner8faf8212011-12-08 22:14:11 +01003272 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003273 PyErr_SetString(PyExc_ValueError,
3274 "chr() arg not in range(0x110000)");
3275 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003276 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003277
Victor Stinner985a82a2014-01-03 12:53:47 +01003278 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003279}
3280
Alexander Belopolsky40018472011-02-26 01:02:56 +00003281PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003282PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003283{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003284 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003286 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003287 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003288 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003289 Py_INCREF(obj);
3290 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003291 }
3292 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003293 /* For a Unicode subtype that's not a Unicode object,
3294 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003295 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003296 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003297 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003298 "Can't convert '%.100s' object to str implicitly",
3299 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003300 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003301}
3302
Alexander Belopolsky40018472011-02-26 01:02:56 +00003303PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003304PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003305 const char *encoding,
3306 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003307{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003308 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003309 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003310
Guido van Rossumd57fd912000-03-10 22:53:23 +00003311 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003312 PyErr_BadInternalCall();
3313 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003314 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003315
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003316 /* Decoding bytes objects is the most common case and should be fast */
3317 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003318 if (PyBytes_GET_SIZE(obj) == 0) {
3319 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3320 return NULL;
3321 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003322 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003323 }
3324 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003325 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3326 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003327 }
3328
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003329 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003330 PyErr_SetString(PyExc_TypeError,
3331 "decoding str is not supported");
3332 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003333 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003334
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003335 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3336 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3337 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003338 "decoding to str: need a bytes-like object, %.80s found",
3339 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003340 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003341 }
Tim Petersced69f82003-09-16 20:30:58 +00003342
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003343 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003344 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003345 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3346 return NULL;
3347 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003348 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003350
Serhiy Storchaka05997252013-01-26 12:14:02 +02003351 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003352 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003353 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003354}
3355
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters between two copied characters becomes a single '_',
   and such runs at the start or end of the name are dropped.  The result
   is written to lower (lower_len bytes, including the trailing null).

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    char *const out_end = &lower[lower_len - 1];   /* slot kept for '\0' */
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if another
               name character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3404
Alexander Belopolsky40018472011-02-26 01:02:56 +00003405PyObject *
3406PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003407 Py_ssize_t size,
3408 const char *encoding,
3409 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003410{
3411 PyObject *buffer = NULL, *unicode;
3412 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003413 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3414
Victor Stinner22eb6892019-06-26 00:51:05 +02003415 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3416 return NULL;
3417 }
3418
Victor Stinnered076ed2019-06-26 01:49:32 +02003419 if (size == 0) {
3420 _Py_RETURN_UNICODE_EMPTY();
3421 }
3422
Victor Stinner942889a2016-09-05 15:40:10 -07003423 if (encoding == NULL) {
3424 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3425 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003426
Fred Drakee4315f52000-05-09 19:53:39 +00003427 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003428 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3429 char *lower = buflower;
3430
3431 /* Fast paths */
3432 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3433 lower += 3;
3434 if (*lower == '_') {
3435 /* Match "utf8" and "utf_8" */
3436 lower++;
3437 }
3438
3439 if (lower[0] == '8' && lower[1] == 0) {
3440 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3441 }
3442 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3443 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3444 }
3445 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3446 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3447 }
3448 }
3449 else {
3450 if (strcmp(lower, "ascii") == 0
3451 || strcmp(lower, "us_ascii") == 0) {
3452 return PyUnicode_DecodeASCII(s, size, errors);
3453 }
Steve Dowercc16be82016-09-08 10:35:16 -07003454 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003455 else if (strcmp(lower, "mbcs") == 0) {
3456 return PyUnicode_DecodeMBCS(s, size, errors);
3457 }
3458 #endif
3459 else if (strcmp(lower, "latin1") == 0
3460 || strcmp(lower, "latin_1") == 0
3461 || strcmp(lower, "iso_8859_1") == 0
3462 || strcmp(lower, "iso8859_1") == 0) {
3463 return PyUnicode_DecodeLatin1(s, size, errors);
3464 }
3465 }
Victor Stinner37296e82010-06-10 13:36:23 +00003466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467
3468 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003469 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003470 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003471 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003472 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473 if (buffer == NULL)
3474 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003475 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 if (unicode == NULL)
3477 goto onError;
3478 if (!PyUnicode_Check(unicode)) {
3479 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003480 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003481 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003482 encoding,
3483 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 Py_DECREF(unicode);
3485 goto onError;
3486 }
3487 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003488 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003489
Benjamin Peterson29060642009-01-31 22:14:21 +00003490 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 Py_XDECREF(buffer);
3492 return NULL;
3493}
3494
Alexander Belopolsky40018472011-02-26 01:02:56 +00003495PyObject *
3496PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003497 const char *encoding,
3498 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003499{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003500 if (!PyUnicode_Check(unicode)) {
3501 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003502 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003503 }
3504
Serhiy Storchaka00939072016-10-27 21:05:49 +03003505 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3506 "PyUnicode_AsDecodedObject() is deprecated; "
3507 "use PyCodec_Decode() to decode from str", 1) < 0)
3508 return NULL;
3509
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003510 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003511 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003512
3513 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003514 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003515}
3516
Alexander Belopolsky40018472011-02-26 01:02:56 +00003517PyObject *
3518PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003519 const char *encoding,
3520 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003521{
3522 PyObject *v;
3523
3524 if (!PyUnicode_Check(unicode)) {
3525 PyErr_BadArgument();
3526 goto onError;
3527 }
3528
Serhiy Storchaka00939072016-10-27 21:05:49 +03003529 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3530 "PyUnicode_AsDecodedUnicode() is deprecated; "
3531 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3532 return NULL;
3533
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003534 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003536
3537 /* Decode via the codec registry */
3538 v = PyCodec_Decode(unicode, encoding, errors);
3539 if (v == NULL)
3540 goto onError;
3541 if (!PyUnicode_Check(v)) {
3542 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003543 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003544 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003545 encoding,
3546 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003547 Py_DECREF(v);
3548 goto onError;
3549 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003550 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003551
Benjamin Peterson29060642009-01-31 22:14:21 +00003552 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003553 return NULL;
3554}
3555
Alexander Belopolsky40018472011-02-26 01:02:56 +00003556PyObject *
3557PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003558 Py_ssize_t size,
3559 const char *encoding,
3560 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561{
3562 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003563
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003564 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003566 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003567 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3568 Py_DECREF(unicode);
3569 return v;
3570}
3571
Alexander Belopolsky40018472011-02-26 01:02:56 +00003572PyObject *
3573PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003574 const char *encoding,
3575 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003576{
3577 PyObject *v;
3578
3579 if (!PyUnicode_Check(unicode)) {
3580 PyErr_BadArgument();
3581 goto onError;
3582 }
3583
Serhiy Storchaka00939072016-10-27 21:05:49 +03003584 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3585 "PyUnicode_AsEncodedObject() is deprecated; "
3586 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3587 "or PyCodec_Encode() for generic encoding", 1) < 0)
3588 return NULL;
3589
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003590 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003591 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003592
3593 /* Encode via the codec registry */
3594 v = PyCodec_Encode(unicode, encoding, errors);
3595 if (v == NULL)
3596 goto onError;
3597 return v;
3598
Benjamin Peterson29060642009-01-31 22:14:21 +00003599 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003600 return NULL;
3601}
3602
Victor Stinner1b579672011-12-17 05:47:23 +01003603
Victor Stinner2cba6b82018-01-10 22:46:15 +01003604static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003605unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003606 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003607{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003608 Py_ssize_t wlen;
3609 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3610 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003611 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003612 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003613
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003614 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003615 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003616 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003617 return NULL;
3618 }
3619
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003620 char *str;
3621 size_t error_pos;
3622 const char *reason;
3623 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003624 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003625 PyMem_Free(wstr);
3626
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003627 if (res != 0) {
3628 if (res == -2) {
3629 PyObject *exc;
3630 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3631 "locale", unicode,
3632 (Py_ssize_t)error_pos,
3633 (Py_ssize_t)(error_pos+1),
3634 reason);
3635 if (exc != NULL) {
3636 PyCodec_StrictErrors(exc);
3637 Py_DECREF(exc);
3638 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003639 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003640 else if (res == -3) {
3641 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3642 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003643 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003644 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003645 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003646 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003647 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003648
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003649 PyObject *bytes = PyBytes_FromString(str);
3650 PyMem_RawFree(str);
3651 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003652}
3653
Victor Stinnerad158722010-10-27 00:25:46 +00003654PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003655PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3656{
Victor Stinner709d23d2019-05-02 14:56:30 -04003657 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3658 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003659}
3660
3661PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003662PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003663{
Victor Stinner81a7be32020-04-14 15:14:01 +02003664 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003665 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3666 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003667 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003668 fs_codec->error_handler,
3669 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003670 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003671#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003672 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003673 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003674 fs_codec->encoding,
3675 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003676 }
Victor Stinnerad158722010-10-27 00:25:46 +00003677#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003678 else {
3679 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3680 machinery is not ready and so cannot be used:
3681 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003682 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3683 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003684 assert(filesystem_errors != NULL);
3685 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3686 assert(errors != _Py_ERROR_UNKNOWN);
3687#ifdef _Py_FORCE_UTF8_FS_ENCODING
3688 return unicode_encode_utf8(unicode, errors, NULL);
3689#else
3690 return unicode_encode_locale(unicode, errors, 0);
3691#endif
3692 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003693}
3694
Alexander Belopolsky40018472011-02-26 01:02:56 +00003695PyObject *
3696PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003697 const char *encoding,
3698 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699{
3700 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003701 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003702
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 if (!PyUnicode_Check(unicode)) {
3704 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003705 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 }
Fred Drakee4315f52000-05-09 19:53:39 +00003707
Victor Stinner22eb6892019-06-26 00:51:05 +02003708 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3709 return NULL;
3710 }
3711
Victor Stinner942889a2016-09-05 15:40:10 -07003712 if (encoding == NULL) {
3713 return _PyUnicode_AsUTF8String(unicode, errors);
3714 }
3715
Fred Drakee4315f52000-05-09 19:53:39 +00003716 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003717 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3718 char *lower = buflower;
3719
3720 /* Fast paths */
3721 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3722 lower += 3;
3723 if (*lower == '_') {
3724 /* Match "utf8" and "utf_8" */
3725 lower++;
3726 }
3727
3728 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003730 }
3731 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3732 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3733 }
3734 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3735 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3736 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003737 }
Victor Stinner942889a2016-09-05 15:40:10 -07003738 else {
3739 if (strcmp(lower, "ascii") == 0
3740 || strcmp(lower, "us_ascii") == 0) {
3741 return _PyUnicode_AsASCIIString(unicode, errors);
3742 }
Steve Dowercc16be82016-09-08 10:35:16 -07003743#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003744 else if (strcmp(lower, "mbcs") == 0) {
3745 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3746 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003747#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003748 else if (strcmp(lower, "latin1") == 0 ||
3749 strcmp(lower, "latin_1") == 0 ||
3750 strcmp(lower, "iso_8859_1") == 0 ||
3751 strcmp(lower, "iso8859_1") == 0) {
3752 return _PyUnicode_AsLatin1String(unicode, errors);
3753 }
3754 }
Victor Stinner37296e82010-06-10 13:36:23 +00003755 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756
3757 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003758 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003760 return NULL;
3761
3762 /* The normal path */
3763 if (PyBytes_Check(v))
3764 return v;
3765
3766 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003767 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003768 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003769 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003770
3771 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003772 "encoder %s returned bytearray instead of bytes; "
3773 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003774 encoding);
3775 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003776 Py_DECREF(v);
3777 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003778 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003779
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003780 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3781 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003782 Py_DECREF(v);
3783 return b;
3784 }
3785
3786 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003787 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003788 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003789 encoding,
3790 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003791 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003792 return NULL;
3793}
3794
Alexander Belopolsky40018472011-02-26 01:02:56 +00003795PyObject *
3796PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003797 const char *encoding,
3798 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003799{
3800 PyObject *v;
3801
3802 if (!PyUnicode_Check(unicode)) {
3803 PyErr_BadArgument();
3804 goto onError;
3805 }
3806
Serhiy Storchaka00939072016-10-27 21:05:49 +03003807 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3808 "PyUnicode_AsEncodedUnicode() is deprecated; "
3809 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3810 return NULL;
3811
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003812 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003813 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003814
3815 /* Encode via the codec registry */
3816 v = PyCodec_Encode(unicode, encoding, errors);
3817 if (v == NULL)
3818 goto onError;
3819 if (!PyUnicode_Check(v)) {
3820 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003821 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003822 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003823 encoding,
3824 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003825 Py_DECREF(v);
3826 goto onError;
3827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003829
Benjamin Peterson29060642009-01-31 22:14:21 +00003830 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 return NULL;
3832}
3833
Victor Stinner2cba6b82018-01-10 22:46:15 +01003834static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003835unicode_decode_locale(const char *str, Py_ssize_t len,
3836 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003837{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003838 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3839 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003840 return NULL;
3841 }
3842
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003843 wchar_t *wstr;
3844 size_t wlen;
3845 const char *reason;
3846 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003847 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003848 if (res != 0) {
3849 if (res == -2) {
3850 PyObject *exc;
3851 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3852 "locale", str, len,
3853 (Py_ssize_t)wlen,
3854 (Py_ssize_t)(wlen + 1),
3855 reason);
3856 if (exc != NULL) {
3857 PyCodec_StrictErrors(exc);
3858 Py_DECREF(exc);
3859 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003860 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003861 else if (res == -3) {
3862 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3863 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003864 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003865 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003866 }
Victor Stinner2f197072011-12-17 07:08:30 +01003867 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003868 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003869
3870 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3871 PyMem_RawFree(wstr);
3872 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003873}
3874
3875PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003876PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3877 const char *errors)
3878{
Victor Stinner709d23d2019-05-02 14:56:30 -04003879 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3880 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003881}
3882
3883PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003884PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003885{
3886 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003887 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3888 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003889}
3890
3891
3892PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003893PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003894 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003895 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3896}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003897
Christian Heimes5894ba72007-11-04 11:43:14 +00003898PyObject*
3899PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3900{
Victor Stinner81a7be32020-04-14 15:14:01 +02003901 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003902 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3903 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003904 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003905 fs_codec->error_handler,
3906 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04003907 NULL);
3908 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003909#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003910 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003911 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003912 fs_codec->encoding,
3913 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003914 }
Victor Stinnerad158722010-10-27 00:25:46 +00003915#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003916 else {
3917 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3918 machinery is not ready and so cannot be used:
3919 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003920 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3921 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003922 assert(filesystem_errors != NULL);
3923 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3924 assert(errors != _Py_ERROR_UNKNOWN);
3925#ifdef _Py_FORCE_UTF8_FS_ENCODING
3926 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3927#else
3928 return unicode_decode_locale(s, size, errors, 0);
3929#endif
3930 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003931}
3932
Martin v. Löwis011e8422009-05-05 04:43:17 +00003933
3934int
3935PyUnicode_FSConverter(PyObject* arg, void* addr)
3936{
Brett Cannonec6ce872016-09-06 15:50:29 -07003937 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003938 PyObject *output = NULL;
3939 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03003940 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003941 if (arg == NULL) {
3942 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003943 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003944 return 1;
3945 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003946 path = PyOS_FSPath(arg);
3947 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003948 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003949 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003950 if (PyBytes_Check(path)) {
3951 output = path;
3952 }
3953 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3954 output = PyUnicode_EncodeFSDefault(path);
3955 Py_DECREF(path);
3956 if (!output) {
3957 return 0;
3958 }
3959 assert(PyBytes_Check(output));
3960 }
3961
Victor Stinner0ea2a462010-04-30 00:22:08 +00003962 size = PyBytes_GET_SIZE(output);
3963 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003964 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003965 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003966 Py_DECREF(output);
3967 return 0;
3968 }
3969 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003970 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003971}
3972
3973
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003974int
3975PyUnicode_FSDecoder(PyObject* arg, void* addr)
3976{
Brett Cannona5711202016-09-06 19:36:01 -07003977 int is_buffer = 0;
3978 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003979 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003980 if (arg == NULL) {
3981 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003982 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003983 return 1;
3984 }
Brett Cannona5711202016-09-06 19:36:01 -07003985
3986 is_buffer = PyObject_CheckBuffer(arg);
3987 if (!is_buffer) {
3988 path = PyOS_FSPath(arg);
3989 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003990 return 0;
3991 }
Brett Cannona5711202016-09-06 19:36:01 -07003992 }
3993 else {
3994 path = arg;
3995 Py_INCREF(arg);
3996 }
3997
3998 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003999 output = path;
4000 }
4001 else if (PyBytes_Check(path) || is_buffer) {
4002 PyObject *path_bytes = NULL;
4003
4004 if (!PyBytes_Check(path) &&
4005 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004006 "path should be string, bytes, or os.PathLike, not %.200s",
4007 Py_TYPE(arg)->tp_name)) {
4008 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004009 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004010 }
4011 path_bytes = PyBytes_FromObject(path);
4012 Py_DECREF(path);
4013 if (!path_bytes) {
4014 return 0;
4015 }
4016 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4017 PyBytes_GET_SIZE(path_bytes));
4018 Py_DECREF(path_bytes);
4019 if (!output) {
4020 return 0;
4021 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004022 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004023 else {
4024 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004025 "path should be string, bytes, or os.PathLike, not %.200s",
4026 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004027 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004028 return 0;
4029 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004030 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004031 Py_DECREF(output);
4032 return 0;
4033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004035 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004036 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004037 Py_DECREF(output);
4038 return 0;
4039 }
4040 *(PyObject**)addr = output;
4041 return Py_CLEANUP_SUPPORTED;
4042}
4043
4044
Inada Naoki02a4d572020-02-27 13:48:59 +09004045static int unicode_fill_utf8(PyObject *unicode);
4046
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004047const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004049{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004050 if (!PyUnicode_Check(unicode)) {
4051 PyErr_BadArgument();
4052 return NULL;
4053 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004054 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004055 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004057 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004058 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 return NULL;
4060 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 }
4062
4063 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004064 *psize = PyUnicode_UTF8_LENGTH(unicode);
4065 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004066}
4067
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004068const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004071 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4072}
4073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004074Py_UNICODE *
4075PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4076{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004077 if (!PyUnicode_Check(unicode)) {
4078 PyErr_BadArgument();
4079 return NULL;
4080 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004081 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4082 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004084 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004085 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004086
Serhiy Storchakac46db922018-10-23 22:58:24 +03004087 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4088 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4089 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004090 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004091 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004092 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4093 if (w == NULL) {
4094 PyErr_NoMemory();
4095 return NULL;
4096 }
4097 unicode_copy_as_widechar(unicode, w, wlen + 1);
4098 _PyUnicode_WSTR(unicode) = w;
4099 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4100 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004101 }
4102 }
4103 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004104 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004105 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004106}
4107
Inada Naoki2c4928d2020-06-17 20:09:44 +09004108/* Deprecated APIs */
4109
4110_Py_COMP_DIAG_PUSH
4111_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4112
Alexander Belopolsky40018472011-02-26 01:02:56 +00004113Py_UNICODE *
4114PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117}
4118
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004119const Py_UNICODE *
4120_PyUnicode_AsUnicode(PyObject *unicode)
4121{
4122 Py_ssize_t size;
4123 const Py_UNICODE *wstr;
4124
4125 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4126 if (wstr && wcslen(wstr) != (size_t)size) {
4127 PyErr_SetString(PyExc_ValueError, "embedded null character");
4128 return NULL;
4129 }
4130 return wstr;
4131}
4132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004133
Alexander Belopolsky40018472011-02-26 01:02:56 +00004134Py_ssize_t
4135PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136{
4137 if (!PyUnicode_Check(unicode)) {
4138 PyErr_BadArgument();
4139 goto onError;
4140 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004141 if (_PyUnicode_WSTR(unicode) == NULL) {
4142 if (PyUnicode_AsUnicode(unicode) == NULL)
4143 goto onError;
4144 }
4145 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 return -1;
4149}
4150
Inada Naoki2c4928d2020-06-17 20:09:44 +09004151_Py_COMP_DIAG_POP
4152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004153Py_ssize_t
4154PyUnicode_GetLength(PyObject *unicode)
4155{
Victor Stinner07621332012-06-16 04:53:46 +02004156 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004157 PyErr_BadArgument();
4158 return -1;
4159 }
Victor Stinner07621332012-06-16 04:53:46 +02004160 if (PyUnicode_READY(unicode) == -1)
4161 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004162 return PyUnicode_GET_LENGTH(unicode);
4163}
4164
4165Py_UCS4
4166PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4167{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004168 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004169 int kind;
4170
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004171 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004172 PyErr_BadArgument();
4173 return (Py_UCS4)-1;
4174 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004175 if (PyUnicode_READY(unicode) == -1) {
4176 return (Py_UCS4)-1;
4177 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004178 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004179 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004180 return (Py_UCS4)-1;
4181 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004182 data = PyUnicode_DATA(unicode);
4183 kind = PyUnicode_KIND(unicode);
4184 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004185}
4186
4187int
4188PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4189{
4190 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004191 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004192 return -1;
4193 }
Victor Stinner488fa492011-12-12 00:01:39 +01004194 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004195 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004196 PyErr_SetString(PyExc_IndexError, "string index out of range");
4197 return -1;
4198 }
Victor Stinner488fa492011-12-12 00:01:39 +01004199 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004200 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004201 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4202 PyErr_SetString(PyExc_ValueError, "character out of range");
4203 return -1;
4204 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004205 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4206 index, ch);
4207 return 0;
4208}
4209
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4215
Victor Stinner554f3f02010-06-16 23:33:54 +00004216/* create or adjust a UnicodeDecodeError */
4217static void
4218make_decode_exception(PyObject **exceptionObject,
4219 const char *encoding,
4220 const char *input, Py_ssize_t length,
4221 Py_ssize_t startpos, Py_ssize_t endpos,
4222 const char *reason)
4223{
4224 if (*exceptionObject == NULL) {
4225 *exceptionObject = PyUnicodeDecodeError_Create(
4226 encoding, input, length, startpos, endpos, reason);
4227 }
4228 else {
4229 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4230 goto onError;
4231 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4232 goto onError;
4233 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4234 goto onError;
4235 }
4236 return;
4237
4238onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004239 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004240}
4241
Steve Dowercc16be82016-09-08 10:35:16 -07004242#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004243static int
4244widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4245{
4246 if (newsize > *size) {
4247 wchar_t *newbuf = *buf;
4248 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4249 PyErr_NoMemory();
4250 return -1;
4251 }
4252 *buf = newbuf;
4253 }
4254 *size = newsize;
4255 return 0;
4256}
4257
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258/* error handling callback helper:
4259 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004260 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261 and adjust various state variables.
4262 return 0 on success, -1 on error
4263*/
4264
Alexander Belopolsky40018472011-02-26 01:02:56 +00004265static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004266unicode_decode_call_errorhandler_wchar(
4267 const char *errors, PyObject **errorHandler,
4268 const char *encoding, const char *reason,
4269 const char **input, const char **inend, Py_ssize_t *startinpos,
4270 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004271 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004273 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274
4275 PyObject *restuple = NULL;
4276 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004277 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004278 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004279 Py_ssize_t requiredsize;
4280 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004281 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004282 wchar_t *repwstr;
4283 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004284
4285 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 *errorHandler = PyCodec_LookupError(errors);
4287 if (*errorHandler == NULL)
4288 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004289 }
4290
Victor Stinner554f3f02010-06-16 23:33:54 +00004291 make_decode_exception(exceptionObject,
4292 encoding,
4293 *input, *inend - *input,
4294 *startinpos, *endinpos,
4295 reason);
4296 if (*exceptionObject == NULL)
4297 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004298
Petr Viktorinffd97532020-02-11 17:46:57 +01004299 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004300 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004301 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004302 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004303 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004304 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004306 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004307 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004308
4309 /* Copy back the bytes variables, which might have been modified by the
4310 callback */
4311 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4312 if (!inputobj)
4313 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004314 *input = PyBytes_AS_STRING(inputobj);
4315 insize = PyBytes_GET_SIZE(inputobj);
4316 *inend = *input + insize;
4317 /* we can DECREF safely, as the exception has another reference,
4318 so the object won't go away. */
4319 Py_DECREF(inputobj);
4320
4321 if (newpos<0)
4322 newpos = insize+newpos;
4323 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004324 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004325 goto onError;
4326 }
4327
4328 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4329 if (repwstr == NULL)
4330 goto onError;
4331 /* need more space? (at least enough for what we
4332 have+the replacement+the rest of the string (starting
4333 at the new input position), so we won't have to check space
4334 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004335 requiredsize = *outpos;
4336 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4337 goto overflow;
4338 requiredsize += repwlen;
4339 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4340 goto overflow;
4341 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004342 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004343 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004344 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004345 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004346 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004347 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004348 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004349 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004350 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004351 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004352 *endinpos = newpos;
4353 *inptr = *input + newpos;
4354
4355 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004356 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004357 return 0;
4358
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004359 overflow:
4360 PyErr_SetString(PyExc_OverflowError,
4361 "decoded result is too long for a Python string");
4362
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004363 onError:
4364 Py_XDECREF(restuple);
4365 return -1;
4366}
Steve Dowercc16be82016-09-08 10:35:16 -07004367#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004368
4369static int
4370unicode_decode_call_errorhandler_writer(
4371 const char *errors, PyObject **errorHandler,
4372 const char *encoding, const char *reason,
4373 const char **input, const char **inend, Py_ssize_t *startinpos,
4374 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4375 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4376{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004377 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004378
4379 PyObject *restuple = NULL;
4380 PyObject *repunicode = NULL;
4381 Py_ssize_t insize;
4382 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004383 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004384 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004385 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004386 int need_to_grow = 0;
4387 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004388
4389 if (*errorHandler == NULL) {
4390 *errorHandler = PyCodec_LookupError(errors);
4391 if (*errorHandler == NULL)
4392 goto onError;
4393 }
4394
4395 make_decode_exception(exceptionObject,
4396 encoding,
4397 *input, *inend - *input,
4398 *startinpos, *endinpos,
4399 reason);
4400 if (*exceptionObject == NULL)
4401 goto onError;
4402
Petr Viktorinffd97532020-02-11 17:46:57 +01004403 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004404 if (restuple == NULL)
4405 goto onError;
4406 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004407 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004408 goto onError;
4409 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004410 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004411 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004412
4413 /* Copy back the bytes variables, which might have been modified by the
4414 callback */
4415 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4416 if (!inputobj)
4417 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004418 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004419 *input = PyBytes_AS_STRING(inputobj);
4420 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004421 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004422 /* we can DECREF safely, as the exception has another reference,
4423 so the object won't go away. */
4424 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004425
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004428 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004429 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004430 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004431 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432
Victor Stinner170ca6f2013-04-18 00:25:28 +02004433 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004434 if (replen > 1) {
4435 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004436 need_to_grow = 1;
4437 }
4438 new_inptr = *input + newpos;
4439 if (*inend - new_inptr > remain) {
4440 /* We don't know the decoding algorithm here so we make the worst
4441 assumption that one byte decodes to one unicode character.
4442 If unfortunately one byte could decode to more unicode characters,
4443 the decoder may write out-of-bound then. Is it possible for the
4444 algorithms using this function? */
4445 writer->min_length += *inend - new_inptr - remain;
4446 need_to_grow = 1;
4447 }
4448 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004449 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004450 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004451 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4452 goto onError;
4453 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004454 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004455 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004456
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004457 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004458 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004459
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004460 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004461 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004462 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463
Benjamin Peterson29060642009-01-31 22:14:21 +00004464 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004466 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467}
4468
/* --- UTF-7 Codec -------------------------------------------------------- */
4470
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4472
/* Three simple macros defining base-64. */

/* IS_BASE64: non-zero if c is a base-64 character, i.e. an ASCII letter,
   an ASCII digit, '+' or '/'.  The argument is evaluated several times. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))
4482
/* FROM_BASE64: the 6-bit value (0..63) of the base-64 character c.
   Only meaningful when IS_BASE64(c) holds: any character that is not a
   letter, a digit or '+' yields 63, the value of '/'.
   The argument is evaluated several times. */

#define FROM_BASE64(c)                                      \
    (((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     (c) == '+' ? 62 : 63)
4490
/* TO_BASE64: the base-64 character for the bottom 6 bits of n; higher
   bits of n are ignored. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])
4495
/* DECODE_DIRECT: non-zero if the byte c, found in a UTF-7 string outside
 * a base-64 section, should be decoded as itself.  Decoding is
 * permissive: every value up to 127 decodes to itself except '+', which
 * begins a base-64 section.  The argument is evaluated twice. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4503
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This table maps each ASCII code (0..127) to
 * its set:
 *       0 : "Set D"
 *           alphanumeric and '(),-./:?
 *       1 : "Set O"
 *           !"#$%&*;<=>@[]^_`{|}
 *       2 : "whitespace"
 *           ht nl cr sp
 *       3 : special (must be base64 encoded)
 *           everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4537
/* ENCODE_DIRECT: non-zero if character c should be encoded as itself.
 * The answer depends on whether we are encoding set O as itself
 * (directO), and also on whether we are encoding whitespace as itself
 * (directWS).  RFC2152 makes it clear that the answers to these
 * questions vary between applications, so this code needs to be
 * flexible.
 *
 * Only characters in the range 1..127 can be direct; Set D characters
 * always are.  Every argument is parenthesized in the expansion so that
 * compound expressions (e.g. `a || b`) can be passed for the flags.
 * c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004549
Alexander Belopolsky40018472011-02-26 01:02:56 +00004550PyObject *
4551PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004552 Py_ssize_t size,
4553 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004555 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4556}
4557
Antoine Pitrou244651a2009-05-04 18:56:13 +00004558/* The decoder. The only state we preserve is our read position,
4559 * i.e. how many characters we have consumed. So if we end in the
4560 * middle of a shift sequence we have to back off the read position
4561 * and the output to the beginning of the sequence, otherwise we lose
4562 * all the shift state (seen bits, number of bits seen, high
4563 * surrogate). */
4564
Alexander Belopolsky40018472011-02-26 01:02:56 +00004565PyObject *
4566PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004567 Py_ssize_t size,
4568 const char *errors,
4569 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004570{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004572 Py_ssize_t startinpos;
4573 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004574 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004575 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004576 const char *errmsg = "";
4577 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004578 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579 unsigned int base64bits = 0;
4580 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004581 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582 PyObject *errorHandler = NULL;
4583 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004585 if (size == 0) {
4586 if (consumed)
4587 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004588 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004589 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004590
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004591 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004592 _PyUnicodeWriter_Init(&writer);
4593 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004594
4595 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004596 e = s + size;
4597
4598 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004599 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004600 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004601 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004602
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 if (inShift) { /* in a base-64 section */
4604 if (IS_BASE64(ch)) { /* consume a base-64 character */
4605 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4606 base64bits += 6;
4607 s++;
4608 if (base64bits >= 16) {
4609 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004610 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 base64bits -= 16;
4612 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004613 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 if (surrogate) {
4615 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004616 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4617 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004618 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004619 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004620 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004621 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 }
4623 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004624 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004625 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004626 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 }
4628 }
Victor Stinner551ac952011-11-29 22:58:13 +01004629 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004630 /* first surrogate */
4631 surrogate = outCh;
4632 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004634 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004635 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004636 }
4637 }
4638 }
4639 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004640 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004641 if (base64bits > 0) { /* left-over bits */
4642 if (base64bits >= 6) {
4643 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004644 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 errmsg = "partial character in shift sequence";
4646 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004648 else {
4649 /* Some bits remain; they should be zero */
4650 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004651 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004652 errmsg = "non-zero padding bits in shift sequence";
4653 goto utf7Error;
4654 }
4655 }
4656 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004657 if (surrogate && DECODE_DIRECT(ch)) {
4658 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4659 goto onError;
4660 }
4661 surrogate = 0;
4662 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 /* '-' is absorbed; other terminating
4664 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004665 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004666 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004667 }
4668 }
4669 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004671 s++; /* consume '+' */
4672 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004673 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004674 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004675 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004676 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004677 else if (s < e && !IS_BASE64(*s)) {
4678 s++;
4679 errmsg = "ill-formed sequence";
4680 goto utf7Error;
4681 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004682 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004683 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004684 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004685 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004686 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004687 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004688 }
4689 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004690 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004691 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004692 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004693 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004694 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004695 else {
4696 startinpos = s-starts;
4697 s++;
4698 errmsg = "unexpected special character";
4699 goto utf7Error;
4700 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004701 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004702utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004704 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004705 errors, &errorHandler,
4706 "utf7", errmsg,
4707 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004708 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004709 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004710 }
4711
Antoine Pitrou244651a2009-05-04 18:56:13 +00004712 /* end of string */
4713
4714 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4715 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004716 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004717 if (surrogate ||
4718 (base64bits >= 6) ||
4719 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004720 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004721 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004722 errors, &errorHandler,
4723 "utf7", "unterminated shift sequence",
4724 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004725 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 goto onError;
4727 if (s < e)
4728 goto restart;
4729 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004730 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004731
4732 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004733 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004734 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004735 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004736 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004737 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004738 writer.kind, writer.data, shiftOutStart);
4739 Py_XDECREF(errorHandler);
4740 Py_XDECREF(exc);
4741 _PyUnicodeWriter_Dealloc(&writer);
4742 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004743 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004744 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004745 }
4746 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004747 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004748 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004749 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004750
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751 Py_XDECREF(errorHandler);
4752 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004753 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004754
Benjamin Peterson29060642009-01-31 22:14:21 +00004755 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004756 Py_XDECREF(errorHandler);
4757 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004758 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004759 return NULL;
4760}
4761
4762
Alexander Belopolsky40018472011-02-26 01:02:56 +00004763PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004764_PyUnicode_EncodeUTF7(PyObject *str,
4765 int base64SetO,
4766 int base64WhiteSpace,
4767 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004768{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004769 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004770 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004771 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004772 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004773 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004774 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004775 unsigned int base64bits = 0;
4776 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004777 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004778 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004779
Benjamin Petersonbac79492012-01-14 13:34:47 -05004780 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004781 return NULL;
4782 kind = PyUnicode_KIND(str);
4783 data = PyUnicode_DATA(str);
4784 len = PyUnicode_GET_LENGTH(str);
4785
4786 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004788
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004789 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004790 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004791 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004792 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004793 if (v == NULL)
4794 return NULL;
4795
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004796 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004797 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004798 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004799
Antoine Pitrou244651a2009-05-04 18:56:13 +00004800 if (inShift) {
4801 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4802 /* shifting out */
4803 if (base64bits) { /* output remaining bits */
4804 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4805 base64buffer = 0;
4806 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004807 }
4808 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004809 /* Characters not in the BASE64 set implicitly unshift the sequence
4810 so no '-' is required, except if the character is itself a '-' */
4811 if (IS_BASE64(ch) || ch == '-') {
4812 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004813 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004814 *out++ = (char) ch;
4815 }
4816 else {
4817 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004818 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004819 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004820 else { /* not in a shift sequence */
4821 if (ch == '+') {
4822 *out++ = '+';
4823 *out++ = '-';
4824 }
4825 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4826 *out++ = (char) ch;
4827 }
4828 else {
4829 *out++ = '+';
4830 inShift = 1;
4831 goto encode_char;
4832 }
4833 }
4834 continue;
4835encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004836 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004837 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004838
Antoine Pitrou244651a2009-05-04 18:56:13 +00004839 /* code first surrogate */
4840 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004841 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004842 while (base64bits >= 6) {
4843 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4844 base64bits -= 6;
4845 }
4846 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004847 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004848 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004849 base64bits += 16;
4850 base64buffer = (base64buffer << 16) | ch;
4851 while (base64bits >= 6) {
4852 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4853 base64bits -= 6;
4854 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004855 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004856 if (base64bits)
4857 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4858 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004859 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004860 if (_PyBytes_Resize(&v, out - start) < 0)
4861 return NULL;
4862 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004863}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004864PyObject *
4865PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4866 Py_ssize_t size,
4867 int base64SetO,
4868 int base64WhiteSpace,
4869 const char *errors)
4870{
4871 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004872 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004873 if (tmp == NULL)
4874 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004875 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004876 base64WhiteSpace, errors);
4877 Py_DECREF(tmp);
4878 return result;
4879}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004880
Antoine Pitrou244651a2009-05-04 18:56:13 +00004881#undef IS_BASE64
4882#undef FROM_BASE64
4883#undef TO_BASE64
4884#undef DECODE_DIRECT
4885#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004886
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887/* --- UTF-8 Codec -------------------------------------------------------- */
4888
Alexander Belopolsky40018472011-02-26 01:02:56 +00004889PyObject *
4890PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004891 Py_ssize_t size,
4892 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893{
Walter Dörwald69652032004-09-07 20:24:22 +00004894 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4895}
4896
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004897#include "stringlib/asciilib.h"
4898#include "stringlib/codecs.h"
4899#include "stringlib/undef.h"
4900
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004901#include "stringlib/ucs1lib.h"
4902#include "stringlib/codecs.h"
4903#include "stringlib/undef.h"
4904
4905#include "stringlib/ucs2lib.h"
4906#include "stringlib/codecs.h"
4907#include "stringlib/undef.h"
4908
4909#include "stringlib/ucs4lib.h"
4910#include "stringlib/codecs.h"
4911#include "stringlib/undef.h"
4912
Antoine Pitrouab868312009-01-10 15:40:25 +00004913/* Mask to quickly check whether a C 'long' contains a
4914 non-ASCII, UTF8-encoded char. */
4915#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004916# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004917#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004918# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004919#else
4920# error C 'long' size should be either 4 or 8!
4921#endif
4922
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004923static Py_ssize_t
4924ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004925{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004927 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004928
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004929 /*
4930 * Issue #17237: m68k is a bit different from most architectures in
4931 * that objects do not use "natural alignment" - for example, int and
4932 * long are only aligned at 2-byte boundaries. Therefore the assert()
4933 * won't work; also, tests have shown that skipping the "optimised
4934 * version" will even speed up m68k.
4935 */
4936#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004938 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4939 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940 /* Fast path, see in STRINGLIB(utf8_decode) for
4941 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004942 /* Help allocation */
4943 const char *_p = p;
4944 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004945 while (_p < aligned_end) {
4946 unsigned long value = *(const unsigned long *) _p;
4947 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004948 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004949 *((unsigned long *)q) = value;
4950 _p += SIZEOF_LONG;
4951 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004952 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004953 p = _p;
4954 while (p < end) {
4955 if ((unsigned char)*p & 0x80)
4956 break;
4957 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004959 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004962#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004963 while (p < end) {
4964 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4965 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004966 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004967 /* Help allocation */
4968 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004969 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004970 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004971 if (value & ASCII_CHAR_MASK)
4972 break;
4973 _p += SIZEOF_LONG;
4974 }
4975 p = _p;
4976 if (_p == end)
4977 break;
4978 }
4979 if ((unsigned char)*p & 0x80)
4980 break;
4981 ++p;
4982 }
4983 memcpy(dest, start, p - start);
4984 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985}
Antoine Pitrouab868312009-01-10 15:40:25 +00004986
Victor Stinner709d23d2019-05-02 14:56:30 -04004987static PyObject *
4988unicode_decode_utf8(const char *s, Py_ssize_t size,
4989 _Py_error_handler error_handler, const char *errors,
4990 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004991{
Victor Stinner785938e2011-12-11 20:09:03 +01004992 if (size == 0) {
4993 if (consumed)
4994 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004995 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004996 }
4997
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4999 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005000 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005002 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005004 }
5005
Inada Naoki770847a2019-06-24 12:30:24 +09005006 const char *starts = s;
5007 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005008
Inada Naoki770847a2019-06-24 12:30:24 +09005009 // fast path: try ASCII string.
5010 PyObject *u = PyUnicode_New(size, 127);
5011 if (u == NULL) {
5012 return NULL;
5013 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005014 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005015 if (s == end) {
5016 return u;
5017 }
5018
5019 // Use _PyUnicodeWriter after fast path is failed.
5020 _PyUnicodeWriter writer;
5021 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5022 writer.pos = s - starts;
5023
5024 Py_ssize_t startinpos, endinpos;
5025 const char *errmsg = "";
5026 PyObject *error_handler_obj = NULL;
5027 PyObject *exc = NULL;
5028
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005029 while (s < end) {
5030 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005031 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005032
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005033 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005034 if (PyUnicode_IS_ASCII(writer.buffer))
5035 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005036 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005037 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005038 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005039 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005040 } else {
5041 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005042 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043 }
5044
5045 switch (ch) {
5046 case 0:
5047 if (s == end || consumed)
5048 goto End;
5049 errmsg = "unexpected end of data";
5050 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005051 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005052 break;
5053 case 1:
5054 errmsg = "invalid start byte";
5055 startinpos = s - starts;
5056 endinpos = startinpos + 1;
5057 break;
5058 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005059 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5060 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5061 {
5062 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005063 goto End;
5064 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005065 /* fall through */
5066 case 3:
5067 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005068 errmsg = "invalid continuation byte";
5069 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005070 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005071 break;
5072 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005073 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005074 goto onError;
5075 continue;
5076 }
5077
Victor Stinner1d65d912015-10-05 13:43:50 +02005078 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005079 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005080
5081 switch (error_handler) {
5082 case _Py_ERROR_IGNORE:
5083 s += (endinpos - startinpos);
5084 break;
5085
5086 case _Py_ERROR_REPLACE:
5087 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5088 goto onError;
5089 s += (endinpos - startinpos);
5090 break;
5091
5092 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005093 {
5094 Py_ssize_t i;
5095
Victor Stinner1d65d912015-10-05 13:43:50 +02005096 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5097 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005098 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005099 ch = (Py_UCS4)(unsigned char)(starts[i]);
5100 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5101 ch + 0xdc00);
5102 writer.pos++;
5103 }
5104 s += (endinpos - startinpos);
5105 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005106 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005107
5108 default:
5109 if (unicode_decode_call_errorhandler_writer(
5110 errors, &error_handler_obj,
5111 "utf-8", errmsg,
5112 &starts, &end, &startinpos, &endinpos, &exc, &s,
5113 &writer))
5114 goto onError;
5115 }
Victor Stinner785938e2011-12-11 20:09:03 +01005116 }
5117
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005118End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005119 if (consumed)
5120 *consumed = s - starts;
5121
Victor Stinner1d65d912015-10-05 13:43:50 +02005122 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005123 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005124 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005125
5126onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005127 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005128 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005129 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005130 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005131}
5132
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005133
Victor Stinner709d23d2019-05-02 14:56:30 -04005134PyObject *
5135PyUnicode_DecodeUTF8Stateful(const char *s,
5136 Py_ssize_t size,
5137 const char *errors,
5138 Py_ssize_t *consumed)
5139{
5140 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5141}
5142
5143
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005144/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5145 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005146
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005147 On success, write a pointer to a newly allocated wide character string into
5148 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5149 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005150
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005151 On memory allocation failure, return -1.
5152
5153 On decoding error (if surrogateescape is zero), return -2. If wlen is
5154 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5155 is not NULL, write the decoding error message into *reason. */
5156int
5157_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005158 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005159{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005160 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005161 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005162 wchar_t *unicode;
5163 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005164
Victor Stinner3d4226a2018-08-29 22:21:32 +02005165 int surrogateescape = 0;
5166 int surrogatepass = 0;
5167 switch (errors)
5168 {
5169 case _Py_ERROR_STRICT:
5170 break;
5171 case _Py_ERROR_SURROGATEESCAPE:
5172 surrogateescape = 1;
5173 break;
5174 case _Py_ERROR_SURROGATEPASS:
5175 surrogatepass = 1;
5176 break;
5177 default:
5178 return -3;
5179 }
5180
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005181 /* Note: size will always be longer than the resulting Unicode
5182 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005183 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005184 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005185 }
5186
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005187 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005188 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005189 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005190 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005191
5192 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005193 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005194 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005195 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005196 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005197#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005198 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005199#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005200 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005201#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005202 if (ch > 0xFF) {
5203#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005204 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005205#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005206 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005207 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005208 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5209 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5210#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005211 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005212 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005213 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005214 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005215 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005216
5217 if (surrogateescape) {
5218 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5219 }
5220 else {
5221 /* Is it a valid three-byte code? */
5222 if (surrogatepass
5223 && (e - s) >= 3
5224 && (s[0] & 0xf0) == 0xe0
5225 && (s[1] & 0xc0) == 0x80
5226 && (s[2] & 0xc0) == 0x80)
5227 {
5228 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5229 s += 3;
5230 unicode[outpos++] = ch;
5231 }
5232 else {
5233 PyMem_RawFree(unicode );
5234 if (reason != NULL) {
5235 switch (ch) {
5236 case 0:
5237 *reason = "unexpected end of data";
5238 break;
5239 case 1:
5240 *reason = "invalid start byte";
5241 break;
5242 /* 2, 3, 4 */
5243 default:
5244 *reason = "invalid continuation byte";
5245 break;
5246 }
5247 }
5248 if (wlen != NULL) {
5249 *wlen = s - orig_s;
5250 }
5251 return -2;
5252 }
5253 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005254 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005255 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005256 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005257 if (wlen) {
5258 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005259 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005260 *wstr = unicode;
5261 return 0;
5262}
5263
Victor Stinner5f9cf232019-03-19 01:46:25 +01005264
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005265wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005266_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5267 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005268{
5269 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005270 int res = _Py_DecodeUTF8Ex(arg, arglen,
5271 &wstr, wlen,
5272 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005273 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005274 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5275 assert(res != -3);
5276 if (wlen) {
5277 *wlen = (size_t)res;
5278 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005279 return NULL;
5280 }
5281 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005282}
5283
Antoine Pitrouab868312009-01-10 15:40:25 +00005284
Victor Stinnere47e6982017-12-21 15:45:16 +01005285/* UTF-8 encoder using the surrogateescape error handler .
5286
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005287 On success, return 0 and write the newly allocated character string (use
5288 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005289
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005290 On encoding failure, return -2 and write the position of the invalid
5291 surrogate character into *error_pos (if error_pos is set) and the decoding
5292 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005293
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005294 On memory allocation failure, return -1. */
5295int
5296_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005297 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005298{
5299 const Py_ssize_t max_char_size = 4;
5300 Py_ssize_t len = wcslen(text);
5301
5302 assert(len >= 0);
5303
Victor Stinner3d4226a2018-08-29 22:21:32 +02005304 int surrogateescape = 0;
5305 int surrogatepass = 0;
5306 switch (errors)
5307 {
5308 case _Py_ERROR_STRICT:
5309 break;
5310 case _Py_ERROR_SURROGATEESCAPE:
5311 surrogateescape = 1;
5312 break;
5313 case _Py_ERROR_SURROGATEPASS:
5314 surrogatepass = 1;
5315 break;
5316 default:
5317 return -3;
5318 }
5319
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005320 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5321 return -1;
5322 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005323 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005324 if (raw_malloc) {
5325 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005326 }
5327 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005328 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005329 }
5330 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005331 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005332 }
5333
5334 char *p = bytes;
5335 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005336 for (i = 0; i < len; ) {
5337 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005338 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005339 i++;
5340#if Py_UNICODE_SIZE == 2
5341 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5342 && i < len
5343 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5344 {
5345 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5346 i++;
5347 }
5348#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005349
5350 if (ch < 0x80) {
5351 /* Encode ASCII */
5352 *p++ = (char) ch;
5353
5354 }
5355 else if (ch < 0x0800) {
5356 /* Encode Latin-1 */
5357 *p++ = (char)(0xc0 | (ch >> 6));
5358 *p++ = (char)(0x80 | (ch & 0x3f));
5359 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005360 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005361 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005362 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005363 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005364 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005365 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005366 if (reason != NULL) {
5367 *reason = "encoding error";
5368 }
5369 if (raw_malloc) {
5370 PyMem_RawFree(bytes);
5371 }
5372 else {
5373 PyMem_Free(bytes);
5374 }
5375 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005376 }
5377 *p++ = (char)(ch & 0xff);
5378 }
5379 else if (ch < 0x10000) {
5380 *p++ = (char)(0xe0 | (ch >> 12));
5381 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5382 *p++ = (char)(0x80 | (ch & 0x3f));
5383 }
5384 else { /* ch >= 0x10000 */
5385 assert(ch <= MAX_UNICODE);
5386 /* Encode UCS4 Unicode ordinals */
5387 *p++ = (char)(0xf0 | (ch >> 18));
5388 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5389 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5390 *p++ = (char)(0x80 | (ch & 0x3f));
5391 }
5392 }
5393 *p++ = '\0';
5394
5395 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005396 char *bytes2;
5397 if (raw_malloc) {
5398 bytes2 = PyMem_RawRealloc(bytes, final_size);
5399 }
5400 else {
5401 bytes2 = PyMem_Realloc(bytes, final_size);
5402 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005403 if (bytes2 == NULL) {
5404 if (error_pos != NULL) {
5405 *error_pos = (size_t)-1;
5406 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005407 if (raw_malloc) {
5408 PyMem_RawFree(bytes);
5409 }
5410 else {
5411 PyMem_Free(bytes);
5412 }
5413 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005414 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005415 *str = bytes2;
5416 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005417}
5418
5419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005420/* Primary internal function which creates utf8 encoded bytes objects.
5421
5422 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005423 and allocate exactly as much space needed at the end. Else allocate the
5424 maximum possible needed (4 result bytes per Unicode character), and return
5425 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005426*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005427static PyObject *
5428unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5429 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005431 if (!PyUnicode_Check(unicode)) {
5432 PyErr_BadArgument();
5433 return NULL;
5434 }
5435
5436 if (PyUnicode_READY(unicode) == -1)
5437 return NULL;
5438
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005439 if (PyUnicode_UTF8(unicode))
5440 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5441 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005442
Inada Naoki02a4d572020-02-27 13:48:59 +09005443 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005444 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005445 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5446
5447 _PyBytesWriter writer;
5448 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005449
Benjamin Petersonead6b532011-12-20 17:23:42 -06005450 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005451 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005452 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005453 case PyUnicode_1BYTE_KIND:
5454 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5455 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005456 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5457 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005458 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005459 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5460 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005461 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005462 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5463 break;
Tim Peters602f7402002-04-27 18:03:26 +00005464 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005465
5466 if (end == NULL) {
5467 _PyBytesWriter_Dealloc(&writer);
5468 return NULL;
5469 }
5470 return _PyBytesWriter_Finish(&writer, end);
5471}
5472
5473static int
5474unicode_fill_utf8(PyObject *unicode)
5475{
5476 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5477 assert(!PyUnicode_IS_ASCII(unicode));
5478
5479 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005480 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005481 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5482
5483 _PyBytesWriter writer;
5484 char *end;
5485
5486 switch (kind) {
5487 default:
5488 Py_UNREACHABLE();
5489 case PyUnicode_1BYTE_KIND:
5490 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5491 _Py_ERROR_STRICT, NULL);
5492 break;
5493 case PyUnicode_2BYTE_KIND:
5494 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5495 _Py_ERROR_STRICT, NULL);
5496 break;
5497 case PyUnicode_4BYTE_KIND:
5498 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5499 _Py_ERROR_STRICT, NULL);
5500 break;
5501 }
5502 if (end == NULL) {
5503 _PyBytesWriter_Dealloc(&writer);
5504 return -1;
5505 }
5506
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005507 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005508 PyBytes_AS_STRING(writer.buffer);
5509 Py_ssize_t len = end - start;
5510
5511 char *cache = PyObject_MALLOC(len + 1);
5512 if (cache == NULL) {
5513 _PyBytesWriter_Dealloc(&writer);
5514 PyErr_NoMemory();
5515 return -1;
5516 }
5517 _PyUnicode_UTF8(unicode) = cache;
5518 _PyUnicode_UTF8_LENGTH(unicode) = len;
5519 memcpy(cache, start, len);
5520 cache[len] = '\0';
5521 _PyBytesWriter_Dealloc(&writer);
5522 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523}
5524
Alexander Belopolsky40018472011-02-26 01:02:56 +00005525PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005526_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5527{
5528 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5529}
5530
5531
5532PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005533PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5534 Py_ssize_t size,
5535 const char *errors)
5536{
5537 PyObject *v, *unicode;
5538
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005539 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005540 if (unicode == NULL)
5541 return NULL;
5542 v = _PyUnicode_AsUTF8String(unicode, errors);
5543 Py_DECREF(unicode);
5544 return v;
5545}
5546
5547PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005548PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005550 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551}
5552
Walter Dörwald41980ca2007-08-16 21:55:45 +00005553/* --- UTF-32 Codec ------------------------------------------------------- */
5554
5555PyObject *
5556PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 Py_ssize_t size,
5558 const char *errors,
5559 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005560{
5561 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5562}
5563
5564PyObject *
5565PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005566 Py_ssize_t size,
5567 const char *errors,
5568 int *byteorder,
5569 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005570{
5571 const char *starts = s;
5572 Py_ssize_t startinpos;
5573 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005574 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005575 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005576 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005577 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005578 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005579 PyObject *errorHandler = NULL;
5580 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005581
Andy Lestere6be9b52020-02-11 20:28:35 -06005582 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005583 e = q + size;
5584
5585 if (byteorder)
5586 bo = *byteorder;
5587
5588 /* Check for BOM marks (U+FEFF) in the input and adjust current
5589 byte order setting accordingly. In native mode, the leading BOM
5590 mark is skipped, in all other modes, it is copied to the output
5591 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005592 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005593 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005594 if (bom == 0x0000FEFF) {
5595 bo = -1;
5596 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005598 else if (bom == 0xFFFE0000) {
5599 bo = 1;
5600 q += 4;
5601 }
5602 if (byteorder)
5603 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005604 }
5605
Victor Stinnere64322e2012-10-30 23:12:47 +01005606 if (q == e) {
5607 if (consumed)
5608 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005609 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005610 }
5611
Victor Stinnere64322e2012-10-30 23:12:47 +01005612#ifdef WORDS_BIGENDIAN
5613 le = bo < 0;
5614#else
5615 le = bo <= 0;
5616#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005617 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005618
Victor Stinner8f674cc2013-04-17 23:02:17 +02005619 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005620 writer.min_length = (e - q + 3) / 4;
5621 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005622 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005623
Victor Stinnere64322e2012-10-30 23:12:47 +01005624 while (1) {
5625 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005626 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005627
Victor Stinnere64322e2012-10-30 23:12:47 +01005628 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 enum PyUnicode_Kind kind = writer.kind;
5630 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005631 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005632 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005633 if (le) {
5634 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005635 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005636 if (ch > maxch)
5637 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005638 if (kind != PyUnicode_1BYTE_KIND &&
5639 Py_UNICODE_IS_SURROGATE(ch))
5640 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005641 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005642 q += 4;
5643 } while (q <= last);
5644 }
5645 else {
5646 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005647 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005648 if (ch > maxch)
5649 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005650 if (kind != PyUnicode_1BYTE_KIND &&
5651 Py_UNICODE_IS_SURROGATE(ch))
5652 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005653 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005654 q += 4;
5655 } while (q <= last);
5656 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005657 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005658 }
5659
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005660 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005661 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005662 startinpos = ((const char *)q) - starts;
5663 endinpos = startinpos + 4;
5664 }
5665 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005666 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005668 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005670 startinpos = ((const char *)q) - starts;
5671 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005673 else {
5674 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005675 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005676 goto onError;
5677 q += 4;
5678 continue;
5679 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005680 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005681 startinpos = ((const char *)q) - starts;
5682 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005684
5685 /* The remaining input chars are ignored if the callback
5686 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005687 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005689 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005691 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005693 }
5694
Walter Dörwald41980ca2007-08-16 21:55:45 +00005695 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005697
Walter Dörwald41980ca2007-08-16 21:55:45 +00005698 Py_XDECREF(errorHandler);
5699 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005700 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005701
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005703 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005704 Py_XDECREF(errorHandler);
5705 Py_XDECREF(exc);
5706 return NULL;
5707}
5708
5709PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005710_PyUnicode_EncodeUTF32(PyObject *str,
5711 const char *errors,
5712 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005713{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005714 enum PyUnicode_Kind kind;
5715 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005716 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005717 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005718 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005719#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005720 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005721#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005722 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005723#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005724 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005725 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005726 PyObject *errorHandler = NULL;
5727 PyObject *exc = NULL;
5728 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005729
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005730 if (!PyUnicode_Check(str)) {
5731 PyErr_BadArgument();
5732 return NULL;
5733 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005734 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005735 return NULL;
5736 kind = PyUnicode_KIND(str);
5737 data = PyUnicode_DATA(str);
5738 len = PyUnicode_GET_LENGTH(str);
5739
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005740 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005741 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005742 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005743 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005744 if (v == NULL)
5745 return NULL;
5746
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005747 /* output buffer is 4-bytes aligned */
5748 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005749 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005750 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005751 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005752 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005753 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005754
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005755 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005756 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005757 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005758 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005759 else
5760 encoding = "utf-32";
5761
5762 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005763 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5764 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005765 }
5766
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005767 pos = 0;
5768 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005769 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005770
5771 if (kind == PyUnicode_2BYTE_KIND) {
5772 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5773 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005774 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005775 else {
5776 assert(kind == PyUnicode_4BYTE_KIND);
5777 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5778 &out, native_ordering);
5779 }
5780 if (pos == len)
5781 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005782
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005783 rep = unicode_encode_call_errorhandler(
5784 errors, &errorHandler,
5785 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005786 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005787 if (!rep)
5788 goto error;
5789
5790 if (PyBytes_Check(rep)) {
5791 repsize = PyBytes_GET_SIZE(rep);
5792 if (repsize & 3) {
5793 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005794 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005795 "surrogates not allowed");
5796 goto error;
5797 }
5798 moreunits = repsize / 4;
5799 }
5800 else {
5801 assert(PyUnicode_Check(rep));
5802 if (PyUnicode_READY(rep) < 0)
5803 goto error;
5804 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5805 if (!PyUnicode_IS_ASCII(rep)) {
5806 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005807 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005808 "surrogates not allowed");
5809 goto error;
5810 }
5811 }
5812
5813 /* four bytes are reserved for each surrogate */
5814 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005815 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005816 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005817 /* integer overflow */
5818 PyErr_NoMemory();
5819 goto error;
5820 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005821 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005822 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005823 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005824 }
5825
5826 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005827 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005828 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005829 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005830 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005831 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5832 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005833 }
5834
5835 Py_CLEAR(rep);
5836 }
5837
5838 /* Cut back to size actually needed. This is necessary for, for example,
5839 encoding of a string containing isolated surrogates and the 'ignore'
5840 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005841 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005842 if (nsize != PyBytes_GET_SIZE(v))
5843 _PyBytes_Resize(&v, nsize);
5844 Py_XDECREF(errorHandler);
5845 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005846 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005847 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005848 error:
5849 Py_XDECREF(rep);
5850 Py_XDECREF(errorHandler);
5851 Py_XDECREF(exc);
5852 Py_XDECREF(v);
5853 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005854}
5855
Alexander Belopolsky40018472011-02-26 01:02:56 +00005856PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005857PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5858 Py_ssize_t size,
5859 const char *errors,
5860 int byteorder)
5861{
5862 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005863 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005864 if (tmp == NULL)
5865 return NULL;
5866 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5867 Py_DECREF(tmp);
5868 return result;
5869}
5870
5871PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005872PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005873{
Victor Stinnerb960b342011-11-20 19:12:52 +01005874 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005875}
5876
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877/* --- UTF-16 Codec ------------------------------------------------------- */
5878
Tim Peters772747b2001-08-09 22:21:55 +00005879PyObject *
5880PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 Py_ssize_t size,
5882 const char *errors,
5883 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884{
Walter Dörwald69652032004-09-07 20:24:22 +00005885 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5886}
5887
5888PyObject *
5889PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005890 Py_ssize_t size,
5891 const char *errors,
5892 int *byteorder,
5893 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005894{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005896 Py_ssize_t startinpos;
5897 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005898 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005899 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005900 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005901 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005902 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005903 PyObject *errorHandler = NULL;
5904 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005905 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906
Andy Lestere6be9b52020-02-11 20:28:35 -06005907 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005908 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909
5910 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005911 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005913 /* Check for BOM marks (U+FEFF) in the input and adjust current
5914 byte order setting accordingly. In native mode, the leading BOM
5915 mark is skipped, in all other modes, it is copied to the output
5916 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005917 if (bo == 0 && size >= 2) {
5918 const Py_UCS4 bom = (q[1] << 8) | q[0];
5919 if (bom == 0xFEFF) {
5920 q += 2;
5921 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005923 else if (bom == 0xFFFE) {
5924 q += 2;
5925 bo = 1;
5926 }
5927 if (byteorder)
5928 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930
Antoine Pitrou63065d72012-05-15 23:48:04 +02005931 if (q == e) {
5932 if (consumed)
5933 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005934 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005935 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005936
Christian Heimes743e0cd2012-10-17 23:52:17 +02005937#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005938 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005939 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005940#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005941 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005942 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005943#endif
Tim Peters772747b2001-08-09 22:21:55 +00005944
Antoine Pitrou63065d72012-05-15 23:48:04 +02005945 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005946 character count normally. Error handler will take care of
5947 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005948 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005949 writer.min_length = (e - q + 1) / 2;
5950 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005951 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005952
Antoine Pitrou63065d72012-05-15 23:48:04 +02005953 while (1) {
5954 Py_UCS4 ch = 0;
5955 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005956 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005957 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005958 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005959 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005960 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005961 native_ordering);
5962 else
5963 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005964 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005965 native_ordering);
5966 } else if (kind == PyUnicode_2BYTE_KIND) {
5967 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005968 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005969 native_ordering);
5970 } else {
5971 assert(kind == PyUnicode_4BYTE_KIND);
5972 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005973 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005974 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005975 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005976 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005977
Antoine Pitrou63065d72012-05-15 23:48:04 +02005978 switch (ch)
5979 {
5980 case 0:
5981 /* remaining byte at the end? (size should be even) */
5982 if (q == e || consumed)
5983 goto End;
5984 errmsg = "truncated data";
5985 startinpos = ((const char *)q) - starts;
5986 endinpos = ((const char *)e) - starts;
5987 break;
5988 /* The remaining input chars are ignored if the callback
5989 chooses to skip the input */
5990 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005991 q -= 2;
5992 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005993 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005994 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005995 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005996 endinpos = ((const char *)e) - starts;
5997 break;
5998 case 2:
5999 errmsg = "illegal encoding";
6000 startinpos = ((const char *)q) - 2 - starts;
6001 endinpos = startinpos + 2;
6002 break;
6003 case 3:
6004 errmsg = "illegal UTF-16 surrogate";
6005 startinpos = ((const char *)q) - 4 - starts;
6006 endinpos = startinpos + 2;
6007 break;
6008 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006009 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006010 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 continue;
6012 }
6013
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006014 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006015 errors,
6016 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006017 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006018 &starts,
6019 (const char **)&e,
6020 &startinpos,
6021 &endinpos,
6022 &exc,
6023 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006024 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 }
6027
Antoine Pitrou63065d72012-05-15 23:48:04 +02006028End:
Walter Dörwald69652032004-09-07 20:24:22 +00006029 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006031
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006032 Py_XDECREF(errorHandler);
6033 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006034 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006037 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006038 Py_XDECREF(errorHandler);
6039 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 return NULL;
6041}
6042
Tim Peters772747b2001-08-09 22:21:55 +00006043PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006044_PyUnicode_EncodeUTF16(PyObject *str,
6045 const char *errors,
6046 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006048 enum PyUnicode_Kind kind;
6049 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006050 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006051 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006052 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006053 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006054#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006055 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006056#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006057 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006058#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006059 const char *encoding;
6060 Py_ssize_t nsize, pos;
6061 PyObject *errorHandler = NULL;
6062 PyObject *exc = NULL;
6063 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006064
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006065 if (!PyUnicode_Check(str)) {
6066 PyErr_BadArgument();
6067 return NULL;
6068 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006069 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006070 return NULL;
6071 kind = PyUnicode_KIND(str);
6072 data = PyUnicode_DATA(str);
6073 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006074
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006075 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006076 if (kind == PyUnicode_4BYTE_KIND) {
6077 const Py_UCS4 *in = (const Py_UCS4 *)data;
6078 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006079 while (in < end) {
6080 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006081 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006082 }
6083 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006084 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006085 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006087 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006088 nsize = len + pairs + (byteorder == 0);
6089 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006090 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006092 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006094 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006095 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006096 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006097 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006098 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006099 }
6100 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006101 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006102 }
Tim Peters772747b2001-08-09 22:21:55 +00006103
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006104 if (kind == PyUnicode_1BYTE_KIND) {
6105 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6106 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006107 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006108
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006109 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006110 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006111 }
6112 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006113 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006114 }
6115 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006116 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006117 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006118
6119 pos = 0;
6120 while (pos < len) {
6121 Py_ssize_t repsize, moreunits;
6122
6123 if (kind == PyUnicode_2BYTE_KIND) {
6124 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6125 &out, native_ordering);
6126 }
6127 else {
6128 assert(kind == PyUnicode_4BYTE_KIND);
6129 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6130 &out, native_ordering);
6131 }
6132 if (pos == len)
6133 break;
6134
6135 rep = unicode_encode_call_errorhandler(
6136 errors, &errorHandler,
6137 encoding, "surrogates not allowed",
6138 str, &exc, pos, pos + 1, &pos);
6139 if (!rep)
6140 goto error;
6141
6142 if (PyBytes_Check(rep)) {
6143 repsize = PyBytes_GET_SIZE(rep);
6144 if (repsize & 1) {
6145 raise_encode_exception(&exc, encoding,
6146 str, pos - 1, pos,
6147 "surrogates not allowed");
6148 goto error;
6149 }
6150 moreunits = repsize / 2;
6151 }
6152 else {
6153 assert(PyUnicode_Check(rep));
6154 if (PyUnicode_READY(rep) < 0)
6155 goto error;
6156 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6157 if (!PyUnicode_IS_ASCII(rep)) {
6158 raise_encode_exception(&exc, encoding,
6159 str, pos - 1, pos,
6160 "surrogates not allowed");
6161 goto error;
6162 }
6163 }
6164
6165 /* two bytes are reserved for each surrogate */
6166 if (moreunits > 1) {
6167 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006168 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006169 /* integer overflow */
6170 PyErr_NoMemory();
6171 goto error;
6172 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006173 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006174 goto error;
6175 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6176 }
6177
6178 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006179 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006180 out += moreunits;
6181 } else /* rep is unicode */ {
6182 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6183 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6184 &out, native_ordering);
6185 }
6186
6187 Py_CLEAR(rep);
6188 }
6189
6190 /* Cut back to size actually needed. This is necessary for, for example,
6191 encoding of a string containing isolated surrogates and the 'ignore' handler
6192 is used. */
6193 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6194 if (nsize != PyBytes_GET_SIZE(v))
6195 _PyBytes_Resize(&v, nsize);
6196 Py_XDECREF(errorHandler);
6197 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006198 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006199 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006200 error:
6201 Py_XDECREF(rep);
6202 Py_XDECREF(errorHandler);
6203 Py_XDECREF(exc);
6204 Py_XDECREF(v);
6205 return NULL;
6206#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207}
6208
Alexander Belopolsky40018472011-02-26 01:02:56 +00006209PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006210PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6211 Py_ssize_t size,
6212 const char *errors,
6213 int byteorder)
6214{
6215 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006216 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006217 if (tmp == NULL)
6218 return NULL;
6219 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6220 Py_DECREF(tmp);
6221 return result;
6222}
6223
6224PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006225PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006227 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228}
6229
6230/* --- Unicode Escape Codec ----------------------------------------------- */
6231
Fredrik Lundh06d12682001-01-24 07:59:11 +00006232static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006233
Alexander Belopolsky40018472011-02-26 01:02:56 +00006234PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006235_PyUnicode_DecodeUnicodeEscape(const char *s,
6236 Py_ssize_t size,
6237 const char *errors,
6238 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006240 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006241 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006243 PyObject *errorHandler = NULL;
6244 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006245
Eric V. Smith42454af2016-10-31 09:22:08 -04006246 // so we can remember if we've seen an invalid escape char or not
6247 *first_invalid_escape = NULL;
6248
Victor Stinner62ec3312016-09-06 17:04:34 -07006249 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006250 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 }
6252 /* Escaped strings will always be longer than the resulting
6253 Unicode string, so we start with size here and then reduce the
6254 length after conversion to the true value.
6255 (but if the error callback returns a long replacement string
6256 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006257 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006258 writer.min_length = size;
6259 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6260 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006261 }
6262
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 end = s + size;
6264 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 unsigned char c = (unsigned char) *s++;
6266 Py_UCS4 ch;
6267 int count;
6268 Py_ssize_t startinpos;
6269 Py_ssize_t endinpos;
6270 const char *message;
6271
6272#define WRITE_ASCII_CHAR(ch) \
6273 do { \
6274 assert(ch <= 127); \
6275 assert(writer.pos < writer.size); \
6276 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6277 } while(0)
6278
6279#define WRITE_CHAR(ch) \
6280 do { \
6281 if (ch <= writer.maxchar) { \
6282 assert(writer.pos < writer.size); \
6283 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6284 } \
6285 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6286 goto onError; \
6287 } \
6288 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289
6290 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006291 if (c != '\\') {
6292 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293 continue;
6294 }
6295
Victor Stinner62ec3312016-09-06 17:04:34 -07006296 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 if (s >= end) {
6299 message = "\\ at end of string";
6300 goto error;
6301 }
6302 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006303
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006305 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006308 case '\n': continue;
6309 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6310 case '\'': WRITE_ASCII_CHAR('\''); continue;
6311 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6312 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006313 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006314 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6315 case 't': WRITE_ASCII_CHAR('\t'); continue;
6316 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6317 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006318 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006320 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006321 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 case '0': case '1': case '2': case '3':
6325 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006326 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006327 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006328 ch = (ch<<3) + *s++ - '0';
6329 if (s < end && '0' <= *s && *s <= '7') {
6330 ch = (ch<<3) + *s++ - '0';
6331 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006333 WRITE_CHAR(ch);
6334 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335
Benjamin Peterson29060642009-01-31 22:14:21 +00006336 /* hex escapes */
6337 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006339 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006340 message = "truncated \\xXX escape";
6341 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006346 message = "truncated \\uXXXX escape";
6347 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006350 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006351 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006352 message = "truncated \\UXXXXXXXX escape";
6353 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006354 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006355 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006356 ch <<= 4;
6357 if (c >= '0' && c <= '9') {
6358 ch += c - '0';
6359 }
6360 else if (c >= 'a' && c <= 'f') {
6361 ch += c - ('a' - 10);
6362 }
6363 else if (c >= 'A' && c <= 'F') {
6364 ch += c - ('A' - 10);
6365 }
6366 else {
6367 break;
6368 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006369 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006370 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006371 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006372 }
6373
6374 /* when we get here, ch is a 32-bit unicode character */
6375 if (ch > MAX_UNICODE) {
6376 message = "illegal Unicode character";
6377 goto error;
6378 }
6379
6380 WRITE_CHAR(ch);
6381 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006382
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006384 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006385 if (ucnhash_CAPI == NULL) {
6386 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006387 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6388 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006389 if (ucnhash_CAPI == NULL) {
6390 PyErr_SetString(
6391 PyExc_UnicodeError,
6392 "\\N escapes not supported (can't load unicodedata module)"
6393 );
6394 goto onError;
6395 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006396 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006397
6398 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006399 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006400 const char *start = ++s;
6401 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006402 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006403 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006404 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006405 namelen = s - start;
6406 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006407 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006408 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006409 ch = 0xffffffff; /* in case 'getcode' messes up */
6410 if (namelen <= INT_MAX &&
6411 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6412 &ch, 0)) {
6413 assert(ch <= MAX_UNICODE);
6414 WRITE_CHAR(ch);
6415 continue;
6416 }
6417 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006418 }
6419 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006420 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006421
6422 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006423 if (*first_invalid_escape == NULL) {
6424 *first_invalid_escape = s-1; /* Back up one char, since we've
6425 already incremented s. */
6426 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 WRITE_ASCII_CHAR('\\');
6428 WRITE_CHAR(c);
6429 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006431
6432 error:
6433 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006434 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006435 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006436 errors, &errorHandler,
6437 "unicodeescape", message,
6438 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006439 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006440 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006441 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006442 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006443
6444#undef WRITE_ASCII_CHAR
6445#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006447
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006448 Py_XDECREF(errorHandler);
6449 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006450 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006451
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006453 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006454 Py_XDECREF(errorHandler);
6455 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 return NULL;
6457}
6458
Eric V. Smith42454af2016-10-31 09:22:08 -04006459PyObject *
6460PyUnicode_DecodeUnicodeEscape(const char *s,
6461 Py_ssize_t size,
6462 const char *errors)
6463{
6464 const char *first_invalid_escape;
6465 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6466 &first_invalid_escape);
6467 if (result == NULL)
6468 return NULL;
6469 if (first_invalid_escape != NULL) {
6470 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6471 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006472 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006473 Py_DECREF(result);
6474 return NULL;
6475 }
6476 }
6477 return result;
6478}
6479
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006480/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481
Alexander Belopolsky40018472011-02-26 01:02:56 +00006482PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006483PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006485 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006486 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006488 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006489 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491
Ezio Melottie7f90372012-10-05 03:33:31 +03006492 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006493 escape.
6494
Ezio Melottie7f90372012-10-05 03:33:31 +03006495 For UCS1 strings it's '\xxx', 4 bytes per source character.
6496 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6497 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006498 */
6499
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006500 if (!PyUnicode_Check(unicode)) {
6501 PyErr_BadArgument();
6502 return NULL;
6503 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006504 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006505 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006506 }
Victor Stinner358af132015-10-12 22:36:57 +02006507
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006508 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006509 if (len == 0) {
6510 return PyBytes_FromStringAndSize(NULL, 0);
6511 }
6512
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006513 kind = PyUnicode_KIND(unicode);
6514 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006515 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6516 bytes, and 1 byte characters 4. */
6517 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006518 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006519 return PyErr_NoMemory();
6520 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006521 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006522 if (repr == NULL) {
6523 return NULL;
6524 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006525
Victor Stinner62ec3312016-09-06 17:04:34 -07006526 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006527 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006528 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006529
Victor Stinner62ec3312016-09-06 17:04:34 -07006530 /* U+0000-U+00ff range */
6531 if (ch < 0x100) {
6532 if (ch >= ' ' && ch < 127) {
6533 if (ch != '\\') {
6534 /* Copy printable US ASCII as-is */
6535 *p++ = (char) ch;
6536 }
6537 /* Escape backslashes */
6538 else {
6539 *p++ = '\\';
6540 *p++ = '\\';
6541 }
6542 }
Victor Stinner358af132015-10-12 22:36:57 +02006543
Victor Stinner62ec3312016-09-06 17:04:34 -07006544 /* Map special whitespace to '\t', \n', '\r' */
6545 else if (ch == '\t') {
6546 *p++ = '\\';
6547 *p++ = 't';
6548 }
6549 else if (ch == '\n') {
6550 *p++ = '\\';
6551 *p++ = 'n';
6552 }
6553 else if (ch == '\r') {
6554 *p++ = '\\';
6555 *p++ = 'r';
6556 }
6557
6558 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6559 else {
6560 *p++ = '\\';
6561 *p++ = 'x';
6562 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6563 *p++ = Py_hexdigits[ch & 0x000F];
6564 }
Tim Petersced69f82003-09-16 20:30:58 +00006565 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006566 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006567 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 *p++ = '\\';
6569 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006570 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6571 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6572 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6573 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006575 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6576 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006577
Victor Stinner62ec3312016-09-06 17:04:34 -07006578 /* Make sure that the first two digits are zero */
6579 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006580 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006581 *p++ = 'U';
6582 *p++ = '0';
6583 *p++ = '0';
6584 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6585 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6586 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6587 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6588 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6589 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592
Victor Stinner62ec3312016-09-06 17:04:34 -07006593 assert(p - PyBytes_AS_STRING(repr) > 0);
6594 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6595 return NULL;
6596 }
6597 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598}
6599
Alexander Belopolsky40018472011-02-26 01:02:56 +00006600PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006601PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6602 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006604 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006605 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006606 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006608 }
6609
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006610 result = PyUnicode_AsUnicodeEscapeString(tmp);
6611 Py_DECREF(tmp);
6612 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613}
6614
6615/* --- Raw Unicode Escape Codec ------------------------------------------- */
6616
Alexander Belopolsky40018472011-02-26 01:02:56 +00006617PyObject *
6618PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006619 Py_ssize_t size,
6620 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006623 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006625 PyObject *errorHandler = NULL;
6626 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006627
Victor Stinner62ec3312016-09-06 17:04:34 -07006628 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006629 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006630 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006631
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 /* Escaped strings will always be longer than the resulting
6633 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006634 length after conversion to the true value. (But decoding error
6635 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006636 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006637 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006638 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6639 goto onError;
6640 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006641
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 end = s + size;
6643 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006644 unsigned char c = (unsigned char) *s++;
6645 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006646 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006647 Py_ssize_t startinpos;
6648 Py_ssize_t endinpos;
6649 const char *message;
6650
6651#define WRITE_CHAR(ch) \
6652 do { \
6653 if (ch <= writer.maxchar) { \
6654 assert(writer.pos < writer.size); \
6655 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6656 } \
6657 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6658 goto onError; \
6659 } \
6660 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006663 if (c != '\\' || s >= end) {
6664 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006665 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006666 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006667
Victor Stinner62ec3312016-09-06 17:04:34 -07006668 c = (unsigned char) *s++;
6669 if (c == 'u') {
6670 count = 4;
6671 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006673 else if (c == 'U') {
6674 count = 8;
6675 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006676 }
6677 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006678 assert(writer.pos < writer.size);
6679 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6680 WRITE_CHAR(c);
6681 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006682 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006683 startinpos = s - starts - 2;
6684
6685 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6686 for (ch = 0; count && s < end; ++s, --count) {
6687 c = (unsigned char)*s;
6688 ch <<= 4;
6689 if (c >= '0' && c <= '9') {
6690 ch += c - '0';
6691 }
6692 else if (c >= 'a' && c <= 'f') {
6693 ch += c - ('a' - 10);
6694 }
6695 else if (c >= 'A' && c <= 'F') {
6696 ch += c - ('A' - 10);
6697 }
6698 else {
6699 break;
6700 }
6701 }
6702 if (!count) {
6703 if (ch <= MAX_UNICODE) {
6704 WRITE_CHAR(ch);
6705 continue;
6706 }
6707 message = "\\Uxxxxxxxx out of range";
6708 }
6709
6710 endinpos = s-starts;
6711 writer.min_length = end - s + writer.pos;
6712 if (unicode_decode_call_errorhandler_writer(
6713 errors, &errorHandler,
6714 "rawunicodeescape", message,
6715 &starts, &end, &startinpos, &endinpos, &exc, &s,
6716 &writer)) {
6717 goto onError;
6718 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006719 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006720
6721#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 Py_XDECREF(errorHandler);
6724 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006725 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006726
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006728 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006729 Py_XDECREF(errorHandler);
6730 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006732
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733}
6734
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006735
Alexander Belopolsky40018472011-02-26 01:02:56 +00006736PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006737PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738{
Victor Stinner62ec3312016-09-06 17:04:34 -07006739 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006741 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006742 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006743 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006744 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006746 if (!PyUnicode_Check(unicode)) {
6747 PyErr_BadArgument();
6748 return NULL;
6749 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006750 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006751 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006752 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006753 kind = PyUnicode_KIND(unicode);
6754 data = PyUnicode_DATA(unicode);
6755 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006756 if (kind == PyUnicode_1BYTE_KIND) {
6757 return PyBytes_FromStringAndSize(data, len);
6758 }
Victor Stinner0e368262011-11-10 20:12:49 +01006759
Victor Stinner62ec3312016-09-06 17:04:34 -07006760 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6761 bytes, and 1 byte characters 4. */
6762 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006763
Victor Stinner62ec3312016-09-06 17:04:34 -07006764 if (len > PY_SSIZE_T_MAX / expandsize) {
6765 return PyErr_NoMemory();
6766 }
6767 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6768 if (repr == NULL) {
6769 return NULL;
6770 }
6771 if (len == 0) {
6772 return repr;
6773 }
6774
6775 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006776 for (pos = 0; pos < len; pos++) {
6777 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006778
Victor Stinner62ec3312016-09-06 17:04:34 -07006779 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6780 if (ch < 0x100) {
6781 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006782 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006783 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006784 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 *p++ = '\\';
6786 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006787 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6788 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6789 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6790 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006792 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6793 else {
6794 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6795 *p++ = '\\';
6796 *p++ = 'U';
6797 *p++ = '0';
6798 *p++ = '0';
6799 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6800 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6801 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6802 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6803 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6804 *p++ = Py_hexdigits[ch & 15];
6805 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006807
Victor Stinner62ec3312016-09-06 17:04:34 -07006808 assert(p > PyBytes_AS_STRING(repr));
6809 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6810 return NULL;
6811 }
6812 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813}
6814
Alexander Belopolsky40018472011-02-26 01:02:56 +00006815PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006816PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6817 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006819 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006820 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006821 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006822 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006823 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6824 Py_DECREF(tmp);
6825 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826}
6827
6828/* --- Latin-1 Codec ------------------------------------------------------ */
6829
Alexander Belopolsky40018472011-02-26 01:02:56 +00006830PyObject *
6831PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006832 Py_ssize_t size,
6833 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006836 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837}
6838
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006839/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006840static void
6841make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006842 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006843 PyObject *unicode,
6844 Py_ssize_t startpos, Py_ssize_t endpos,
6845 const char *reason)
6846{
6847 if (*exceptionObject == NULL) {
6848 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006849 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006850 encoding, unicode, startpos, endpos, reason);
6851 }
6852 else {
6853 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6854 goto onError;
6855 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6856 goto onError;
6857 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6858 goto onError;
6859 return;
6860 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006861 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006862 }
6863}
6864
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006865/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006866static void
6867raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006868 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006869 PyObject *unicode,
6870 Py_ssize_t startpos, Py_ssize_t endpos,
6871 const char *reason)
6872{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006873 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006874 encoding, unicode, startpos, endpos, reason);
6875 if (*exceptionObject != NULL)
6876 PyCodec_StrictErrors(*exceptionObject);
6877}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006878
6879/* error handling callback helper:
6880 build arguments, call the callback and check the arguments,
6881 put the result into newpos and return the replacement string, which
6882 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006883static PyObject *
6884unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006885 PyObject **errorHandler,
6886 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006887 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006888 Py_ssize_t startpos, Py_ssize_t endpos,
6889 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006890{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006891 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006892 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006893 PyObject *restuple;
6894 PyObject *resunicode;
6895
6896 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006900 }
6901
Benjamin Petersonbac79492012-01-14 13:34:47 -05006902 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006903 return NULL;
6904 len = PyUnicode_GET_LENGTH(unicode);
6905
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006906 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006907 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006908 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006909 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006910
Petr Viktorinffd97532020-02-11 17:46:57 +01006911 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006912 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006913 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006914 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006915 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006916 Py_DECREF(restuple);
6917 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006918 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006919 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 &resunicode, newpos)) {
6921 Py_DECREF(restuple);
6922 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006924 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6925 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6926 Py_DECREF(restuple);
6927 return NULL;
6928 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006929 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006930 *newpos = len + *newpos;
6931 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006932 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 Py_DECREF(restuple);
6934 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006935 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006936 Py_INCREF(resunicode);
6937 Py_DECREF(restuple);
6938 return resunicode;
6939}
6940
/* Encode `unicode` into a bytes object, one output byte per encodable
   character.

   unicode: the string to encode; it is made ready with PyUnicode_READY().
   errors:  name of the error handler; it is resolved with
            _Py_GetErrorHandler() only when the first unencodable
            character is met.
   limit:   exclusive upper bound of the code points that are copied
            straight to the output.  256 makes error reports name the
            codec "latin-1"; any other value names it "ascii" (the callers
            visible in this file pass 256 or 128).

   Returns the object produced by _PyBytesWriter_Finish(), an empty bytes
   object for an empty string, or NULL on failure. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006941static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006942unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006943 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006944 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006945{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006946 /* input state */
6947 Py_ssize_t pos=0, size;
6948 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006949 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006950 /* pointer into the output */
6951 char *str;
/* codec name and message used when a UnicodeEncodeError is built */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006952 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6953 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
/* error handling state; all of it is created lazily on the first error */
Victor Stinner50149202015-09-22 00:26:54 +02006954 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006955 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006956 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
/* replacement returned by a custom error handler (owned reference) */
Victor Stinner6bd525b2015-10-09 13:10:05 +02006957 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006958 /* output object */
6959 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006960
Benjamin Petersonbac79492012-01-14 13:34:47 -05006961 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006962 return NULL;
6963 size = PyUnicode_GET_LENGTH(unicode);
6964 kind = PyUnicode_KIND(unicode);
6965 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006966 /* allocate enough for a simple encoding without
6967 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006968 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006969 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006970
6971 _PyBytesWriter_Init(&writer);
6972 str = _PyBytesWriter_Alloc(&writer, size);
6973 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006974 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006975
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006976 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006977 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006978
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006980 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006981 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006982 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006983 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006984 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006986 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006987 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006988 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006989 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006990 /* find all unencodable characters: extend [collstart, collend) over
   the whole run of consecutive characters that are >= limit */
Victor Stinner50149202015-09-22 00:26:54 +02006991
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006992 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006994
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006995 /* Only overallocate the buffer if it's not the last write */
6996 writer.overallocate = (collend < size);
6997
Benjamin Peterson29060642009-01-31 22:14:21 +00006998 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006999 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007000 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007001
7002 switch (error_handler) {
7003 case _Py_ERROR_STRICT:
/* report the whole run as one UnicodeEncodeError and give up */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007004 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007006
7007 case _Py_ERROR_REPLACE:
/* one '?' per unencodable character; this needs no more room than the
   characters it replaces */
Victor Stinner01ada392015-10-01 21:54:51 +02007008 memset(str, '?', collend - collstart);
7009 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007010 /* fall through: the shared code below skips the input run */
Victor Stinner50149202015-09-22 00:26:54 +02007011 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007012 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 break;
Victor Stinner50149202015-09-22 00:26:54 +02007014
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007015 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007016 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007017 writer.min_size -= (collend - collstart);
/* backslashreplace() returns the updated output pointer, NULL on error */
7018 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007019 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007020 if (str == NULL)
7021 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007022 pos = collend;
7023 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007024
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007025 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007026 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007027 writer.min_size -= (collend - collstart);
/* xmlcharrefreplace() returns the updated output pointer, NULL on error */
7028 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007029 unicode, collstart, collend);
7030 if (str == NULL)
7031 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007032 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007033 break;
Victor Stinner50149202015-09-22 00:26:54 +02007034
Victor Stinnerc3713e92015-09-29 12:32:13 +02007035 case _Py_ERROR_SURROGATEESCAPE:
/* characters in U+DC80..U+DCFF are written as the single byte
   ch - 0xdc00; the loop stops at the first character outside that range */
7036 for (i = collstart; i < collend; ++i) {
7037 ch = PyUnicode_READ(kind, data, i);
7038 if (ch < 0xdc80 || 0xdcff < ch) {
7039 /* Not a UTF-8b surrogate */
7040 break;
7041 }
7042 *str++ = (char)(ch - 0xdc00);
7043 ++pos;
7044 }
/* the whole run was escaped: nothing is left for the generic handler */
7045 if (i >= collend)
7046 break;
/* otherwise hand the rest of the run, starting at the offending
   character, to the generic handler below */
7047 collstart = pos;
7048 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007049 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007050
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 default:
/* any other handler: call it; it returns a replacement (str or bytes)
   and sets newpos to the position where encoding resumes */
Victor Stinner6bd525b2015-10-09 13:10:05 +02007052 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7053 encoding, reason, unicode, &exc,
7054 collstart, collend, &newpos);
7055 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007057
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007058 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007059 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007060
Victor Stinner6bd525b2015-10-09 13:10:05 +02007061 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007062 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007063 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007064 PyBytes_AS_STRING(rep),
7065 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007066 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007067 else {
7068 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007069
Victor Stinner6bd525b2015-10-09 13:10:05 +02007070 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007071 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007072
/* a str replacement is not re-encoded: it is accepted only when it is
   stored as 1-byte data (limit 256) or is pure ASCII (other limits) */
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007073 if (limit == 256 ?
7074 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7075 !PyUnicode_IS_ASCII(rep))
7076 {
7077 /* Not all characters are smaller than limit */
7078 raise_encode_exception(&exc, encoding, unicode,
7079 collstart, collend, reason);
7080 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007081 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007082 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
/* the replacement's character data is copied to the output as-is */
7083 str = _PyBytesWriter_WriteBytes(&writer, str,
7084 PyUnicode_DATA(rep),
7085 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 }
/* checks the result of whichever _PyBytesWriter_WriteBytes() call ran */
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007087 if (str == NULL)
7088 goto onError;
7089
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007090 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007091 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007092 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007093
7094 /* If overallocation was disabled, ensure that it was the last
7095 write. Otherwise, we missed an optimization */
7096 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007097 }
7098 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007099
/* success: drop the error handling state and build the result */
Victor Stinner50149202015-09-22 00:26:54 +02007100 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007101 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007102 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007103
7104 onError:
/* failure: release a pending replacement, the writer and the error
   handling state */
Victor Stinner6bd525b2015-10-09 13:10:05 +02007105 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007106 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007107 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007108 Py_XDECREF(exc);
7109 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007110}
7111
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007112/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007113PyObject *
7114PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007115 Py_ssize_t size,
7116 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007118 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007119 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007120 if (unicode == NULL)
7121 return NULL;
7122 result = unicode_encode_ucs1(unicode, errors, 256);
7123 Py_DECREF(unicode);
7124 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125}
7126
Alexander Belopolsky40018472011-02-26 01:02:56 +00007127PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007128_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129{
7130 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 PyErr_BadArgument();
7132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007134 if (PyUnicode_READY(unicode) == -1)
7135 return NULL;
7136 /* Fast path: if it is a one-byte string, construct
7137 bytes object directly. */
7138 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7139 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7140 PyUnicode_GET_LENGTH(unicode));
7141 /* Non-Latin-1 characters present. Defer to above function to
7142 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007143 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007144}
7145
7146PyObject*
7147PyUnicode_AsLatin1String(PyObject *unicode)
7148{
7149 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150}
7151
7152/* --- 7-bit ASCII Codec -------------------------------------------------- */
7153
/* Decode `size` bytes starting at `s` as 7-bit ASCII.

   s:      input bytes.
   size:   number of input bytes; 0 yields the empty string.
   errors: name of the error handler applied to bytes >= 128; it is
           resolved with _Py_GetErrorHandler() when the first such byte
           is met.

   Returns a str object, or NULL on failure. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007154PyObject *
7155PyUnicode_DecodeASCII(const char *s,
7156 Py_ssize_t size,
7157 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158{
/* starts/e delimit the input; both are passed by address to the error
   handler helper further down */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007159 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007160 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007161 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007162 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007163 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007164
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007166 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007167
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
/* a single ASCII byte is served by get_latin1_char() */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007169 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007170 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007171 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007172
Inada Naoki770847a2019-06-24 12:30:24 +09007173 // Shortcut for simple case: decode into a string sized for the whole input
7174 PyObject *u = PyUnicode_New(size, 127);
7175 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007176 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007177 }
/* NOTE(review): ascii_decode() presumably stops at the first byte >= 128
   and returns how many bytes it decoded -- confirm against its definition */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007178 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007179 if (outpos == size) {
7180 return u;
7181 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007182
/* slow path: hand the partially filled string to a writer and resume
   after the bytes that were already decoded */
Inada Naoki770847a2019-06-24 12:30:24 +09007183 _PyUnicodeWriter writer;
7184 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007185 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007186
Inada Naoki770847a2019-06-24 12:30:24 +09007187 s += outpos;
/* local copies of the writer's kind/data; they are re-read after every
   call that is given the writer */
7188 int kind = writer.kind;
7189 void *data = writer.data;
7190 Py_ssize_t startinpos, endinpos;
7191
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007192 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007193 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007194 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007195 PyUnicode_WRITE(kind, data, writer.pos, c);
7196 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007198 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007200
7201 /* byte outside range 0x00..0x7f: call the error handler */
7202
7203 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007204 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007205
7206 switch (error_handler)
7207 {
7208 case _Py_ERROR_REPLACE:
7209 case _Py_ERROR_SURROGATEESCAPE:
7210 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007211 but we may switch to UCS2 at the first write */
7212 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7213 goto onError;
7214 kind = writer.kind;
7215 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007216
/* replace: write U+FFFD; surrogateescape: write 0xdc00 + byte value */
7217 if (error_handler == _Py_ERROR_REPLACE)
7218 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7219 else
7220 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7221 writer.pos++;
7222 ++s;
7223 break;
7224
7225 case _Py_ERROR_IGNORE:
/* drop the offending byte */
7226 ++s;
7227 break;
7228
7229 default:
/* every other handler is called through the generic helper with the
   one-byte range [startinpos, endinpos); the helper is given &s, &starts
   and &e and may update them */
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 startinpos = s-starts;
7231 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007232 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007233 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 "ascii", "ordinal not in range(128)",
7235 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007236 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 goto onError;
/* refresh the local copies after the helper has used the writer */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007238 kind = writer.kind;
7239 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241 }
/* success: release the error handling state and build the result */
Victor Stinnerf96418d2015-09-21 23:06:27 +02007242 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007243 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007244 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007245
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 onError:
/* failure: discard the writer and the error handling state */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007247 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007248 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007249 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250 return NULL;
7251}
7252
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007253/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007254PyObject *
7255PyUnicode_EncodeASCII(const Py_UNICODE *p,
7256 Py_ssize_t size,
7257 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007259 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007260 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007261 if (unicode == NULL)
7262 return NULL;
7263 result = unicode_encode_ucs1(unicode, errors, 128);
7264 Py_DECREF(unicode);
7265 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266}
7267
Alexander Belopolsky40018472011-02-26 01:02:56 +00007268PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007269_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270{
7271 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 PyErr_BadArgument();
7273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007275 if (PyUnicode_READY(unicode) == -1)
7276 return NULL;
7277 /* Fast path: if it is an ASCII-only string, construct bytes object
7278 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007279 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007280 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7281 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007282 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007283}
7284
7285PyObject *
7286PyUnicode_AsASCIIString(PyObject *unicode)
7287{
7288 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007289}
7290
Steve Dowercc16be82016-09-08 10:35:16 -07007291#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007292
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007293/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007294
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007295#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007296#define NEED_RETRY
7297#endif
7298
Steve Dower7ebdda02019-08-21 16:22:33 -07007299/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7300 transcoding from UTF-16), but INT_MAX / 4 performs better in
7301 both cases also and avoids partial characters overrunning the
7302 length limit in MultiByteToWideChar on Windows */
7303#define DECODING_CHUNK_SIZE (INT_MAX/4)
7304
Victor Stinner3a50e702011-10-18 21:21:00 +02007305#ifndef WC_ERR_INVALID_CHARS
7306# define WC_ERR_INVALID_CHARS 0x0080
7307#endif
7308
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007309static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007310code_page_name(UINT code_page, PyObject **obj)
7311{
7312 *obj = NULL;
7313 if (code_page == CP_ACP)
7314 return "mbcs";
7315 if (code_page == CP_UTF7)
7316 return "CP_UTF7";
7317 if (code_page == CP_UTF8)
7318 return "CP_UTF8";
7319
7320 *obj = PyBytes_FromFormat("cp%u", code_page);
7321 if (*obj == NULL)
7322 return NULL;
7323 return PyBytes_AS_STRING(*obj);
7324}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325
Victor Stinner3a50e702011-10-18 21:21:00 +02007326static DWORD
7327decode_code_page_flags(UINT code_page)
7328{
7329 if (code_page == CP_UTF7) {
7330 /* The CP_UTF7 decoder only supports flags=0 */
7331 return 0;
7332 }
7333 else
7334 return MB_ERR_INVALID_CHARS;
7335}
7336
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 * Decode a byte string from a Windows code page into unicode object in strict
7339 * mode.
7340 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007341 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7342 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007343 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007344static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007345decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007346 wchar_t **buf,
7347 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 const char *in,
7349 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007350{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007351 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007352 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007353 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007354
7355 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007356 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007357 while ((outsize = MultiByteToWideChar(code_page, flags,
7358 in, insize, NULL, 0)) <= 0)
7359 {
7360 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7361 goto error;
7362 }
7363 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7364 flags = 0;
7365 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007366
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007367 /* Extend a wchar_t* buffer */
7368 Py_ssize_t n = *bufsize; /* Get the current length */
7369 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7370 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007372 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007373
7374 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007375 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7376 if (outsize <= 0)
7377 goto error;
7378 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007379
Victor Stinner3a50e702011-10-18 21:21:00 +02007380error:
7381 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7382 return -2;
7383 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007384 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007385}
7386
Victor Stinner3a50e702011-10-18 21:21:00 +02007387/*
7388 * Decode a byte string from a code page into unicode object with an error
7389 * handler.
7390 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007391 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 * UnicodeDecodeError exception and returns -1 on error.
7393 */
7394static int
7395decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007396 wchar_t **buf,
7397 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007398 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007399 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007400{
7401 const char *startin = in;
7402 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007403 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007404 /* Ideally, we should get reason from FormatMessage. This is the Windows
7405 2000 English version of the message. */
7406 const char *reason = "No mapping for the Unicode character exists "
7407 "in the target code page.";
7408 /* each step cannot decode more than 1 character, but a character can be
7409 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007410 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007411 int insize;
7412 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 PyObject *errorHandler = NULL;
7414 PyObject *exc = NULL;
7415 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007416 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 DWORD err;
7418 int ret = -1;
7419
7420 assert(size > 0);
7421
7422 encoding = code_page_name(code_page, &encoding_obj);
7423 if (encoding == NULL)
7424 return -1;
7425
Victor Stinner7d00cc12014-03-17 23:08:06 +01007426 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7428 UnicodeDecodeError. */
7429 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7430 if (exc != NULL) {
7431 PyCodec_StrictErrors(exc);
7432 Py_CLEAR(exc);
7433 }
7434 goto error;
7435 }
7436
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007437 /* Extend a wchar_t* buffer */
7438 Py_ssize_t n = *bufsize; /* Get the current length */
7439 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7440 PyErr_NoMemory();
7441 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007443 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7444 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007446 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007447
7448 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 while (in < endin)
7450 {
7451 /* Decode a character */
7452 insize = 1;
7453 do
7454 {
7455 outsize = MultiByteToWideChar(code_page, flags,
7456 in, insize,
7457 buffer, Py_ARRAY_LENGTH(buffer));
7458 if (outsize > 0)
7459 break;
7460 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007461 if (err == ERROR_INVALID_FLAGS && flags) {
7462 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7463 flags = 0;
7464 continue;
7465 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 if (err != ERROR_NO_UNICODE_TRANSLATION
7467 && err != ERROR_INSUFFICIENT_BUFFER)
7468 {
7469 PyErr_SetFromWindowsErr(0);
7470 goto error;
7471 }
7472 insize++;
7473 }
7474 /* 4=maximum length of a UTF-8 sequence */
7475 while (insize <= 4 && (in + insize) <= endin);
7476
7477 if (outsize <= 0) {
7478 Py_ssize_t startinpos, endinpos, outpos;
7479
Victor Stinner7d00cc12014-03-17 23:08:06 +01007480 /* last character in partial decode? */
7481 if (in + insize >= endin && !final)
7482 break;
7483
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 startinpos = in - startin;
7485 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007486 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007487 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 errors, &errorHandler,
7489 encoding, reason,
7490 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007491 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 {
7493 goto error;
7494 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007495 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 }
7497 else {
7498 in += insize;
7499 memcpy(out, buffer, outsize * sizeof(wchar_t));
7500 out += outsize;
7501 }
7502 }
7503
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007504 /* Shrink the buffer */
7505 assert(out - *buf <= *bufsize);
7506 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007507 /* (in - startin) <= size and size is an int */
7508 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007509
7510error:
7511 Py_XDECREF(encoding_obj);
7512 Py_XDECREF(errorHandler);
7513 Py_XDECREF(exc);
7514 return ret;
7515}
7516
Victor Stinner3a50e702011-10-18 21:21:00 +02007517static PyObject *
7518decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007519 const char *s, Py_ssize_t size,
7520 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007521{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007522 wchar_t *buf = NULL;
7523 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007524 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007525
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 if (code_page < 0) {
7527 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7528 return NULL;
7529 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007530 if (size < 0) {
7531 PyErr_BadInternalCall();
7532 return NULL;
7533 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007534
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007535 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007537
Victor Stinner76a31a62011-11-04 00:05:13 +01007538 do
7539 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007540#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007541 if (size > DECODING_CHUNK_SIZE) {
7542 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007543 final = 0;
7544 done = 0;
7545 }
7546 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007547#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007548 {
7549 chunk_size = (int)size;
7550 final = (consumed == NULL);
7551 done = 1;
7552 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007553
Victor Stinner76a31a62011-11-04 00:05:13 +01007554 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007555 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007556 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007557 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007558 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007559
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007560 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007561 s, chunk_size);
7562 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007563 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007564 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007565 errors, final);
7566 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007567
7568 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007569 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007570 return NULL;
7571 }
7572
7573 if (consumed)
7574 *consumed += converted;
7575
7576 s += converted;
7577 size -= converted;
7578 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007579
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007580 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7581 PyMem_Free(buf);
7582 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007583}
7584
Alexander Belopolsky40018472011-02-26 01:02:56 +00007585PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007586PyUnicode_DecodeCodePageStateful(int code_page,
7587 const char *s,
7588 Py_ssize_t size,
7589 const char *errors,
7590 Py_ssize_t *consumed)
7591{
7592 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7593}
7594
7595PyObject *
7596PyUnicode_DecodeMBCSStateful(const char *s,
7597 Py_ssize_t size,
7598 const char *errors,
7599 Py_ssize_t *consumed)
7600{
7601 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7602}
7603
7604PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007605PyUnicode_DecodeMBCS(const char *s,
7606 Py_ssize_t size,
7607 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007608{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007609 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7610}
7611
Victor Stinner3a50e702011-10-18 21:21:00 +02007612static DWORD
7613encode_code_page_flags(UINT code_page, const char *errors)
7614{
7615 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007616 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 }
7618 else if (code_page == CP_UTF7) {
7619 /* CP_UTF7 only supports flags=0 */
7620 return 0;
7621 }
7622 else {
7623 if (errors != NULL && strcmp(errors, "replace") == 0)
7624 return 0;
7625 else
7626 return WC_NO_BEST_FIT_CHARS;
7627 }
7628}
7629
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007630/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007631 * Encode a Unicode string to a Windows code page into a byte string in strict
7632 * mode.
7633 *
7634 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007635 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007636 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007637static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007638encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007639 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007641{
Victor Stinner554f3f02010-06-16 23:33:54 +00007642 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007643 BOOL *pusedDefaultChar = &usedDefaultChar;
7644 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007645 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007646 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 const DWORD flags = encode_code_page_flags(code_page, NULL);
7648 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007649 /* Create a substring so that we can get the UTF-16 representation
7650 of just the slice under consideration. */
7651 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007652
Martin v. Löwis3d325192011-11-04 18:23:06 +01007653 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007654
Victor Stinner3a50e702011-10-18 21:21:00 +02007655 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007656 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007657 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007658 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007659
Victor Stinner2fc507f2011-11-04 20:06:39 +01007660 substring = PyUnicode_Substring(unicode, offset, offset+len);
7661 if (substring == NULL)
7662 return -1;
7663 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7664 if (p == NULL) {
7665 Py_DECREF(substring);
7666 return -1;
7667 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007668 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007669
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007670 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007671 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007672 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007673 NULL, 0,
7674 NULL, pusedDefaultChar);
7675 if (outsize <= 0)
7676 goto error;
7677 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007678 if (pusedDefaultChar && *pusedDefaultChar) {
7679 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007680 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007681 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007682
Victor Stinner3a50e702011-10-18 21:21:00 +02007683 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007685 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007686 if (*outbytes == NULL) {
7687 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007689 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007690 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007691 }
7692 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007694 const Py_ssize_t n = PyBytes_Size(*outbytes);
7695 if (outsize > PY_SSIZE_T_MAX - n) {
7696 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007697 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007698 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007699 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007700 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7701 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007702 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007703 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007704 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007705 }
7706
7707 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007708 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007709 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007710 out, outsize,
7711 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007712 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007713 if (outsize <= 0)
7714 goto error;
7715 if (pusedDefaultChar && *pusedDefaultChar)
7716 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007717 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007718
Victor Stinner3a50e702011-10-18 21:21:00 +02007719error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007720 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007721 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7722 return -2;
7723 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007724 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007725}
7726
Victor Stinner3a50e702011-10-18 21:21:00 +02007727/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007728 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007729 * error handler.
7730 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007731 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007732 * -1 on other error.
7733 */
7734static int
7735encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007736 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007737 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007738{
Victor Stinner3a50e702011-10-18 21:21:00 +02007739 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007740 Py_ssize_t pos = unicode_offset;
7741 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007742 /* Ideally, we should get reason from FormatMessage. This is the Windows
7743 2000 English version of the message. */
7744 const char *reason = "invalid character";
7745 /* 4=maximum length of a UTF-8 sequence */
7746 char buffer[4];
7747 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7748 Py_ssize_t outsize;
7749 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007750 PyObject *errorHandler = NULL;
7751 PyObject *exc = NULL;
7752 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007753 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007754 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007755 PyObject *rep;
7756 int ret = -1;
7757
7758 assert(insize > 0);
7759
7760 encoding = code_page_name(code_page, &encoding_obj);
7761 if (encoding == NULL)
7762 return -1;
7763
7764 if (errors == NULL || strcmp(errors, "strict") == 0) {
7765 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7766 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007767 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007768 if (exc != NULL) {
7769 PyCodec_StrictErrors(exc);
7770 Py_DECREF(exc);
7771 }
7772 Py_XDECREF(encoding_obj);
7773 return -1;
7774 }
7775
7776 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7777 pusedDefaultChar = &usedDefaultChar;
7778 else
7779 pusedDefaultChar = NULL;
7780
7781 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7782 PyErr_NoMemory();
7783 goto error;
7784 }
7785 outsize = insize * Py_ARRAY_LENGTH(buffer);
7786
7787 if (*outbytes == NULL) {
7788 /* Create string object */
7789 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7790 if (*outbytes == NULL)
7791 goto error;
7792 out = PyBytes_AS_STRING(*outbytes);
7793 }
7794 else {
7795 /* Extend string object */
7796 Py_ssize_t n = PyBytes_Size(*outbytes);
7797 if (n > PY_SSIZE_T_MAX - outsize) {
7798 PyErr_NoMemory();
7799 goto error;
7800 }
7801 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7802 goto error;
7803 out = PyBytes_AS_STRING(*outbytes) + n;
7804 }
7805
7806 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007807 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007808 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007809 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7810 wchar_t chars[2];
7811 int charsize;
7812 if (ch < 0x10000) {
7813 chars[0] = (wchar_t)ch;
7814 charsize = 1;
7815 }
7816 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007817 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7818 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007819 charsize = 2;
7820 }
7821
Victor Stinner3a50e702011-10-18 21:21:00 +02007822 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007823 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007824 buffer, Py_ARRAY_LENGTH(buffer),
7825 NULL, pusedDefaultChar);
7826 if (outsize > 0) {
7827 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7828 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007829 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007830 memcpy(out, buffer, outsize);
7831 out += outsize;
7832 continue;
7833 }
7834 }
7835 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7836 PyErr_SetFromWindowsErr(0);
7837 goto error;
7838 }
7839
Victor Stinner3a50e702011-10-18 21:21:00 +02007840 rep = unicode_encode_call_errorhandler(
7841 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007842 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007843 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007844 if (rep == NULL)
7845 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007846 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007847
7848 if (PyBytes_Check(rep)) {
7849 outsize = PyBytes_GET_SIZE(rep);
7850 if (outsize != 1) {
7851 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7852 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7853 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7854 Py_DECREF(rep);
7855 goto error;
7856 }
7857 out = PyBytes_AS_STRING(*outbytes) + offset;
7858 }
7859 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7860 out += outsize;
7861 }
7862 else {
7863 Py_ssize_t i;
7864 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007865 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007866
Benjamin Petersonbac79492012-01-14 13:34:47 -05007867 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007868 Py_DECREF(rep);
7869 goto error;
7870 }
7871
7872 outsize = PyUnicode_GET_LENGTH(rep);
7873 if (outsize != 1) {
7874 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7875 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7876 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7877 Py_DECREF(rep);
7878 goto error;
7879 }
7880 out = PyBytes_AS_STRING(*outbytes) + offset;
7881 }
7882 kind = PyUnicode_KIND(rep);
7883 data = PyUnicode_DATA(rep);
7884 for (i=0; i < outsize; i++) {
7885 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7886 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007887 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007888 encoding, unicode,
7889 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007890 "unable to encode error handler result to ASCII");
7891 Py_DECREF(rep);
7892 goto error;
7893 }
7894 *out = (unsigned char)ch;
7895 out++;
7896 }
7897 }
7898 Py_DECREF(rep);
7899 }
7900 /* write a NUL byte */
7901 *out = 0;
7902 outsize = out - PyBytes_AS_STRING(*outbytes);
7903 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7904 if (_PyBytes_Resize(outbytes, outsize) < 0)
7905 goto error;
7906 ret = 0;
7907
7908error:
7909 Py_XDECREF(encoding_obj);
7910 Py_XDECREF(errorHandler);
7911 Py_XDECREF(exc);
7912 return ret;
7913}
7914
Victor Stinner3a50e702011-10-18 21:21:00 +02007915static PyObject *
7916encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007917 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007918 const char *errors)
7919{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007920 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007921 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007922 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007923 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007924
Victor Stinner29dacf22015-01-26 16:41:32 +01007925 if (!PyUnicode_Check(unicode)) {
7926 PyErr_BadArgument();
7927 return NULL;
7928 }
7929
Benjamin Petersonbac79492012-01-14 13:34:47 -05007930 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007931 return NULL;
7932 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007933
Victor Stinner3a50e702011-10-18 21:21:00 +02007934 if (code_page < 0) {
7935 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7936 return NULL;
7937 }
7938
Martin v. Löwis3d325192011-11-04 18:23:06 +01007939 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007940 return PyBytes_FromStringAndSize(NULL, 0);
7941
Victor Stinner7581cef2011-11-03 22:32:33 +01007942 offset = 0;
7943 do
7944 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007945#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007946 if (len > DECODING_CHUNK_SIZE) {
7947 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007948 done = 0;
7949 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007950 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007951#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007952 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007953 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007954 done = 1;
7955 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007956
Victor Stinner76a31a62011-11-04 00:05:13 +01007957 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007958 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007959 errors);
7960 if (ret == -2)
7961 ret = encode_code_page_errors(code_page, &outbytes,
7962 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007963 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007964 if (ret < 0) {
7965 Py_XDECREF(outbytes);
7966 return NULL;
7967 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007968
Victor Stinner7581cef2011-11-03 22:32:33 +01007969 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007970 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007971 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007972
Victor Stinner3a50e702011-10-18 21:21:00 +02007973 return outbytes;
7974}
7975
7976PyObject *
7977PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7978 Py_ssize_t size,
7979 const char *errors)
7980{
Victor Stinner7581cef2011-11-03 22:32:33 +01007981 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007982 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007983 if (unicode == NULL)
7984 return NULL;
7985 res = encode_code_page(CP_ACP, unicode, errors);
7986 Py_DECREF(unicode);
7987 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007988}
7989
7990PyObject *
7991PyUnicode_EncodeCodePage(int code_page,
7992 PyObject *unicode,
7993 const char *errors)
7994{
Victor Stinner7581cef2011-11-03 22:32:33 +01007995 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007996}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007997
Alexander Belopolsky40018472011-02-26 01:02:56 +00007998PyObject *
7999PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008000{
Victor Stinner7581cef2011-11-03 22:32:33 +01008001 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008002}
8003
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008004#undef NEED_RETRY
8005
Steve Dowercc16be82016-09-08 10:35:16 -07008006#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008007
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008/* --- Character Mapping Codec -------------------------------------------- */
8009
Victor Stinnerfb161b12013-04-18 01:44:27 +02008010static int
8011charmap_decode_string(const char *s,
8012 Py_ssize_t size,
8013 PyObject *mapping,
8014 const char *errors,
8015 _PyUnicodeWriter *writer)
8016{
8017 const char *starts = s;
8018 const char *e;
8019 Py_ssize_t startinpos, endinpos;
8020 PyObject *errorHandler = NULL, *exc = NULL;
8021 Py_ssize_t maplen;
8022 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008023 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008024 Py_UCS4 x;
8025 unsigned char ch;
8026
8027 if (PyUnicode_READY(mapping) == -1)
8028 return -1;
8029
8030 maplen = PyUnicode_GET_LENGTH(mapping);
8031 mapdata = PyUnicode_DATA(mapping);
8032 mapkind = PyUnicode_KIND(mapping);
8033
8034 e = s + size;
8035
8036 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8037 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8038 * is disabled in encoding aliases, latin1 is preferred because
8039 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008040 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008041 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8042 Py_UCS4 maxchar = writer->maxchar;
8043
8044 assert (writer->kind == PyUnicode_1BYTE_KIND);
8045 while (s < e) {
8046 ch = *s;
8047 x = mapdata_ucs1[ch];
8048 if (x > maxchar) {
8049 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8050 goto onError;
8051 maxchar = writer->maxchar;
8052 outdata = (Py_UCS1 *)writer->data;
8053 }
8054 outdata[writer->pos] = x;
8055 writer->pos++;
8056 ++s;
8057 }
8058 return 0;
8059 }
8060
8061 while (s < e) {
8062 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8063 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008064 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008065 if (outkind == PyUnicode_1BYTE_KIND) {
8066 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8067 Py_UCS4 maxchar = writer->maxchar;
8068 while (s < e) {
8069 ch = *s;
8070 x = mapdata_ucs2[ch];
8071 if (x > maxchar)
8072 goto Error;
8073 outdata[writer->pos] = x;
8074 writer->pos++;
8075 ++s;
8076 }
8077 break;
8078 }
8079 else if (outkind == PyUnicode_2BYTE_KIND) {
8080 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8081 while (s < e) {
8082 ch = *s;
8083 x = mapdata_ucs2[ch];
8084 if (x == 0xFFFE)
8085 goto Error;
8086 outdata[writer->pos] = x;
8087 writer->pos++;
8088 ++s;
8089 }
8090 break;
8091 }
8092 }
8093 ch = *s;
8094
8095 if (ch < maplen)
8096 x = PyUnicode_READ(mapkind, mapdata, ch);
8097 else
8098 x = 0xfffe; /* invalid value */
8099Error:
8100 if (x == 0xfffe)
8101 {
8102 /* undefined mapping */
8103 startinpos = s-starts;
8104 endinpos = startinpos+1;
8105 if (unicode_decode_call_errorhandler_writer(
8106 errors, &errorHandler,
8107 "charmap", "character maps to <undefined>",
8108 &starts, &e, &startinpos, &endinpos, &exc, &s,
8109 writer)) {
8110 goto onError;
8111 }
8112 continue;
8113 }
8114
8115 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8116 goto onError;
8117 ++s;
8118 }
8119 Py_XDECREF(errorHandler);
8120 Py_XDECREF(exc);
8121 return 0;
8122
8123onError:
8124 Py_XDECREF(errorHandler);
8125 Py_XDECREF(exc);
8126 return -1;
8127}
8128
8129static int
8130charmap_decode_mapping(const char *s,
8131 Py_ssize_t size,
8132 PyObject *mapping,
8133 const char *errors,
8134 _PyUnicodeWriter *writer)
8135{
8136 const char *starts = s;
8137 const char *e;
8138 Py_ssize_t startinpos, endinpos;
8139 PyObject *errorHandler = NULL, *exc = NULL;
8140 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008141 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008142
8143 e = s + size;
8144
8145 while (s < e) {
8146 ch = *s;
8147
8148 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8149 key = PyLong_FromLong((long)ch);
8150 if (key == NULL)
8151 goto onError;
8152
8153 item = PyObject_GetItem(mapping, key);
8154 Py_DECREF(key);
8155 if (item == NULL) {
8156 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8157 /* No mapping found means: mapping is undefined. */
8158 PyErr_Clear();
8159 goto Undefined;
8160 } else
8161 goto onError;
8162 }
8163
8164 /* Apply mapping */
8165 if (item == Py_None)
8166 goto Undefined;
8167 if (PyLong_Check(item)) {
8168 long value = PyLong_AS_LONG(item);
8169 if (value == 0xFFFE)
8170 goto Undefined;
8171 if (value < 0 || value > MAX_UNICODE) {
8172 PyErr_Format(PyExc_TypeError,
8173 "character mapping must be in range(0x%lx)",
8174 (unsigned long)MAX_UNICODE + 1);
8175 goto onError;
8176 }
8177
8178 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8179 goto onError;
8180 }
8181 else if (PyUnicode_Check(item)) {
8182 if (PyUnicode_READY(item) == -1)
8183 goto onError;
8184 if (PyUnicode_GET_LENGTH(item) == 1) {
8185 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8186 if (value == 0xFFFE)
8187 goto Undefined;
8188 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8189 goto onError;
8190 }
8191 else {
8192 writer->overallocate = 1;
8193 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8194 goto onError;
8195 }
8196 }
8197 else {
8198 /* wrong return value */
8199 PyErr_SetString(PyExc_TypeError,
8200 "character mapping must return integer, None or str");
8201 goto onError;
8202 }
8203 Py_CLEAR(item);
8204 ++s;
8205 continue;
8206
8207Undefined:
8208 /* undefined mapping */
8209 Py_CLEAR(item);
8210 startinpos = s-starts;
8211 endinpos = startinpos+1;
8212 if (unicode_decode_call_errorhandler_writer(
8213 errors, &errorHandler,
8214 "charmap", "character maps to <undefined>",
8215 &starts, &e, &startinpos, &endinpos, &exc, &s,
8216 writer)) {
8217 goto onError;
8218 }
8219 }
8220 Py_XDECREF(errorHandler);
8221 Py_XDECREF(exc);
8222 return 0;
8223
8224onError:
8225 Py_XDECREF(item);
8226 Py_XDECREF(errorHandler);
8227 Py_XDECREF(exc);
8228 return -1;
8229}
8230
Alexander Belopolsky40018472011-02-26 01:02:56 +00008231PyObject *
8232PyUnicode_DecodeCharmap(const char *s,
8233 Py_ssize_t size,
8234 PyObject *mapping,
8235 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008237 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008238
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 /* Default to Latin-1 */
8240 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008244 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008245 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008246 writer.min_length = size;
8247 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008249
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008250 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008251 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8252 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008253 }
8254 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008255 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8256 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008258 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008259
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008261 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262 return NULL;
8263}
8264
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008265/* Charmap encoding: the lookup table */
8266
Alexander Belopolsky40018472011-02-26 01:02:56 +00008267struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 PyObject_HEAD
8269 unsigned char level1[32];
8270 int count2, count3;
8271 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008272};
8273
8274static PyObject*
8275encoding_map_size(PyObject *obj, PyObject* args)
8276{
8277 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008278 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008280}
8281
8282static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008283 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 PyDoc_STR("Return the size (in bytes) of this object") },
8285 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008286};
8287
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008288static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008289 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 "EncodingMap", /*tp_name*/
8291 sizeof(struct encoding_map), /*tp_basicsize*/
8292 0, /*tp_itemsize*/
8293 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008294 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008295 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 0, /*tp_getattr*/
8297 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008298 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 0, /*tp_repr*/
8300 0, /*tp_as_number*/
8301 0, /*tp_as_sequence*/
8302 0, /*tp_as_mapping*/
8303 0, /*tp_hash*/
8304 0, /*tp_call*/
8305 0, /*tp_str*/
8306 0, /*tp_getattro*/
8307 0, /*tp_setattro*/
8308 0, /*tp_as_buffer*/
8309 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8310 0, /*tp_doc*/
8311 0, /*tp_traverse*/
8312 0, /*tp_clear*/
8313 0, /*tp_richcompare*/
8314 0, /*tp_weaklistoffset*/
8315 0, /*tp_iter*/
8316 0, /*tp_iternext*/
8317 encoding_map_methods, /*tp_methods*/
8318 0, /*tp_members*/
8319 0, /*tp_getset*/
8320 0, /*tp_base*/
8321 0, /*tp_dict*/
8322 0, /*tp_descr_get*/
8323 0, /*tp_descr_set*/
8324 0, /*tp_dictoffset*/
8325 0, /*tp_init*/
8326 0, /*tp_alloc*/
8327 0, /*tp_new*/
8328 0, /*tp_free*/
8329 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330};
8331
8332PyObject*
8333PyUnicode_BuildEncodingMap(PyObject* string)
8334{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008335 PyObject *result;
8336 struct encoding_map *mresult;
8337 int i;
8338 int need_dict = 0;
8339 unsigned char level1[32];
8340 unsigned char level2[512];
8341 unsigned char *mlevel1, *mlevel2, *mlevel3;
8342 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008343 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008344 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008345 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008346 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008347
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008348 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008349 PyErr_BadArgument();
8350 return NULL;
8351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 kind = PyUnicode_KIND(string);
8353 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008354 length = PyUnicode_GET_LENGTH(string);
8355 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008356 memset(level1, 0xFF, sizeof level1);
8357 memset(level2, 0xFF, sizeof level2);
8358
8359 /* If there isn't a one-to-one mapping of NULL to \0,
8360 or if there are non-BMP characters, we need to use
8361 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008363 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008364 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008365 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 ch = PyUnicode_READ(kind, data, i);
8367 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008368 need_dict = 1;
8369 break;
8370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008371 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008372 /* unmapped character */
8373 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 l1 = ch >> 11;
8375 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008376 if (level1[l1] == 0xFF)
8377 level1[l1] = count2++;
8378 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008379 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008380 }
8381
8382 if (count2 >= 0xFF || count3 >= 0xFF)
8383 need_dict = 1;
8384
8385 if (need_dict) {
8386 PyObject *result = PyDict_New();
8387 PyObject *key, *value;
8388 if (!result)
8389 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008390 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008392 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008393 if (!key || !value)
8394 goto failed1;
8395 if (PyDict_SetItem(result, key, value) == -1)
8396 goto failed1;
8397 Py_DECREF(key);
8398 Py_DECREF(value);
8399 }
8400 return result;
8401 failed1:
8402 Py_XDECREF(key);
8403 Py_XDECREF(value);
8404 Py_DECREF(result);
8405 return NULL;
8406 }
8407
8408 /* Create a three-level trie */
8409 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8410 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008411 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008412 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008413 }
8414
8415 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008416 mresult = (struct encoding_map*)result;
8417 mresult->count2 = count2;
8418 mresult->count3 = count3;
8419 mlevel1 = mresult->level1;
8420 mlevel2 = mresult->level23;
8421 mlevel3 = mresult->level23 + 16*count2;
8422 memcpy(mlevel1, level1, 32);
8423 memset(mlevel2, 0xFF, 16*count2);
8424 memset(mlevel3, 0, 128*count3);
8425 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008426 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008427 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008428 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8429 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008430 /* unmapped character */
8431 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008432 o1 = ch>>11;
8433 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008434 i2 = 16*mlevel1[o1] + o2;
8435 if (mlevel2[i2] == 0xFF)
8436 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008437 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008438 i3 = 128*mlevel2[i2] + o3;
8439 mlevel3[i3] = i;
8440 }
8441 return result;
8442}
8443
8444static int
Victor Stinner22168992011-11-20 17:09:18 +01008445encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008446{
8447 struct encoding_map *map = (struct encoding_map*)mapping;
8448 int l1 = c>>11;
8449 int l2 = (c>>7) & 0xF;
8450 int l3 = c & 0x7F;
8451 int i;
8452
Victor Stinner22168992011-11-20 17:09:18 +01008453 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008455 if (c == 0)
8456 return 0;
8457 /* level 1*/
8458 i = map->level1[l1];
8459 if (i == 0xFF) {
8460 return -1;
8461 }
8462 /* level 2*/
8463 i = map->level23[16*i+l2];
8464 if (i == 0xFF) {
8465 return -1;
8466 }
8467 /* level 3 */
8468 i = map->level23[16*map->count2 + 128*i + l3];
8469 if (i == 0) {
8470 return -1;
8471 }
8472 return i;
8473}
8474
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475/* Lookup the character ch in the mapping. If the character
8476 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008477 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008478static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008479charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480{
Christian Heimes217cfd12007-12-02 14:31:20 +00008481 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482 PyObject *x;
8483
8484 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486 x = PyObject_GetItem(mapping, w);
8487 Py_DECREF(w);
8488 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8490 /* No mapping found means: mapping is undefined. */
8491 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008492 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 } else
8494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008496 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008498 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 long value = PyLong_AS_LONG(x);
8500 if (value < 0 || value > 255) {
8501 PyErr_SetString(PyExc_TypeError,
8502 "character mapping must be in range(256)");
8503 Py_DECREF(x);
8504 return NULL;
8505 }
8506 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008508 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 /* wrong return value */
8512 PyErr_Format(PyExc_TypeError,
8513 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008514 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 Py_DECREF(x);
8516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 }
8518}
8519
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008520static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008521charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008522{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008523 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8524 /* exponentially overallocate to minimize reallocations */
8525 if (requiredsize < 2*outsize)
8526 requiredsize = 2*outsize;
8527 if (_PyBytes_Resize(outobj, requiredsize))
8528 return -1;
8529 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008530}
8531
/* Outcome of trying to encode one character with charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008535/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008536 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 space is available. Return a new reference to the object that
8538 was put in the output buffer, or Py_None, if the mapping was undefined
8539 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008540 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008541static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008542charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008543 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008545 PyObject *rep;
8546 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008547 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548
Andy Lesterdffe4c02020-03-04 07:15:20 -06008549 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008550 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008552 if (res == -1)
8553 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 if (outsize<requiredsize)
8555 if (charmapencode_resize(outobj, outpos, requiredsize))
8556 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008557 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 outstart[(*outpos)++] = (char)res;
8559 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008560 }
8561
8562 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008565 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 Py_DECREF(rep);
8567 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008568 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 if (PyLong_Check(rep)) {
8570 Py_ssize_t requiredsize = *outpos+1;
8571 if (outsize<requiredsize)
8572 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8573 Py_DECREF(rep);
8574 return enc_EXCEPTION;
8575 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008576 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008578 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 else {
8580 const char *repchars = PyBytes_AS_STRING(rep);
8581 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8582 Py_ssize_t requiredsize = *outpos+repsize;
8583 if (outsize<requiredsize)
8584 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8585 Py_DECREF(rep);
8586 return enc_EXCEPTION;
8587 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008588 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 memcpy(outstart + *outpos, repchars, repsize);
8590 *outpos += repsize;
8591 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008593 Py_DECREF(rep);
8594 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595}
8596
8597/* handle an error in PyUnicode_EncodeCharmap
8598 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008599static int
8600charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008601 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008602 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008603 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008604 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008605{
8606 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008607 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008608 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008609 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008610 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008611 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008612 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008613 Py_ssize_t collstartpos = *inpos;
8614 Py_ssize_t collendpos = *inpos+1;
8615 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008616 const char *encoding = "charmap";
8617 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008618 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008619 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008620 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008621
Benjamin Petersonbac79492012-01-14 13:34:47 -05008622 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008623 return -1;
8624 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 /* find all unencodable characters */
8626 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008627 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008628 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008629 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008630 val = encoding_map_lookup(ch, mapping);
8631 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 break;
8633 ++collendpos;
8634 continue;
8635 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008636
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008637 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8638 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 if (rep==NULL)
8640 return -1;
8641 else if (rep!=Py_None) {
8642 Py_DECREF(rep);
8643 break;
8644 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008645 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 }
8648 /* cache callback name lookup
8649 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008650 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008651 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008652
8653 switch (*error_handler) {
8654 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008655 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008656 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008657
8658 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008659 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 x = charmapencode_output('?', mapping, res, respos);
8661 if (x==enc_EXCEPTION) {
8662 return -1;
8663 }
8664 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008665 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 return -1;
8667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008668 }
8669 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008670 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008671 *inpos = collendpos;
8672 break;
Victor Stinner50149202015-09-22 00:26:54 +02008673
8674 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008675 /* generate replacement (temporarily (mis)uses p) */
8676 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 char buffer[2+29+1+1];
8678 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008679 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 for (cp = buffer; *cp; ++cp) {
8681 x = charmapencode_output(*cp, mapping, res, respos);
8682 if (x==enc_EXCEPTION)
8683 return -1;
8684 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008685 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 return -1;
8687 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008688 }
8689 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008690 *inpos = collendpos;
8691 break;
Victor Stinner50149202015-09-22 00:26:54 +02008692
Benjamin Peterson14339b62009-01-31 16:36:08 +00008693 default:
Victor Stinner50149202015-09-22 00:26:54 +02008694 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008695 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008697 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008699 if (PyBytes_Check(repunicode)) {
8700 /* Directly copy bytes result to output. */
8701 Py_ssize_t outsize = PyBytes_Size(*res);
8702 Py_ssize_t requiredsize;
8703 repsize = PyBytes_Size(repunicode);
8704 requiredsize = *respos + repsize;
8705 if (requiredsize > outsize)
8706 /* Make room for all additional bytes. */
8707 if (charmapencode_resize(res, respos, requiredsize)) {
8708 Py_DECREF(repunicode);
8709 return -1;
8710 }
8711 memcpy(PyBytes_AsString(*res) + *respos,
8712 PyBytes_AsString(repunicode), repsize);
8713 *respos += repsize;
8714 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008715 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008716 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008717 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008718 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008719 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008720 Py_DECREF(repunicode);
8721 return -1;
8722 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008723 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008724 data = PyUnicode_DATA(repunicode);
8725 kind = PyUnicode_KIND(repunicode);
8726 for (index = 0; index < repsize; index++) {
8727 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8728 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008730 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 return -1;
8732 }
8733 else if (x==enc_FAILED) {
8734 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008735 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 return -1;
8737 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008738 }
8739 *inpos = newpos;
8740 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741 }
8742 return 0;
8743}
8744
Alexander Belopolsky40018472011-02-26 01:02:56 +00008745PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008746_PyUnicode_EncodeCharmap(PyObject *unicode,
8747 PyObject *mapping,
8748 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008750 /* output object */
8751 PyObject *res = NULL;
8752 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008753 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008754 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008756 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008757 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008758 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008759 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008760 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008761 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008762
Benjamin Petersonbac79492012-01-14 13:34:47 -05008763 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008764 return NULL;
8765 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008766 data = PyUnicode_DATA(unicode);
8767 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008768
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769 /* Default to Latin-1 */
8770 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008771 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008773 /* allocate enough for a simple encoding without
8774 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008775 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008776 if (res == NULL)
8777 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008778 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008782 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008784 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 if (x==enc_EXCEPTION) /* error */
8786 goto onError;
8787 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008788 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008790 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 &res, &respos)) {
8792 goto onError;
8793 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008794 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 else
8796 /* done with this character => adjust input position */
8797 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008800 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008801 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008802 if (_PyBytes_Resize(&res, respos) < 0)
8803 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008804
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008805 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008806 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008807 return res;
8808
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008810 Py_XDECREF(res);
8811 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008812 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813 return NULL;
8814}
8815
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008816/* Deprecated */
8817PyObject *
8818PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8819 Py_ssize_t size,
8820 PyObject *mapping,
8821 const char *errors)
8822{
8823 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008824 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008825 if (unicode == NULL)
8826 return NULL;
8827 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8828 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008829 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008830}
8831
Alexander Belopolsky40018472011-02-26 01:02:56 +00008832PyObject *
8833PyUnicode_AsCharmapString(PyObject *unicode,
8834 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835{
8836 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 PyErr_BadArgument();
8838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008840 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841}
8842
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008843/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008844static void
8845make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008847 Py_ssize_t startpos, Py_ssize_t endpos,
8848 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008850 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 *exceptionObject = _PyUnicodeTranslateError_Create(
8852 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853 }
8854 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8856 goto onError;
8857 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8858 goto onError;
8859 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8860 goto onError;
8861 return;
8862 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008863 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008864 }
8865}
8866
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008867/* error handling callback helper:
8868 build arguments, call the callback and check the arguments,
8869 put the result into newpos and return the replacement string, which
8870 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008871static PyObject *
8872unicode_translate_call_errorhandler(const char *errors,
8873 PyObject **errorHandler,
8874 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008876 Py_ssize_t startpos, Py_ssize_t endpos,
8877 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008878{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008879 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008880
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008881 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008882 PyObject *restuple;
8883 PyObject *resunicode;
8884
8885 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008887 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008889 }
8890
8891 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008892 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008893 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008895
Petr Viktorinffd97532020-02-11 17:46:57 +01008896 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008897 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008899 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008900 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 Py_DECREF(restuple);
8902 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008903 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008904 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008905 &resunicode, &i_newpos)) {
8906 Py_DECREF(restuple);
8907 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008908 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008909 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008911 else
8912 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008914 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 Py_DECREF(restuple);
8916 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008917 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008918 Py_INCREF(resunicode);
8919 Py_DECREF(restuple);
8920 return resunicode;
8921}
8922
8923/* Lookup the character ch in the mapping and put the result in result,
8924 which must be decrefed by the caller.
8925 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008926static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008928{
Christian Heimes217cfd12007-12-02 14:31:20 +00008929 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008930 PyObject *x;
8931
8932 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008934 x = PyObject_GetItem(mapping, w);
8935 Py_DECREF(w);
8936 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8938 /* No mapping found means: use 1:1 mapping. */
8939 PyErr_Clear();
8940 *result = NULL;
8941 return 0;
8942 } else
8943 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008944 }
8945 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 *result = x;
8947 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008948 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008949 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008951 if (value < 0 || value > MAX_UNICODE) {
8952 PyErr_Format(PyExc_ValueError,
8953 "character mapping must be in range(0x%x)",
8954 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 Py_DECREF(x);
8956 return -1;
8957 }
8958 *result = x;
8959 return 0;
8960 }
8961 else if (PyUnicode_Check(x)) {
8962 *result = x;
8963 return 0;
8964 }
8965 else {
8966 /* wrong return value */
8967 PyErr_SetString(PyExc_TypeError,
8968 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008969 Py_DECREF(x);
8970 return -1;
8971 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008972}
Victor Stinner1194ea02014-04-04 19:37:40 +02008973
8974/* lookup the character, write the result into the writer.
8975 Return 1 if the result was written into the writer, return 0 if the mapping
8976 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008977static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008978charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8979 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008980{
Victor Stinner1194ea02014-04-04 19:37:40 +02008981 PyObject *item;
8982
8983 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008985
8986 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008988 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008991 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008992 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008993
8994 if (item == Py_None) {
8995 Py_DECREF(item);
8996 return 0;
8997 }
8998
8999 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009000 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9001 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9002 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009003 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9004 Py_DECREF(item);
9005 return -1;
9006 }
9007 Py_DECREF(item);
9008 return 1;
9009 }
9010
9011 if (!PyUnicode_Check(item)) {
9012 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009014 }
9015
9016 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9017 Py_DECREF(item);
9018 return -1;
9019 }
9020
9021 Py_DECREF(item);
9022 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009023}
9024
Victor Stinner89a76ab2014-04-05 11:44:04 +02009025static int
9026unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9027 Py_UCS1 *translate)
9028{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009029 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009030 int ret = 0;
9031
Victor Stinner89a76ab2014-04-05 11:44:04 +02009032 if (charmaptranslate_lookup(ch, mapping, &item)) {
9033 return -1;
9034 }
9035
9036 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009037 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009038 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009039 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009040 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009041 /* not found => default to 1:1 mapping */
9042 translate[ch] = ch;
9043 return 1;
9044 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009045 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009046 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009047 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9048 used it */
9049 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009050 /* invalid character or character outside ASCII:
9051 skip the fast translate */
9052 goto exit;
9053 }
9054 translate[ch] = (Py_UCS1)replace;
9055 }
9056 else if (PyUnicode_Check(item)) {
9057 Py_UCS4 replace;
9058
9059 if (PyUnicode_READY(item) == -1) {
9060 Py_DECREF(item);
9061 return -1;
9062 }
9063 if (PyUnicode_GET_LENGTH(item) != 1)
9064 goto exit;
9065
9066 replace = PyUnicode_READ_CHAR(item, 0);
9067 if (replace > 127)
9068 goto exit;
9069 translate[ch] = (Py_UCS1)replace;
9070 }
9071 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009072 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009073 goto exit;
9074 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009075 ret = 1;
9076
Benjamin Peterson1365de72014-04-07 20:15:41 -04009077 exit:
9078 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009079 return ret;
9080}
9081
9082/* Fast path for ascii => ascii translation. Return 1 if the whole string
9083 was translated into writer, return 0 if the input string was partially
9084 translated into writer, raise an exception and return -1 on error. */
9085static int
9086unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009087 _PyUnicodeWriter *writer, int ignore,
9088 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009089{
Victor Stinner872b2912014-04-05 14:27:07 +02009090 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009091 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009092 const Py_UCS1 *in, *end;
9093 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009094 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009095
Victor Stinner89a76ab2014-04-05 11:44:04 +02009096 len = PyUnicode_GET_LENGTH(input);
9097
Victor Stinner872b2912014-04-05 14:27:07 +02009098 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009099
9100 in = PyUnicode_1BYTE_DATA(input);
9101 end = in + len;
9102
9103 assert(PyUnicode_IS_ASCII(writer->buffer));
9104 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9105 out = PyUnicode_1BYTE_DATA(writer->buffer);
9106
Victor Stinner872b2912014-04-05 14:27:07 +02009107 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009108 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009109 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009110 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009111 int translate = unicode_fast_translate_lookup(mapping, ch,
9112 ascii_table);
9113 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009114 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009115 if (translate == 0)
9116 goto exit;
9117 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009118 }
Victor Stinner872b2912014-04-05 14:27:07 +02009119 if (ch2 == 0xfe) {
9120 if (ignore)
9121 continue;
9122 goto exit;
9123 }
9124 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009125 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009126 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009127 }
Victor Stinner872b2912014-04-05 14:27:07 +02009128 res = 1;
9129
9130exit:
9131 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009132 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009133 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009134}
9135
Victor Stinner3222da22015-10-01 22:07:32 +02009136static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009137_PyUnicode_TranslateCharmap(PyObject *input,
9138 PyObject *mapping,
9139 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009142 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009143 Py_ssize_t size, i;
9144 int kind;
9145 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009146 _PyUnicodeWriter writer;
9147 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009148 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009149 PyObject *errorHandler = NULL;
9150 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009151 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009152 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009153
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 PyErr_BadArgument();
9156 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159 if (PyUnicode_READY(input) == -1)
9160 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009161 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009162 kind = PyUnicode_KIND(input);
9163 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009165 if (size == 0)
9166 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009168 /* allocate enough for a simple 1:1 translation without
9169 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009170 _PyUnicodeWriter_Init(&writer);
9171 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173
Victor Stinner872b2912014-04-05 14:27:07 +02009174 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9175
Victor Stinner33798672016-03-01 21:59:58 +01009176 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009177 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009178 if (PyUnicode_IS_ASCII(input)) {
9179 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9180 if (res < 0) {
9181 _PyUnicodeWriter_Dealloc(&writer);
9182 return NULL;
9183 }
9184 if (res == 1)
9185 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009186 }
Victor Stinner33798672016-03-01 21:59:58 +01009187 else {
9188 i = 0;
9189 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009192 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009193 int translate;
9194 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9195 Py_ssize_t newpos;
9196 /* startpos for collecting untranslatable chars */
9197 Py_ssize_t collstart;
9198 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009199 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200
Victor Stinner1194ea02014-04-04 19:37:40 +02009201 ch = PyUnicode_READ(kind, data, i);
9202 translate = charmaptranslate_output(ch, mapping, &writer);
9203 if (translate < 0)
9204 goto onError;
9205
9206 if (translate != 0) {
9207 /* it worked => adjust input pointer */
9208 ++i;
9209 continue;
9210 }
9211
9212 /* untranslatable character */
9213 collstart = i;
9214 collend = i+1;
9215
9216 /* find all untranslatable characters */
9217 while (collend < size) {
9218 PyObject *x;
9219 ch = PyUnicode_READ(kind, data, collend);
9220 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009221 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009222 Py_XDECREF(x);
9223 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009225 ++collend;
9226 }
9227
9228 if (ignore) {
9229 i = collend;
9230 }
9231 else {
9232 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9233 reason, input, &exc,
9234 collstart, collend, &newpos);
9235 if (repunicode == NULL)
9236 goto onError;
9237 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009239 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009240 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009241 Py_DECREF(repunicode);
9242 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009243 }
9244 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009245 Py_XDECREF(exc);
9246 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009247 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009248
Benjamin Peterson29060642009-01-31 22:14:21 +00009249 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009250 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009251 Py_XDECREF(exc);
9252 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253 return NULL;
9254}
9255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256/* Deprecated. Use PyUnicode_Translate instead. */
9257PyObject *
9258PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9259 Py_ssize_t size,
9260 PyObject *mapping,
9261 const char *errors)
9262{
Christian Heimes5f520f42012-09-11 14:03:25 +02009263 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009264 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 if (!unicode)
9266 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009267 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9268 Py_DECREF(unicode);
9269 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270}
9271
Alexander Belopolsky40018472011-02-26 01:02:56 +00009272PyObject *
9273PyUnicode_Translate(PyObject *str,
9274 PyObject *mapping,
9275 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009277 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009278 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009279 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280}
Tim Petersced69f82003-09-16 20:30:58 +00009281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282PyObject *
9283_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9284{
9285 if (!PyUnicode_Check(unicode)) {
9286 PyErr_BadInternalCall();
9287 return NULL;
9288 }
9289 if (PyUnicode_READY(unicode) == -1)
9290 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009291 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 /* If the string is already ASCII, just return the same string */
9293 Py_INCREF(unicode);
9294 return unicode;
9295 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009296
9297 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9298 PyObject *result = PyUnicode_New(len, 127);
9299 if (result == NULL) {
9300 return NULL;
9301 }
9302
9303 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9304 int kind = PyUnicode_KIND(unicode);
9305 const void *data = PyUnicode_DATA(unicode);
9306 Py_ssize_t i;
9307 for (i = 0; i < len; ++i) {
9308 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9309 if (ch < 127) {
9310 out[i] = ch;
9311 }
9312 else if (Py_UNICODE_ISSPACE(ch)) {
9313 out[i] = ' ';
9314 }
9315 else {
9316 int decimal = Py_UNICODE_TODECIMAL(ch);
9317 if (decimal < 0) {
9318 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009319 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009320 _PyUnicode_LENGTH(result) = i + 1;
9321 break;
9322 }
9323 out[i] = '0' + decimal;
9324 }
9325 }
9326
INADA Naoki16dfca42018-07-14 12:06:43 +09009327 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009328 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329}
9330
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009331PyObject *
9332PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9333 Py_ssize_t length)
9334{
Victor Stinnerf0124502011-11-21 23:12:56 +01009335 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009336 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009337 Py_UCS4 maxchar;
9338 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009339 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009340
Victor Stinner99d7ad02012-02-22 13:37:39 +01009341 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009342 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009343 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009344 if (ch > 127) {
9345 int decimal = Py_UNICODE_TODECIMAL(ch);
9346 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009347 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009348 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009349 }
9350 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009351
9352 /* Copy to a new string */
9353 decimal = PyUnicode_New(length, maxchar);
9354 if (decimal == NULL)
9355 return decimal;
9356 kind = PyUnicode_KIND(decimal);
9357 data = PyUnicode_DATA(decimal);
9358 /* Iterate over code points */
9359 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009360 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009361 if (ch > 127) {
9362 int decimal = Py_UNICODE_TODECIMAL(ch);
9363 if (decimal >= 0)
9364 ch = '0' + decimal;
9365 }
9366 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009368 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009369}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009370/* --- Decimal Encoder ---------------------------------------------------- */
9371
Alexander Belopolsky40018472011-02-26 01:02:56 +00009372int
9373PyUnicode_EncodeDecimal(Py_UNICODE *s,
9374 Py_ssize_t length,
9375 char *output,
9376 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009377{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009378 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009379 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009380 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009381 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009382
9383 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009384 PyErr_BadArgument();
9385 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009386 }
9387
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009388 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009389 if (unicode == NULL)
9390 return -1;
9391
Victor Stinner42bf7752011-11-21 22:52:58 +01009392 kind = PyUnicode_KIND(unicode);
9393 data = PyUnicode_DATA(unicode);
9394
Victor Stinnerb84d7232011-11-22 01:50:07 +01009395 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009396 PyObject *exc;
9397 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009398 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009399 Py_ssize_t startpos;
9400
9401 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009402
Benjamin Peterson29060642009-01-31 22:14:21 +00009403 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009404 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009405 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009406 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009407 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009408 decimal = Py_UNICODE_TODECIMAL(ch);
9409 if (decimal >= 0) {
9410 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009411 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009412 continue;
9413 }
9414 if (0 < ch && ch < 256) {
9415 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009416 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 continue;
9418 }
Victor Stinner6345be92011-11-25 20:09:01 +01009419
Victor Stinner42bf7752011-11-21 22:52:58 +01009420 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009421 exc = NULL;
9422 raise_encode_exception(&exc, "decimal", unicode,
9423 startpos, startpos+1,
9424 "invalid decimal Unicode string");
9425 Py_XDECREF(exc);
9426 Py_DECREF(unicode);
9427 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009428 }
9429 /* 0-terminate the output string */
9430 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009431 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009432 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009433}
9434
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435/* --- Helpers ------------------------------------------------------------ */
9436
/* Helper macro to fix up start/end slice values in place.

   `end` is capped at `len`; a negative `end` or `start` is taken relative
   to `len` and then floored at 0.  `start` is not capped at `len`.

   `start` and `end` must be modifiable lvalues.  Every argument is
   evaluated more than once, so do not pass expressions with side effects.
   The body is wrapped in do { } while (0) so that the macro behaves like a
   single statement (it must be followed by a semicolon). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009453any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009455 Py_ssize_t end,
9456 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009458 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009459 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 Py_ssize_t len1, len2, result;
9461
9462 kind1 = PyUnicode_KIND(s1);
9463 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009464 if (kind1 < kind2)
9465 return -1;
9466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 len1 = PyUnicode_GET_LENGTH(s1);
9468 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009469 ADJUST_INDICES(start, end, len1);
9470 if (end - start < len2)
9471 return -1;
9472
9473 buf1 = PyUnicode_DATA(s1);
9474 buf2 = PyUnicode_DATA(s2);
9475 if (len2 == 1) {
9476 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9477 result = findchar((const char *)buf1 + kind1*start,
9478 kind1, end - start, ch, direction);
9479 if (result == -1)
9480 return -1;
9481 else
9482 return start + result;
9483 }
9484
9485 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009486 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009487 if (!buf2)
9488 return -2;
9489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490
Victor Stinner794d5672011-10-10 03:21:36 +02009491 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009492 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009493 case PyUnicode_1BYTE_KIND:
9494 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9495 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9496 else
9497 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9498 break;
9499 case PyUnicode_2BYTE_KIND:
9500 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9501 break;
9502 case PyUnicode_4BYTE_KIND:
9503 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9504 break;
9505 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009506 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009507 }
9508 }
9509 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009510 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009511 case PyUnicode_1BYTE_KIND:
9512 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9513 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9514 else
9515 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9516 break;
9517 case PyUnicode_2BYTE_KIND:
9518 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9519 break;
9520 case PyUnicode_4BYTE_KIND:
9521 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9522 break;
9523 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009524 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 }
9527
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009528 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009529 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009530 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531
9532 return result;
9533}
9534
Victor Stinner59423e32018-11-26 13:40:01 +01009535/* _PyUnicode_InsertThousandsGrouping() helper functions */
9536#include "stringlib/localeutil.h"
9537
9538/**
9539 * InsertThousandsGrouping:
9540 * @writer: Unicode writer.
9541 * @n_buffer: Number of characters in @buffer.
9542 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9543 * @d_pos: Start of digits string.
9544 * @n_digits: The number of digits in the string, in which we want
9545 * to put the grouping chars.
9546 * @min_width: The minimum width of the digits in the output string.
9547 * Output will be zero-padded on the left to fill.
9548 * @grouping: see definition in localeconv().
9549 * @thousands_sep: see definition in localeconv().
9550 *
9551 * There are 2 modes: counting and filling. If @writer is NULL,
9552 * we are in counting mode, else filling mode.
9553 * If counting, the required buffer size is returned.
9554 * If filling, we know the buffer will be large enough, so we don't
9555 * need to pass in the buffer size.
9556 * Inserts thousand grouping characters (as defined by grouping and
9557 * thousands_sep) into @writer.
9558 *
9559 * Return value: -1 on error, number of characters otherwise.
9560 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009562_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009563 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009564 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009565 PyObject *digits,
9566 Py_ssize_t d_pos,
9567 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009568 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009569 const char *grouping,
9570 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009571 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572{
Xtreak3f7983a2019-01-07 20:39:14 +05309573 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009574 if (writer) {
9575 assert(digits != NULL);
9576 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009577 }
9578 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009579 assert(digits == NULL);
9580 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009581 }
Victor Stinner59423e32018-11-26 13:40:01 +01009582 assert(0 <= d_pos);
9583 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009584 assert(grouping != NULL);
9585
9586 if (digits != NULL) {
9587 if (PyUnicode_READY(digits) == -1) {
9588 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009589 }
Victor Stinner59423e32018-11-26 13:40:01 +01009590 }
9591 if (PyUnicode_READY(thousands_sep) == -1) {
9592 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009593 }
9594
Victor Stinner59423e32018-11-26 13:40:01 +01009595 Py_ssize_t count = 0;
9596 Py_ssize_t n_zeros;
9597 int loop_broken = 0;
9598 int use_separator = 0; /* First time through, don't append the
9599 separator. They only go between
9600 groups. */
9601 Py_ssize_t buffer_pos;
9602 Py_ssize_t digits_pos;
9603 Py_ssize_t len;
9604 Py_ssize_t n_chars;
9605 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9606 be looked at */
9607 /* A generator that returns all of the grouping widths, until it
9608 returns 0. */
9609 GroupGenerator groupgen;
9610 GroupGenerator_init(&groupgen, grouping);
9611 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9612
9613 /* if digits are not grouped, thousands separator
9614 should be an empty string */
9615 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9616
9617 digits_pos = d_pos + n_digits;
9618 if (writer) {
9619 buffer_pos = writer->pos + n_buffer;
9620 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9621 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 }
Victor Stinner59423e32018-11-26 13:40:01 +01009623 else {
9624 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009625 }
Victor Stinner59423e32018-11-26 13:40:01 +01009626
9627 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009628 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009629 }
Victor Stinner59423e32018-11-26 13:40:01 +01009630
9631 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9632 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9633 n_zeros = Py_MAX(0, len - remaining);
9634 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9635
9636 /* Use n_zero zero's and n_chars chars */
9637
9638 /* Count only, don't do anything. */
9639 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9640
9641 /* Copy into the writer. */
9642 InsertThousandsGrouping_fill(writer, &buffer_pos,
9643 digits, &digits_pos,
9644 n_chars, n_zeros,
9645 use_separator ? thousands_sep : NULL,
9646 thousands_sep_len, maxchar);
9647
9648 /* Use a separator next time. */
9649 use_separator = 1;
9650
9651 remaining -= n_chars;
9652 min_width -= len;
9653
9654 if (remaining <= 0 && min_width <= 0) {
9655 loop_broken = 1;
9656 break;
9657 }
9658 min_width -= thousands_sep_len;
9659 }
9660 if (!loop_broken) {
9661 /* We left the loop without using a break statement. */
9662
9663 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9664 n_zeros = Py_MAX(0, len - remaining);
9665 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9666
9667 /* Use n_zero zero's and n_chars chars */
9668 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9669
9670 /* Copy into the writer. */
9671 InsertThousandsGrouping_fill(writer, &buffer_pos,
9672 digits, &digits_pos,
9673 n_chars, n_zeros,
9674 use_separator ? thousands_sep : NULL,
9675 thousands_sep_len, maxchar);
9676 }
9677 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678}
9679
9680
Alexander Belopolsky40018472011-02-26 01:02:56 +00009681Py_ssize_t
9682PyUnicode_Count(PyObject *str,
9683 PyObject *substr,
9684 Py_ssize_t start,
9685 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009687 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009688 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009689 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009691
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009692 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009693 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009694
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009695 kind1 = PyUnicode_KIND(str);
9696 kind2 = PyUnicode_KIND(substr);
9697 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009698 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009699
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009700 len1 = PyUnicode_GET_LENGTH(str);
9701 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009703 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009704 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009705
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009706 buf1 = PyUnicode_DATA(str);
9707 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009708 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009709 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009710 if (!buf2)
9711 goto onError;
9712 }
9713
9714 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009716 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009717 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009718 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009719 buf2, len2, PY_SSIZE_T_MAX
9720 );
9721 else
9722 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009723 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009724 buf2, len2, PY_SSIZE_T_MAX
9725 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 break;
9727 case PyUnicode_2BYTE_KIND:
9728 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009729 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 buf2, len2, PY_SSIZE_T_MAX
9731 );
9732 break;
9733 case PyUnicode_4BYTE_KIND:
9734 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009735 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 buf2, len2, PY_SSIZE_T_MAX
9737 );
9738 break;
9739 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009740 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009742
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009743 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009744 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009745 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009746
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009749 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9750 if (kind2 != kind1)
9751 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753}
9754
Alexander Belopolsky40018472011-02-26 01:02:56 +00009755Py_ssize_t
9756PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009757 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009758 Py_ssize_t start,
9759 Py_ssize_t end,
9760 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009762 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009763 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009764
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009765 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766}
9767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768Py_ssize_t
9769PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9770 Py_ssize_t start, Py_ssize_t end,
9771 int direction)
9772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009774 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 if (PyUnicode_READY(str) == -1)
9776 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009777 len = PyUnicode_GET_LENGTH(str);
9778 ADJUST_INDICES(start, end, len);
9779 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009780 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009782 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9783 kind, end-start, ch, direction);
9784 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009786 else
9787 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009788}
9789
Alexander Belopolsky40018472011-02-26 01:02:56 +00009790static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009791tailmatch(PyObject *self,
9792 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009793 Py_ssize_t start,
9794 Py_ssize_t end,
9795 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 int kind_self;
9798 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009799 const void *data_self;
9800 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 Py_ssize_t offset;
9802 Py_ssize_t i;
9803 Py_ssize_t end_sub;
9804
9805 if (PyUnicode_READY(self) == -1 ||
9806 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009807 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9810 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009812 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009814 if (PyUnicode_GET_LENGTH(substring) == 0)
9815 return 1;
9816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 kind_self = PyUnicode_KIND(self);
9818 data_self = PyUnicode_DATA(self);
9819 kind_sub = PyUnicode_KIND(substring);
9820 data_sub = PyUnicode_DATA(substring);
9821 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9822
9823 if (direction > 0)
9824 offset = end;
9825 else
9826 offset = start;
9827
9828 if (PyUnicode_READ(kind_self, data_self, offset) ==
9829 PyUnicode_READ(kind_sub, data_sub, 0) &&
9830 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9831 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9832 /* If both are of the same kind, memcmp is sufficient */
9833 if (kind_self == kind_sub) {
9834 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009835 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836 data_sub,
9837 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009838 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009840 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 else {
9842 /* We do not need to compare 0 and len(substring)-1 because
9843 the if statement above ensured already that they are equal
9844 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 for (i = 1; i < end_sub; ++i) {
9846 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9847 PyUnicode_READ(kind_sub, data_sub, i))
9848 return 0;
9849 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009850 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852 }
9853
9854 return 0;
9855}
9856
Alexander Belopolsky40018472011-02-26 01:02:56 +00009857Py_ssize_t
9858PyUnicode_Tailmatch(PyObject *str,
9859 PyObject *substr,
9860 Py_ssize_t start,
9861 Py_ssize_t end,
9862 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009864 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009865 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009866
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009867 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868}
9869
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009870static PyObject *
9871ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009873 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009874 const char *data = PyUnicode_DATA(self);
9875 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009876 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009877
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009878 res = PyUnicode_New(len, 127);
9879 if (res == NULL)
9880 return NULL;
9881 resdata = PyUnicode_DATA(res);
9882 if (lower)
9883 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009885 _Py_bytes_upper(resdata, data, len);
9886 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887}
9888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009890handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009892 Py_ssize_t j;
9893 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009894 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009895 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009896
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009897 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9898
9899 where ! is a negation and \p{xxx} is a character with property xxx.
9900 */
9901 for (j = i - 1; j >= 0; j--) {
9902 c = PyUnicode_READ(kind, data, j);
9903 if (!_PyUnicode_IsCaseIgnorable(c))
9904 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009906 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9907 if (final_sigma) {
9908 for (j = i + 1; j < length; j++) {
9909 c = PyUnicode_READ(kind, data, j);
9910 if (!_PyUnicode_IsCaseIgnorable(c))
9911 break;
9912 }
9913 final_sigma = j == length || !_PyUnicode_IsCased(c);
9914 }
9915 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916}
9917
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009918static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009919lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009920 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009921{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009922 /* Obscure special case. */
9923 if (c == 0x3A3) {
9924 mapped[0] = handle_capital_sigma(kind, data, length, i);
9925 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009927 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928}
9929
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009930static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009931do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009933 Py_ssize_t i, k = 0;
9934 int n_res, j;
9935 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009936
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009937 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009938 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009939 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009940 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009941 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009943 for (i = 1; i < length; i++) {
9944 c = PyUnicode_READ(kind, data, i);
9945 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9946 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009947 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009948 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009949 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009950 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009951 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952}
9953
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009954static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009955do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009956 Py_ssize_t i, k = 0;
9957
9958 for (i = 0; i < length; i++) {
9959 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9960 int n_res, j;
9961 if (Py_UNICODE_ISUPPER(c)) {
9962 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9963 }
9964 else if (Py_UNICODE_ISLOWER(c)) {
9965 n_res = _PyUnicode_ToUpperFull(c, mapped);
9966 }
9967 else {
9968 n_res = 1;
9969 mapped[0] = c;
9970 }
9971 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009972 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009973 res[k++] = mapped[j];
9974 }
9975 }
9976 return k;
9977}
9978
9979static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009980do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009981 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009983 Py_ssize_t i, k = 0;
9984
9985 for (i = 0; i < length; i++) {
9986 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9987 int n_res, j;
9988 if (lower)
9989 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9990 else
9991 n_res = _PyUnicode_ToUpperFull(c, mapped);
9992 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009993 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009994 res[k++] = mapped[j];
9995 }
9996 }
9997 return k;
9998}
9999
10000static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010001do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010002{
10003 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10004}
10005
10006static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010007do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010008{
10009 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10010}
10011
Benjamin Petersone51757f2012-01-12 21:10:29 -050010012static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010013do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010014{
10015 Py_ssize_t i, k = 0;
10016
10017 for (i = 0; i < length; i++) {
10018 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10019 Py_UCS4 mapped[3];
10020 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10021 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010022 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010023 res[k++] = mapped[j];
10024 }
10025 }
10026 return k;
10027}
10028
10029static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010030do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010031{
10032 Py_ssize_t i, k = 0;
10033 int previous_is_cased;
10034
10035 previous_is_cased = 0;
10036 for (i = 0; i < length; i++) {
10037 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10038 Py_UCS4 mapped[3];
10039 int n_res, j;
10040
10041 if (previous_is_cased)
10042 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10043 else
10044 n_res = _PyUnicode_ToTitleFull(c, mapped);
10045
10046 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010047 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010048 res[k++] = mapped[j];
10049 }
10050
10051 previous_is_cased = _PyUnicode_IsCased(c);
10052 }
10053 return k;
10054}
10055
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010056static PyObject *
10057case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010058 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010059{
10060 PyObject *res = NULL;
10061 Py_ssize_t length, newlength = 0;
10062 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010063 const void *data;
10064 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010065 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10066
Benjamin Petersoneea48462012-01-16 14:28:50 -050010067 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010068
10069 kind = PyUnicode_KIND(self);
10070 data = PyUnicode_DATA(self);
10071 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010072 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010073 PyErr_SetString(PyExc_OverflowError, "string is too long");
10074 return NULL;
10075 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010076 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010077 if (tmp == NULL)
10078 return PyErr_NoMemory();
10079 newlength = perform(kind, data, length, tmp, &maxchar);
10080 res = PyUnicode_New(newlength, maxchar);
10081 if (res == NULL)
10082 goto leave;
10083 tmpend = tmp + newlength;
10084 outdata = PyUnicode_DATA(res);
10085 outkind = PyUnicode_KIND(res);
10086 switch (outkind) {
10087 case PyUnicode_1BYTE_KIND:
10088 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10089 break;
10090 case PyUnicode_2BYTE_KIND:
10091 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10092 break;
10093 case PyUnicode_4BYTE_KIND:
10094 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10095 break;
10096 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010097 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010098 }
10099 leave:
10100 PyMem_FREE(tmp);
10101 return res;
10102}
10103
Tim Peters8ce9f162004-08-27 01:49:32 +000010104PyObject *
10105PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010107 PyObject *res;
10108 PyObject *fseq;
10109 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010110 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010112 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010113 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010114 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010115 }
10116
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010117 /* NOTE: the following code can't call back into Python code,
10118 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010119 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010120
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010121 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010122 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010123 res = _PyUnicode_JoinArray(separator, items, seqlen);
10124 Py_DECREF(fseq);
10125 return res;
10126}
10127
10128PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010129_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010130{
10131 PyObject *res = NULL; /* the result */
10132 PyObject *sep = NULL;
10133 Py_ssize_t seplen;
10134 PyObject *item;
10135 Py_ssize_t sz, i, res_offset;
10136 Py_UCS4 maxchar;
10137 Py_UCS4 item_maxchar;
10138 int use_memcpy;
10139 unsigned char *res_data = NULL, *sep_data = NULL;
10140 PyObject *last_obj;
10141 unsigned int kind = 0;
10142
Tim Peters05eba1f2004-08-27 21:32:02 +000010143 /* If empty sequence, return u"". */
10144 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010145 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010146 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010147
Tim Peters05eba1f2004-08-27 21:32:02 +000010148 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010149 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010150 if (seqlen == 1) {
10151 if (PyUnicode_CheckExact(items[0])) {
10152 res = items[0];
10153 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010154 return res;
10155 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010156 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010157 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010158 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010159 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010160 /* Set up sep and seplen */
10161 if (separator == NULL) {
10162 /* fall back to a blank space separator */
10163 sep = PyUnicode_FromOrdinal(' ');
10164 if (!sep)
10165 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010166 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010167 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010168 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010169 else {
10170 if (!PyUnicode_Check(separator)) {
10171 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010172 "separator: expected str instance,"
10173 " %.80s found",
10174 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010175 goto onError;
10176 }
10177 if (PyUnicode_READY(separator))
10178 goto onError;
10179 sep = separator;
10180 seplen = PyUnicode_GET_LENGTH(separator);
10181 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10182 /* inc refcount to keep this code path symmetric with the
10183 above case of a blank separator */
10184 Py_INCREF(sep);
10185 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010186 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010187 }
10188
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010189 /* There are at least two things to join, or else we have a subclass
10190 * of str in the sequence.
10191 * Do a pre-pass to figure out the total amount of space we'll
10192 * need (sz), and see whether all argument are strings.
10193 */
10194 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010195#ifdef Py_DEBUG
10196 use_memcpy = 0;
10197#else
10198 use_memcpy = 1;
10199#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010200 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010201 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010202 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010203 if (!PyUnicode_Check(item)) {
10204 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010205 "sequence item %zd: expected str instance,"
10206 " %.80s found",
10207 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010208 goto onError;
10209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 if (PyUnicode_READY(item) == -1)
10211 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010212 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010214 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010215 if (i != 0) {
10216 add_sz += seplen;
10217 }
10218 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010219 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010220 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010221 goto onError;
10222 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010223 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010224 if (use_memcpy && last_obj != NULL) {
10225 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10226 use_memcpy = 0;
10227 }
10228 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010229 }
Tim Petersced69f82003-09-16 20:30:58 +000010230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010232 if (res == NULL)
10233 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010234
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010235 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010236#ifdef Py_DEBUG
10237 use_memcpy = 0;
10238#else
10239 if (use_memcpy) {
10240 res_data = PyUnicode_1BYTE_DATA(res);
10241 kind = PyUnicode_KIND(res);
10242 if (seplen != 0)
10243 sep_data = PyUnicode_1BYTE_DATA(sep);
10244 }
10245#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010246 if (use_memcpy) {
10247 for (i = 0; i < seqlen; ++i) {
10248 Py_ssize_t itemlen;
10249 item = items[i];
10250
10251 /* Copy item, and maybe the separator. */
10252 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010253 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010254 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010255 kind * seplen);
10256 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010257 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010258
10259 itemlen = PyUnicode_GET_LENGTH(item);
10260 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010261 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010262 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010263 kind * itemlen);
10264 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010265 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010266 }
10267 assert(res_data == PyUnicode_1BYTE_DATA(res)
10268 + kind * PyUnicode_GET_LENGTH(res));
10269 }
10270 else {
10271 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10272 Py_ssize_t itemlen;
10273 item = items[i];
10274
10275 /* Copy item, and maybe the separator. */
10276 if (i && seplen != 0) {
10277 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10278 res_offset += seplen;
10279 }
10280
10281 itemlen = PyUnicode_GET_LENGTH(item);
10282 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010283 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010284 res_offset += itemlen;
10285 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010286 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010287 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010288 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010291 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010293
Benjamin Peterson29060642009-01-31 22:14:21 +000010294 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010296 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297 return NULL;
10298}
10299
Victor Stinnerd3f08822012-05-29 12:57:52 +020010300void
10301_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10302 Py_UCS4 fill_char)
10303{
10304 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010305 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010306 assert(PyUnicode_IS_READY(unicode));
10307 assert(unicode_modifiable(unicode));
10308 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10309 assert(start >= 0);
10310 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010311 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010312}
10313
Victor Stinner3fe55312012-01-04 00:33:50 +010010314Py_ssize_t
10315PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10316 Py_UCS4 fill_char)
10317{
10318 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010319
10320 if (!PyUnicode_Check(unicode)) {
10321 PyErr_BadInternalCall();
10322 return -1;
10323 }
10324 if (PyUnicode_READY(unicode) == -1)
10325 return -1;
10326 if (unicode_check_modifiable(unicode))
10327 return -1;
10328
Victor Stinnerd3f08822012-05-29 12:57:52 +020010329 if (start < 0) {
10330 PyErr_SetString(PyExc_IndexError, "string index out of range");
10331 return -1;
10332 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010333 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10334 PyErr_SetString(PyExc_ValueError,
10335 "fill character is bigger than "
10336 "the string maximum character");
10337 return -1;
10338 }
10339
10340 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10341 length = Py_MIN(maxlen, length);
10342 if (length <= 0)
10343 return 0;
10344
Victor Stinnerd3f08822012-05-29 12:57:52 +020010345 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010346 return length;
10347}
10348
Victor Stinner9310abb2011-10-05 00:59:23 +020010349static PyObject *
10350pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010351 Py_ssize_t left,
10352 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 PyObject *u;
10356 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010357 int kind;
10358 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359
10360 if (left < 0)
10361 left = 0;
10362 if (right < 0)
10363 right = 0;
10364
Victor Stinnerc4b49542011-12-11 22:44:26 +010010365 if (left == 0 && right == 0)
10366 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10369 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010370 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10371 return NULL;
10372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010374 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010376 if (!u)
10377 return NULL;
10378
10379 kind = PyUnicode_KIND(u);
10380 data = PyUnicode_DATA(u);
10381 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010382 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010383 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010384 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010385 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010386 assert(_PyUnicode_CheckConsistency(u, 1));
10387 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388}
10389
Alexander Belopolsky40018472011-02-26 01:02:56 +000010390PyObject *
10391PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010395 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397
Benjamin Petersonead6b532011-12-20 17:23:42 -060010398 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010400 if (PyUnicode_IS_ASCII(string))
10401 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010402 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010403 PyUnicode_GET_LENGTH(string), keepends);
10404 else
10405 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010406 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010407 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 break;
10409 case PyUnicode_2BYTE_KIND:
10410 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010411 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 PyUnicode_GET_LENGTH(string), keepends);
10413 break;
10414 case PyUnicode_4BYTE_KIND:
10415 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010416 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 PyUnicode_GET_LENGTH(string), keepends);
10418 break;
10419 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010420 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010422 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010423}
10424
Alexander Belopolsky40018472011-02-26 01:02:56 +000010425static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010426split(PyObject *self,
10427 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010428 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010430 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010431 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 Py_ssize_t len1, len2;
10433 PyObject* out;
10434
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010436 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 if (PyUnicode_READY(self) == -1)
10439 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010442 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010444 if (PyUnicode_IS_ASCII(self))
10445 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010446 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010447 PyUnicode_GET_LENGTH(self), maxcount
10448 );
10449 else
10450 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010451 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010452 PyUnicode_GET_LENGTH(self), maxcount
10453 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 case PyUnicode_2BYTE_KIND:
10455 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010456 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 PyUnicode_GET_LENGTH(self), maxcount
10458 );
10459 case PyUnicode_4BYTE_KIND:
10460 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010461 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 PyUnicode_GET_LENGTH(self), maxcount
10463 );
10464 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010465 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 }
10467
10468 if (PyUnicode_READY(substring) == -1)
10469 return NULL;
10470
10471 kind1 = PyUnicode_KIND(self);
10472 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 len1 = PyUnicode_GET_LENGTH(self);
10474 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010475 if (kind1 < kind2 || len1 < len2) {
10476 out = PyList_New(1);
10477 if (out == NULL)
10478 return NULL;
10479 Py_INCREF(self);
10480 PyList_SET_ITEM(out, 0, self);
10481 return out;
10482 }
10483 buf1 = PyUnicode_DATA(self);
10484 buf2 = PyUnicode_DATA(substring);
10485 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010486 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010487 if (!buf2)
10488 return NULL;
10489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010491 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010493 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10494 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010495 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010496 else
10497 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010498 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 break;
10500 case PyUnicode_2BYTE_KIND:
10501 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010502 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 break;
10504 case PyUnicode_4BYTE_KIND:
10505 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010506 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 break;
10508 default:
10509 out = NULL;
10510 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010511 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010512 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010513 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515}
10516
Alexander Belopolsky40018472011-02-26 01:02:56 +000010517static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010518rsplit(PyObject *self,
10519 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010520 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010521{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010522 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010523 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 Py_ssize_t len1, len2;
10525 PyObject* out;
10526
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010527 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010528 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 if (PyUnicode_READY(self) == -1)
10531 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010534 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010536 if (PyUnicode_IS_ASCII(self))
10537 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010538 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010539 PyUnicode_GET_LENGTH(self), maxcount
10540 );
10541 else
10542 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010543 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010544 PyUnicode_GET_LENGTH(self), maxcount
10545 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 case PyUnicode_2BYTE_KIND:
10547 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010548 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 PyUnicode_GET_LENGTH(self), maxcount
10550 );
10551 case PyUnicode_4BYTE_KIND:
10552 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010553 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 PyUnicode_GET_LENGTH(self), maxcount
10555 );
10556 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010557 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 }
10559
10560 if (PyUnicode_READY(substring) == -1)
10561 return NULL;
10562
10563 kind1 = PyUnicode_KIND(self);
10564 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 len1 = PyUnicode_GET_LENGTH(self);
10566 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010567 if (kind1 < kind2 || len1 < len2) {
10568 out = PyList_New(1);
10569 if (out == NULL)
10570 return NULL;
10571 Py_INCREF(self);
10572 PyList_SET_ITEM(out, 0, self);
10573 return out;
10574 }
10575 buf1 = PyUnicode_DATA(self);
10576 buf2 = PyUnicode_DATA(substring);
10577 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010578 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010579 if (!buf2)
10580 return NULL;
10581 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010583 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010585 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10586 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010587 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010588 else
10589 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010590 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 break;
10592 case PyUnicode_2BYTE_KIND:
10593 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010594 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 break;
10596 case PyUnicode_4BYTE_KIND:
10597 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010598 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 break;
10600 default:
10601 out = NULL;
10602 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010603 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010604 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010605 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 return out;
10607}
10608
10609static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010610anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10611 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010613 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010615 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10616 return asciilib_find(buf1, len1, buf2, len2, offset);
10617 else
10618 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 case PyUnicode_2BYTE_KIND:
10620 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10621 case PyUnicode_4BYTE_KIND:
10622 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10623 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010624 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625}
10626
10627static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010628anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10629 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010631 switch (kind) {
10632 case PyUnicode_1BYTE_KIND:
10633 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10634 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10635 else
10636 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10637 case PyUnicode_2BYTE_KIND:
10638 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10639 case PyUnicode_4BYTE_KIND:
10640 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10641 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010642 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010643}
10644
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010645static void
10646replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10647 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10648{
10649 int kind = PyUnicode_KIND(u);
10650 void *data = PyUnicode_DATA(u);
10651 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10652 if (kind == PyUnicode_1BYTE_KIND) {
10653 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10654 (Py_UCS1 *)data + len,
10655 u1, u2, maxcount);
10656 }
10657 else if (kind == PyUnicode_2BYTE_KIND) {
10658 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10659 (Py_UCS2 *)data + len,
10660 u1, u2, maxcount);
10661 }
10662 else {
10663 assert(kind == PyUnicode_4BYTE_KIND);
10664 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10665 (Py_UCS4 *)data + len,
10666 u1, u2, maxcount);
10667 }
10668}
10669
Alexander Belopolsky40018472011-02-26 01:02:56 +000010670static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671replace(PyObject *self, PyObject *str1,
10672 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010675 const char *sbuf = PyUnicode_DATA(self);
10676 const void *buf1 = PyUnicode_DATA(str1);
10677 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 int srelease = 0, release1 = 0, release2 = 0;
10679 int skind = PyUnicode_KIND(self);
10680 int kind1 = PyUnicode_KIND(str1);
10681 int kind2 = PyUnicode_KIND(str2);
10682 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10683 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10684 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010685 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010686 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010687
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010688 if (slen < len1)
10689 goto nothing;
10690
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010692 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010693 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010694 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695
Victor Stinner59de0ee2011-10-07 10:01:28 +020010696 if (str1 == str2)
10697 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698
Victor Stinner49a0a212011-10-12 23:46:10 +020010699 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010700 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10701 if (maxchar < maxchar_str1)
10702 /* substring too wide to be present */
10703 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010704 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10705 /* Replacing str1 with str2 may cause a maxchar reduction in the
10706 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010707 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010708 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010711 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010713 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010715 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010716 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010717 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010718
Victor Stinner69ed0f42013-04-09 21:48:24 +020010719 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010720 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010721 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010722 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010723 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010725 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010727
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010728 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10729 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010730 }
10731 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010732 int rkind = skind;
10733 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010734 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 if (kind1 < rkind) {
10737 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010738 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 if (!buf1) goto error;
10740 release1 = 1;
10741 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010742 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010743 if (i < 0)
10744 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 if (rkind > kind2) {
10746 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010747 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 if (!buf2) goto error;
10749 release2 = 1;
10750 }
10751 else if (rkind < kind2) {
10752 /* widen self and buf1 */
10753 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010754 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010755 assert(buf1 != PyUnicode_DATA(str1));
10756 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010757 buf1 = PyUnicode_DATA(str1);
10758 release1 = 0;
10759 }
10760 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 if (!sbuf) goto error;
10762 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010763 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764 if (!buf1) goto error;
10765 release1 = 1;
10766 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010767 u = PyUnicode_New(slen, maxchar);
10768 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010770 assert(PyUnicode_KIND(u) == rkind);
10771 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010772
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010773 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010774 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010775 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010777 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010779
10780 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010781 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010782 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010783 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010784 if (i == -1)
10785 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010786 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010788 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010792 }
10793 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010795 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 int rkind = skind;
10797 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010800 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010801 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 if (!buf1) goto error;
10803 release1 = 1;
10804 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010805 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010806 if (n == 0)
10807 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010809 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010810 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 if (!buf2) goto error;
10812 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010815 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010817 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818 if (!sbuf) goto error;
10819 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010820 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010821 assert(buf1 != PyUnicode_DATA(str1));
10822 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010823 buf1 = PyUnicode_DATA(str1);
10824 release1 = 0;
10825 }
10826 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 if (!buf1) goto error;
10828 release1 = 1;
10829 }
10830 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10831 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010832 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 PyErr_SetString(PyExc_OverflowError,
10834 "replace string is too long");
10835 goto error;
10836 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010837 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010838 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020010839 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020010840 goto done;
10841 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010842 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 PyErr_SetString(PyExc_OverflowError,
10844 "replace string is too long");
10845 goto error;
10846 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010847 u = PyUnicode_New(new_size, maxchar);
10848 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010850 assert(PyUnicode_KIND(u) == rkind);
10851 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010852 ires = i = 0;
10853 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010854 while (n-- > 0) {
10855 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010856 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010857 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010858 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010859 if (j == -1)
10860 break;
10861 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010862 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010863 memcpy(res + rkind * ires,
10864 sbuf + rkind * i,
10865 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010867 }
10868 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010870 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010871 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010872 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010876 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010878 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010879 memcpy(res + rkind * ires,
10880 sbuf + rkind * i,
10881 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010882 }
10883 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010884 /* interleave */
10885 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010886 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010887 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010888 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010889 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010890 if (--n <= 0)
10891 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010892 memcpy(res + rkind * ires,
10893 sbuf + rkind * i,
10894 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895 ires++;
10896 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010897 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010898 memcpy(res + rkind * ires,
10899 sbuf + rkind * i,
10900 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010901 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010902 }
10903
10904 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010905 unicode_adjust_maxchar(&u);
10906 if (u == NULL)
10907 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010909
10910 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010911 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10912 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10913 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010915 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010917 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010919 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010920 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010922
Benjamin Peterson29060642009-01-31 22:14:21 +000010923 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010924 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010925 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10926 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10927 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010929 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010931 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010933 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010934 return unicode_result_unchanged(self);
10935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010937 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10938 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10939 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10940 if (srelease)
10941 PyMem_FREE((void *)sbuf);
10942 if (release1)
10943 PyMem_FREE((void *)buf1);
10944 if (release2)
10945 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947}
10948
10949/* --- Unicode Object Methods --------------------------------------------- */
10950
INADA Naoki3ae20562017-01-16 20:41:20 +090010951/*[clinic input]
10952str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953
INADA Naoki3ae20562017-01-16 20:41:20 +090010954Return a version of the string where each word is titlecased.
10955
10956More specifically, words start with uppercased characters and all remaining
10957cased characters have lower case.
10958[clinic start generated code]*/
10959
10960static PyObject *
10961unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010962/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010964 if (PyUnicode_READY(self) == -1)
10965 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010966 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967}
10968
INADA Naoki3ae20562017-01-16 20:41:20 +090010969/*[clinic input]
10970str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971
INADA Naoki3ae20562017-01-16 20:41:20 +090010972Return a capitalized version of the string.
10973
10974More specifically, make the first character have upper case and the rest lower
10975case.
10976[clinic start generated code]*/
10977
10978static PyObject *
10979unicode_capitalize_impl(PyObject *self)
10980/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010982 if (PyUnicode_READY(self) == -1)
10983 return NULL;
10984 if (PyUnicode_GET_LENGTH(self) == 0)
10985 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010986 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987}
10988
INADA Naoki3ae20562017-01-16 20:41:20 +090010989/*[clinic input]
10990str.casefold as unicode_casefold
10991
10992Return a version of the string suitable for caseless comparisons.
10993[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010994
10995static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010996unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010997/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010998{
10999 if (PyUnicode_READY(self) == -1)
11000 return NULL;
11001 if (PyUnicode_IS_ASCII(self))
11002 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011003 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011004}
11005
11006
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011007/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011008
11009static int
11010convert_uc(PyObject *obj, void *addr)
11011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011013
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011014 if (!PyUnicode_Check(obj)) {
11015 PyErr_Format(PyExc_TypeError,
11016 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011017 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011018 return 0;
11019 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011020 if (PyUnicode_READY(obj) < 0)
11021 return 0;
11022 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011023 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011024 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011025 return 0;
11026 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011027 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011028 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011029}
11030
INADA Naoki3ae20562017-01-16 20:41:20 +090011031/*[clinic input]
11032str.center as unicode_center
11033
11034 width: Py_ssize_t
11035 fillchar: Py_UCS4 = ' '
11036 /
11037
11038Return a centered string of length width.
11039
11040Padding is done using the specified fill character (default is a space).
11041[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042
11043static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011044unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11045/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011047 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048
Benjamin Petersonbac79492012-01-14 13:34:47 -050011049 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050 return NULL;
11051
Victor Stinnerc4b49542011-12-11 22:44:26 +010011052 if (PyUnicode_GET_LENGTH(self) >= width)
11053 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054
Victor Stinnerc4b49542011-12-11 22:44:26 +010011055 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056 left = marg / 2 + (marg & width & 1);
11057
Victor Stinner9310abb2011-10-05 00:59:23 +020011058 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059}
11060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061/* This function assumes that str1 and str2 are readied by the caller. */
11062
Marc-André Lemburge5034372000-08-08 08:04:29 +000011063static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011064unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011065{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011066#define COMPARE(TYPE1, TYPE2) \
11067 do { \
11068 TYPE1* p1 = (TYPE1 *)data1; \
11069 TYPE2* p2 = (TYPE2 *)data2; \
11070 TYPE1* end = p1 + len; \
11071 Py_UCS4 c1, c2; \
11072 for (; p1 != end; p1++, p2++) { \
11073 c1 = *p1; \
11074 c2 = *p2; \
11075 if (c1 != c2) \
11076 return (c1 < c2) ? -1 : 1; \
11077 } \
11078 } \
11079 while (0)
11080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011082 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011083 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 kind1 = PyUnicode_KIND(str1);
11086 kind2 = PyUnicode_KIND(str2);
11087 data1 = PyUnicode_DATA(str1);
11088 data2 = PyUnicode_DATA(str2);
11089 len1 = PyUnicode_GET_LENGTH(str1);
11090 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011091 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011092
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011093 switch(kind1) {
11094 case PyUnicode_1BYTE_KIND:
11095 {
11096 switch(kind2) {
11097 case PyUnicode_1BYTE_KIND:
11098 {
11099 int cmp = memcmp(data1, data2, len);
11100 /* normalize result of memcmp() into the range [-1; 1] */
11101 if (cmp < 0)
11102 return -1;
11103 if (cmp > 0)
11104 return 1;
11105 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011106 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011107 case PyUnicode_2BYTE_KIND:
11108 COMPARE(Py_UCS1, Py_UCS2);
11109 break;
11110 case PyUnicode_4BYTE_KIND:
11111 COMPARE(Py_UCS1, Py_UCS4);
11112 break;
11113 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011114 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011115 }
11116 break;
11117 }
11118 case PyUnicode_2BYTE_KIND:
11119 {
11120 switch(kind2) {
11121 case PyUnicode_1BYTE_KIND:
11122 COMPARE(Py_UCS2, Py_UCS1);
11123 break;
11124 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011125 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011126 COMPARE(Py_UCS2, Py_UCS2);
11127 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011128 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011129 case PyUnicode_4BYTE_KIND:
11130 COMPARE(Py_UCS2, Py_UCS4);
11131 break;
11132 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011133 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011134 }
11135 break;
11136 }
11137 case PyUnicode_4BYTE_KIND:
11138 {
11139 switch(kind2) {
11140 case PyUnicode_1BYTE_KIND:
11141 COMPARE(Py_UCS4, Py_UCS1);
11142 break;
11143 case PyUnicode_2BYTE_KIND:
11144 COMPARE(Py_UCS4, Py_UCS2);
11145 break;
11146 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011147 {
11148#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11149 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11150 /* normalize result of wmemcmp() into the range [-1; 1] */
11151 if (cmp < 0)
11152 return -1;
11153 if (cmp > 0)
11154 return 1;
11155#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011156 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011157#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011158 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011159 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011160 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011161 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011162 }
11163 break;
11164 }
11165 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011166 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011167 }
11168
Victor Stinner770e19e2012-10-04 22:59:45 +020011169 if (len1 == len2)
11170 return 0;
11171 if (len1 < len2)
11172 return -1;
11173 else
11174 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011175
11176#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011177}
11178
Benjamin Peterson621b4302016-09-09 13:54:34 -070011179static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011180unicode_compare_eq(PyObject *str1, PyObject *str2)
11181{
11182 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011183 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011184 Py_ssize_t len;
11185 int cmp;
11186
Victor Stinnere5567ad2012-10-23 02:48:49 +020011187 len = PyUnicode_GET_LENGTH(str1);
11188 if (PyUnicode_GET_LENGTH(str2) != len)
11189 return 0;
11190 kind = PyUnicode_KIND(str1);
11191 if (PyUnicode_KIND(str2) != kind)
11192 return 0;
11193 data1 = PyUnicode_DATA(str1);
11194 data2 = PyUnicode_DATA(str2);
11195
11196 cmp = memcmp(data1, data2, len * kind);
11197 return (cmp == 0);
11198}
11199
11200
Alexander Belopolsky40018472011-02-26 01:02:56 +000011201int
11202PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11205 if (PyUnicode_READY(left) == -1 ||
11206 PyUnicode_READY(right) == -1)
11207 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011208
11209 /* a string is equal to itself */
11210 if (left == right)
11211 return 0;
11212
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011213 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011215 PyErr_Format(PyExc_TypeError,
11216 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011217 Py_TYPE(left)->tp_name,
11218 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219 return -1;
11220}
11221
Martin v. Löwis5b222132007-06-10 09:51:05 +000011222int
11223PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11224{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 Py_ssize_t i;
11226 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011228 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011229
Victor Stinner910337b2011-10-03 03:20:16 +020011230 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011231 if (!PyUnicode_IS_READY(uni)) {
11232 const wchar_t *ws = _PyUnicode_WSTR(uni);
11233 /* Compare Unicode string and source character set string */
11234 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11235 if (chr != ustr[i])
11236 return (chr < ustr[i]) ? -1 : 1;
11237 }
11238 /* This check keeps Python strings that end in '\0' from comparing equal
11239 to C strings identical up to that point. */
11240 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11241 return 1; /* uni is longer */
11242 if (ustr[i])
11243 return -1; /* str is longer */
11244 return 0;
11245 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011247 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011248 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011249 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011250 size_t len, len2 = strlen(str);
11251 int cmp;
11252
11253 len = Py_MIN(len1, len2);
11254 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011255 if (cmp != 0) {
11256 if (cmp < 0)
11257 return -1;
11258 else
11259 return 1;
11260 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011261 if (len1 > len2)
11262 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011263 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011264 return -1; /* str is longer */
11265 return 0;
11266 }
11267 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011268 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011269 /* Compare Unicode string and source character set string */
11270 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011271 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011272 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11273 /* This check keeps Python strings that end in '\0' from comparing equal
11274 to C strings identical up to that point. */
11275 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11276 return 1; /* uni is longer */
11277 if (str[i])
11278 return -1; /* str is longer */
11279 return 0;
11280 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011281}
11282
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011283static int
11284non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11285{
11286 size_t i, len;
11287 const wchar_t *p;
11288 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11289 if (strlen(str) != len)
11290 return 0;
11291 p = _PyUnicode_WSTR(unicode);
11292 assert(p);
11293 for (i = 0; i < len; i++) {
11294 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011295 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011296 return 0;
11297 }
11298 return 1;
11299}
11300
11301int
11302_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11303{
11304 size_t len;
11305 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011306 assert(str);
11307#ifndef NDEBUG
11308 for (const char *p = str; *p; p++) {
11309 assert((unsigned char)*p < 128);
11310 }
11311#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011312 if (PyUnicode_READY(unicode) == -1) {
11313 /* Memory error or bad data */
11314 PyErr_Clear();
11315 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11316 }
11317 if (!PyUnicode_IS_ASCII(unicode))
11318 return 0;
11319 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11320 return strlen(str) == len &&
11321 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11322}
11323
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011324int
11325_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11326{
11327 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011328
11329 assert(_PyUnicode_CHECK(left));
11330 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011331#ifndef NDEBUG
11332 for (const char *p = right->string; *p; p++) {
11333 assert((unsigned char)*p < 128);
11334 }
11335#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011336
11337 if (PyUnicode_READY(left) == -1) {
11338 /* memory error or bad data */
11339 PyErr_Clear();
11340 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11341 }
11342
11343 if (!PyUnicode_IS_ASCII(left))
11344 return 0;
11345
11346 right_uni = _PyUnicode_FromId(right); /* borrowed */
11347 if (right_uni == NULL) {
11348 /* memory error or bad data */
11349 PyErr_Clear();
11350 return _PyUnicode_EqualToASCIIString(left, right->string);
11351 }
11352
11353 if (left == right_uni)
11354 return 1;
11355
11356 if (PyUnicode_CHECK_INTERNED(left))
11357 return 0;
11358
Victor Stinner607b1022020-05-05 18:50:30 +020011359#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011360 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011361 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011362 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11363 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011364#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011365
11366 return unicode_compare_eq(left, right_uni);
11367}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011368
Alexander Belopolsky40018472011-02-26 01:02:56 +000011369PyObject *
11370PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011371{
11372 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011373
Victor Stinnere5567ad2012-10-23 02:48:49 +020011374 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11375 Py_RETURN_NOTIMPLEMENTED;
11376
11377 if (PyUnicode_READY(left) == -1 ||
11378 PyUnicode_READY(right) == -1)
11379 return NULL;
11380
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011381 if (left == right) {
11382 switch (op) {
11383 case Py_EQ:
11384 case Py_LE:
11385 case Py_GE:
11386 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011387 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011388 case Py_NE:
11389 case Py_LT:
11390 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011391 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011392 default:
11393 PyErr_BadArgument();
11394 return NULL;
11395 }
11396 }
11397 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011398 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011399 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011400 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011401 }
11402 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011403 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011404 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011405 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011406}
11407
Alexander Belopolsky40018472011-02-26 01:02:56 +000011408int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011409_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11410{
11411 return unicode_eq(aa, bb);
11412}
11413
11414int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011415PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011416{
Victor Stinner77282cb2013-04-14 19:22:47 +020011417 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011418 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011420 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011421
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011422 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011423 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011424 "'in <string>' requires string as left operand, not %.100s",
11425 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011426 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011427 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011428 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011429 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011430 if (ensure_unicode(str) < 0)
11431 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011434 kind2 = PyUnicode_KIND(substr);
11435 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011436 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011438 len2 = PyUnicode_GET_LENGTH(substr);
11439 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011440 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011441 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011442 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011443 if (len2 == 1) {
11444 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11445 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011446 return result;
11447 }
11448 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011449 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011450 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011451 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011452 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453
Victor Stinner77282cb2013-04-14 19:22:47 +020011454 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 case PyUnicode_1BYTE_KIND:
11456 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11457 break;
11458 case PyUnicode_2BYTE_KIND:
11459 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11460 break;
11461 case PyUnicode_4BYTE_KIND:
11462 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11463 break;
11464 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011465 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011467
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011468 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011469 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011470 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471
Guido van Rossum403d68b2000-03-13 15:55:09 +000011472 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011473}
11474
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475/* Concat to string or Unicode object giving a new Unicode object. */
11476
Alexander Belopolsky40018472011-02-26 01:02:56 +000011477PyObject *
11478PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011480 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011481 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011482 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011484 if (ensure_unicode(left) < 0)
11485 return NULL;
11486
11487 if (!PyUnicode_Check(right)) {
11488 PyErr_Format(PyExc_TypeError,
11489 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011490 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011491 return NULL;
11492 }
11493 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
11496 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011497 PyObject *empty = unicode_get_empty(); // Borrowed reference
11498 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011499 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011500 }
11501 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011502 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011505 left_len = PyUnicode_GET_LENGTH(left);
11506 right_len = PyUnicode_GET_LENGTH(right);
11507 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011508 PyErr_SetString(PyExc_OverflowError,
11509 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011510 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011511 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011512 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011513
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011514 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11515 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011516 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011519 result = PyUnicode_New(new_len, maxchar);
11520 if (result == NULL)
11521 return NULL;
11522 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11523 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11524 assert(_PyUnicode_CheckConsistency(result, 1));
11525 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526}
11527
Walter Dörwald1ab83302007-05-18 17:15:44 +000011528void
Victor Stinner23e56682011-10-03 03:54:37 +020011529PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011530{
Victor Stinner23e56682011-10-03 03:54:37 +020011531 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011532 Py_UCS4 maxchar, maxchar2;
11533 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011534
11535 if (p_left == NULL) {
11536 if (!PyErr_Occurred())
11537 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011538 return;
11539 }
Victor Stinner23e56682011-10-03 03:54:37 +020011540 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011541 if (right == NULL || left == NULL
11542 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011543 if (!PyErr_Occurred())
11544 PyErr_BadInternalCall();
11545 goto error;
11546 }
11547
Benjamin Petersonbac79492012-01-14 13:34:47 -050011548 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011549 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011550 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011551 goto error;
11552
Victor Stinner488fa492011-12-12 00:01:39 +010011553 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011554 PyObject *empty = unicode_get_empty(); // Borrowed reference
11555 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011556 Py_DECREF(left);
11557 Py_INCREF(right);
11558 *p_left = right;
11559 return;
11560 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011561 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011562 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011563 }
Victor Stinner488fa492011-12-12 00:01:39 +010011564
11565 left_len = PyUnicode_GET_LENGTH(left);
11566 right_len = PyUnicode_GET_LENGTH(right);
11567 if (left_len > PY_SSIZE_T_MAX - right_len) {
11568 PyErr_SetString(PyExc_OverflowError,
11569 "strings are too large to concat");
11570 goto error;
11571 }
11572 new_len = left_len + right_len;
11573
11574 if (unicode_modifiable(left)
11575 && PyUnicode_CheckExact(right)
11576 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011577 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11578 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011579 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011580 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011581 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11582 {
11583 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011584 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011585 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011586
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011587 /* copy 'right' into the newly allocated area of 'left' */
11588 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011589 }
Victor Stinner488fa492011-12-12 00:01:39 +010011590 else {
11591 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11592 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011593 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011594
Victor Stinner488fa492011-12-12 00:01:39 +010011595 /* Concat the two Unicode strings */
11596 res = PyUnicode_New(new_len, maxchar);
11597 if (res == NULL)
11598 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011599 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11600 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011601 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011602 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011603 }
11604 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011605 return;
11606
11607error:
Victor Stinner488fa492011-12-12 00:01:39 +010011608 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011609}
11610
11611void
11612PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11613{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011614 PyUnicode_Append(pleft, right);
11615 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011616}
11617
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011618/*
11619Wraps stringlib_parse_args_finds() and additionally ensures that the
11620first argument is a unicode object.
11621*/
11622
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011623static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011624parse_args_finds_unicode(const char * function_name, PyObject *args,
11625 PyObject **substring,
11626 Py_ssize_t *start, Py_ssize_t *end)
11627{
11628 if(stringlib_parse_args_finds(function_name, args, substring,
11629 start, end)) {
11630 if (ensure_unicode(*substring) < 0)
11631 return 0;
11632 return 1;
11633 }
11634 return 0;
11635}
11636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011637PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011638 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011640Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011641string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011642interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643
11644static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011645unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011647 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011648 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011649 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011651 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011652 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011655 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011656 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 kind1 = PyUnicode_KIND(self);
11659 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011660 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011661 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 len1 = PyUnicode_GET_LENGTH(self);
11664 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011666 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011667 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011668
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011669 buf1 = PyUnicode_DATA(self);
11670 buf2 = PyUnicode_DATA(substring);
11671 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011672 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011673 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011674 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011675 }
11676 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 case PyUnicode_1BYTE_KIND:
11678 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011679 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 buf2, len2, PY_SSIZE_T_MAX
11681 );
11682 break;
11683 case PyUnicode_2BYTE_KIND:
11684 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011685 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 buf2, len2, PY_SSIZE_T_MAX
11687 );
11688 break;
11689 case PyUnicode_4BYTE_KIND:
11690 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011691 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 buf2, len2, PY_SSIZE_T_MAX
11693 );
11694 break;
11695 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011696 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 }
11698
11699 result = PyLong_FromSsize_t(iresult);
11700
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011701 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011702 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011703 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705 return result;
11706}
11707
INADA Naoki3ae20562017-01-16 20:41:20 +090011708/*[clinic input]
11709str.encode as unicode_encode
11710
11711 encoding: str(c_default="NULL") = 'utf-8'
11712 The encoding in which to encode the string.
11713 errors: str(c_default="NULL") = 'strict'
11714 The error handling scheme to use for encoding errors.
11715 The default is 'strict' meaning that encoding errors raise a
11716 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11717 'xmlcharrefreplace' as well as any other name registered with
11718 codecs.register_error that can handle UnicodeEncodeErrors.
11719
11720Encode the string using the codec registered for encoding.
11721[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722
11723static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011724unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011725/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011727 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011728}
11729
INADA Naoki3ae20562017-01-16 20:41:20 +090011730/*[clinic input]
11731str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732
INADA Naoki3ae20562017-01-16 20:41:20 +090011733 tabsize: int = 8
11734
11735Return a copy where all tab characters are expanded using spaces.
11736
11737If tabsize is not given, a tab size of 8 characters is assumed.
11738[clinic start generated code]*/
11739
11740static PyObject *
11741unicode_expandtabs_impl(PyObject *self, int tabsize)
11742/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011744 Py_ssize_t i, j, line_pos, src_len, incr;
11745 Py_UCS4 ch;
11746 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011747 const void *src_data;
11748 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011749 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011750 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751
Antoine Pitrou22425222011-10-04 19:10:51 +020011752 if (PyUnicode_READY(self) == -1)
11753 return NULL;
11754
Thomas Wouters7e474022000-07-16 12:04:32 +000011755 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011756 src_len = PyUnicode_GET_LENGTH(self);
11757 i = j = line_pos = 0;
11758 kind = PyUnicode_KIND(self);
11759 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011760 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011761 for (; i < src_len; i++) {
11762 ch = PyUnicode_READ(kind, src_data, i);
11763 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011764 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011765 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011766 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011767 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011768 goto overflow;
11769 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011770 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011771 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011775 goto overflow;
11776 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011778 if (ch == '\n' || ch == '\r')
11779 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011781 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011782 if (!found)
11783 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011784
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011786 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787 if (!u)
11788 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011789 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790
Antoine Pitroue71d5742011-10-04 15:55:09 +020011791 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792
Antoine Pitroue71d5742011-10-04 15:55:09 +020011793 for (; i < src_len; i++) {
11794 ch = PyUnicode_READ(kind, src_data, i);
11795 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011796 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011797 incr = tabsize - (line_pos % tabsize);
11798 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011799 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011800 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011801 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011802 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011803 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011804 line_pos++;
11805 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011806 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011807 if (ch == '\n' || ch == '\r')
11808 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011810 }
11811 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011812 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011813
Antoine Pitroue71d5742011-10-04 15:55:09 +020011814 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011815 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11816 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817}
11818
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011819PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821\n\
11822Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011823such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824arguments start and end are interpreted as in slice notation.\n\
11825\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011826Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827
11828static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011831 /* initialize variables to prevent gcc warning */
11832 PyObject *substring = NULL;
11833 Py_ssize_t start = 0;
11834 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011835 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011837 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011840 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011843 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 if (result == -2)
11846 return NULL;
11847
Christian Heimes217cfd12007-12-02 14:31:20 +000011848 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849}
11850
11851static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011852unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011854 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011855 enum PyUnicode_Kind kind;
11856 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011857
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011858 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011859 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011861 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011862 if (PyUnicode_READY(self) == -1) {
11863 return NULL;
11864 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011865 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11866 PyErr_SetString(PyExc_IndexError, "string index out of range");
11867 return NULL;
11868 }
11869 kind = PyUnicode_KIND(self);
11870 data = PyUnicode_DATA(self);
11871 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011872 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873}
11874
Guido van Rossumc2504932007-09-18 19:42:40 +000011875/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011876 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011877static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011878unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011880 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011881
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011882#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011883 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011884#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 if (_PyUnicode_HASH(self) != -1)
11886 return _PyUnicode_HASH(self);
11887 if (PyUnicode_READY(self) == -1)
11888 return -1;
animalizea1d14252019-01-02 20:16:06 +080011889
Christian Heimes985ecdc2013-11-20 11:46:18 +010011890 x = _Py_HashBytes(PyUnicode_DATA(self),
11891 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011893 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894}
11895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011896PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011897 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898\n\
oldkaa0735f2018-02-02 16:52:55 +080011899Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011900such that sub is contained within S[start:end]. Optional\n\
11901arguments start and end are interpreted as in slice notation.\n\
11902\n\
11903Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904
11905static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011908 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011909 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011910 PyObject *substring = NULL;
11911 Py_ssize_t start = 0;
11912 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011914 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011917 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011920 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 if (result == -2)
11923 return NULL;
11924
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 if (result < 0) {
11926 PyErr_SetString(PyExc_ValueError, "substring not found");
11927 return NULL;
11928 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011929
Christian Heimes217cfd12007-12-02 14:31:20 +000011930 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931}
11932
INADA Naoki3ae20562017-01-16 20:41:20 +090011933/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011934str.isascii as unicode_isascii
11935
11936Return True if all characters in the string are ASCII, False otherwise.
11937
11938ASCII characters have code points in the range U+0000-U+007F.
11939Empty string is ASCII too.
11940[clinic start generated code]*/
11941
11942static PyObject *
11943unicode_isascii_impl(PyObject *self)
11944/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11945{
11946 if (PyUnicode_READY(self) == -1) {
11947 return NULL;
11948 }
11949 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11950}
11951
11952/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011953str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954
INADA Naoki3ae20562017-01-16 20:41:20 +090011955Return True if the string is a lowercase string, False otherwise.
11956
11957A string is lowercase if all cased characters in the string are lowercase and
11958there is at least one cased character in the string.
11959[clinic start generated code]*/
11960
11961static PyObject *
11962unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011963/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 Py_ssize_t i, length;
11966 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011967 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968 int cased;
11969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (PyUnicode_READY(self) == -1)
11971 return NULL;
11972 length = PyUnicode_GET_LENGTH(self);
11973 kind = PyUnicode_KIND(self);
11974 data = PyUnicode_DATA(self);
11975
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 if (length == 1)
11978 return PyBool_FromLong(
11979 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011981 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011983 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011984
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 for (i = 0; i < length; i++) {
11987 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011988
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011990 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 else if (!cased && Py_UNICODE_ISLOWER(ch))
11992 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011994 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995}
11996
INADA Naoki3ae20562017-01-16 20:41:20 +090011997/*[clinic input]
11998str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999
INADA Naoki3ae20562017-01-16 20:41:20 +090012000Return True if the string is an uppercase string, False otherwise.
12001
12002A string is uppercase if all cased characters in the string are uppercase and
12003there is at least one cased character in the string.
12004[clinic start generated code]*/
12005
12006static PyObject *
12007unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012008/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 Py_ssize_t i, length;
12011 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012012 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013 int cased;
12014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 if (PyUnicode_READY(self) == -1)
12016 return NULL;
12017 length = PyUnicode_GET_LENGTH(self);
12018 kind = PyUnicode_KIND(self);
12019 data = PyUnicode_DATA(self);
12020
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 if (length == 1)
12023 return PyBool_FromLong(
12024 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012026 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012028 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012029
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 for (i = 0; i < length; i++) {
12032 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012033
Benjamin Peterson29060642009-01-31 22:14:21 +000012034 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012035 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012036 else if (!cased && Py_UNICODE_ISUPPER(ch))
12037 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012039 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040}
12041
INADA Naoki3ae20562017-01-16 20:41:20 +090012042/*[clinic input]
12043str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044
INADA Naoki3ae20562017-01-16 20:41:20 +090012045Return True if the string is a title-cased string, False otherwise.
12046
12047In a title-cased string, upper- and title-case characters may only
12048follow uncased characters and lowercase characters only cased ones.
12049[clinic start generated code]*/
12050
12051static PyObject *
12052unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012053/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 Py_ssize_t i, length;
12056 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012057 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058 int cased, previous_is_cased;
12059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 if (PyUnicode_READY(self) == -1)
12061 return NULL;
12062 length = PyUnicode_GET_LENGTH(self);
12063 kind = PyUnicode_KIND(self);
12064 data = PyUnicode_DATA(self);
12065
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 if (length == 1) {
12068 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12069 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12070 (Py_UNICODE_ISUPPER(ch) != 0));
12071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012073 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012075 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012076
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077 cased = 0;
12078 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 for (i = 0; i < length; i++) {
12080 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012081
Benjamin Peterson29060642009-01-31 22:14:21 +000012082 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12083 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012084 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012085 previous_is_cased = 1;
12086 cased = 1;
12087 }
12088 else if (Py_UNICODE_ISLOWER(ch)) {
12089 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012090 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012091 previous_is_cased = 1;
12092 cased = 1;
12093 }
12094 else
12095 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012097 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098}
12099
INADA Naoki3ae20562017-01-16 20:41:20 +090012100/*[clinic input]
12101str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102
INADA Naoki3ae20562017-01-16 20:41:20 +090012103Return True if the string is a whitespace string, False otherwise.
12104
12105A string is whitespace if all characters in the string are whitespace and there
12106is at least one character in the string.
12107[clinic start generated code]*/
12108
12109static PyObject *
12110unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012111/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 Py_ssize_t i, length;
12114 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012115 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116
12117 if (PyUnicode_READY(self) == -1)
12118 return NULL;
12119 length = PyUnicode_GET_LENGTH(self);
12120 kind = PyUnicode_KIND(self);
12121 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 if (length == 1)
12125 return PyBool_FromLong(
12126 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012128 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012130 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 for (i = 0; i < length; i++) {
12133 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012134 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012135 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012137 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138}
12139
INADA Naoki3ae20562017-01-16 20:41:20 +090012140/*[clinic input]
12141str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012142
INADA Naoki3ae20562017-01-16 20:41:20 +090012143Return True if the string is an alphabetic string, False otherwise.
12144
12145A string is alphabetic if all characters in the string are alphabetic and there
12146is at least one character in the string.
12147[clinic start generated code]*/
12148
12149static PyObject *
12150unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012151/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012152{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 Py_ssize_t i, length;
12154 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012155 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156
12157 if (PyUnicode_READY(self) == -1)
12158 return NULL;
12159 length = PyUnicode_GET_LENGTH(self);
12160 kind = PyUnicode_KIND(self);
12161 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012162
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012163 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 if (length == 1)
12165 return PyBool_FromLong(
12166 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012167
12168 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012169 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012170 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 for (i = 0; i < length; i++) {
12173 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012174 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012175 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012176 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012177}
12178
INADA Naoki3ae20562017-01-16 20:41:20 +090012179/*[clinic input]
12180str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012181
INADA Naoki3ae20562017-01-16 20:41:20 +090012182Return True if the string is an alpha-numeric string, False otherwise.
12183
12184A string is alpha-numeric if all characters in the string are alpha-numeric and
12185there is at least one character in the string.
12186[clinic start generated code]*/
12187
12188static PyObject *
12189unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012190/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012191{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012193 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 Py_ssize_t len, i;
12195
12196 if (PyUnicode_READY(self) == -1)
12197 return NULL;
12198
12199 kind = PyUnicode_KIND(self);
12200 data = PyUnicode_DATA(self);
12201 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012202
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012203 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 if (len == 1) {
12205 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12206 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12207 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012208
12209 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012211 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 for (i = 0; i < len; i++) {
12214 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012215 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012216 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012217 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012218 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012219}
12220
INADA Naoki3ae20562017-01-16 20:41:20 +090012221/*[clinic input]
12222str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223
INADA Naoki3ae20562017-01-16 20:41:20 +090012224Return True if the string is a decimal string, False otherwise.
12225
12226A string is a decimal string if all characters in the string are decimal and
12227there is at least one character in the string.
12228[clinic start generated code]*/
12229
12230static PyObject *
12231unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012232/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 Py_ssize_t i, length;
12235 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012236 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237
12238 if (PyUnicode_READY(self) == -1)
12239 return NULL;
12240 length = PyUnicode_GET_LENGTH(self);
12241 kind = PyUnicode_KIND(self);
12242 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 if (length == 1)
12246 return PyBool_FromLong(
12247 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012249 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012251 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 for (i = 0; i < length; i++) {
12254 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012255 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012257 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258}
12259
INADA Naoki3ae20562017-01-16 20:41:20 +090012260/*[clinic input]
12261str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262
INADA Naoki3ae20562017-01-16 20:41:20 +090012263Return True if the string is a digit string, False otherwise.
12264
12265A string is a digit string if all characters in the string are digits and there
12266is at least one character in the string.
12267[clinic start generated code]*/
12268
12269static PyObject *
12270unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012271/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 Py_ssize_t i, length;
12274 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012275 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276
12277 if (PyUnicode_READY(self) == -1)
12278 return NULL;
12279 length = PyUnicode_GET_LENGTH(self);
12280 kind = PyUnicode_KIND(self);
12281 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 if (length == 1) {
12285 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12286 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12287 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012289 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012291 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 for (i = 0; i < length; i++) {
12294 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012295 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012297 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298}
12299
INADA Naoki3ae20562017-01-16 20:41:20 +090012300/*[clinic input]
12301str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302
INADA Naoki3ae20562017-01-16 20:41:20 +090012303Return True if the string is a numeric string, False otherwise.
12304
12305A string is numeric if all characters in the string are numeric and there is at
12306least one character in the string.
12307[clinic start generated code]*/
12308
12309static PyObject *
12310unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012311/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012312{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 Py_ssize_t i, length;
12314 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012315 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316
12317 if (PyUnicode_READY(self) == -1)
12318 return NULL;
12319 length = PyUnicode_GET_LENGTH(self);
12320 kind = PyUnicode_KIND(self);
12321 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 if (length == 1)
12325 return PyBool_FromLong(
12326 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012328 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012330 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 for (i = 0; i < length; i++) {
12333 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012334 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012336 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012337}
12338
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012339Py_ssize_t
12340_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012341{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012343 if (PyUnicode_READY(self) == -1)
12344 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012345
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012346 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012347 if (len == 0) {
12348 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012349 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 }
12351
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012352 int kind = PyUnicode_KIND(self);
12353 const void *data = PyUnicode_DATA(self);
12354 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012355 /* PEP 3131 says that the first character must be in
12356 XID_Start and subsequent characters in XID_Continue,
12357 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012358 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012359 letters, digits, underscore). However, given the current
12360 definition of XID_Start and XID_Continue, it is sufficient
12361 to check just for these, except that _ must be allowed
12362 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012363 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012364 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012365 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012366
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012367 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012368 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012369 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012370 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012371 }
12372 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012373 return i;
12374}
12375
12376int
12377PyUnicode_IsIdentifier(PyObject *self)
12378{
12379 if (PyUnicode_IS_READY(self)) {
12380 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12381 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12382 /* an empty string is not a valid identifier */
12383 return len && i == len;
12384 }
12385 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012386_Py_COMP_DIAG_PUSH
12387_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012388 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012389 if (len == 0) {
12390 /* an empty string is not a valid identifier */
12391 return 0;
12392 }
12393
12394 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012395 Py_UCS4 ch = wstr[i++];
12396#if SIZEOF_WCHAR_T == 2
12397 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12398 && i < len
12399 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12400 {
12401 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12402 i++;
12403 }
12404#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012405 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12406 return 0;
12407 }
12408
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012409 while (i < len) {
12410 ch = wstr[i++];
12411#if SIZEOF_WCHAR_T == 2
12412 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12413 && i < len
12414 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12415 {
12416 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12417 i++;
12418 }
12419#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012420 if (!_PyUnicode_IsXidContinue(ch)) {
12421 return 0;
12422 }
12423 }
12424 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012425_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012426 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012427}
12428
INADA Naoki3ae20562017-01-16 20:41:20 +090012429/*[clinic input]
12430str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012431
INADA Naoki3ae20562017-01-16 20:41:20 +090012432Return True if the string is a valid Python identifier, False otherwise.
12433
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012434Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012435such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012436[clinic start generated code]*/
12437
12438static PyObject *
12439unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012440/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012441{
12442 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12443}
12444
INADA Naoki3ae20562017-01-16 20:41:20 +090012445/*[clinic input]
12446str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012447
INADA Naoki3ae20562017-01-16 20:41:20 +090012448Return True if the string is printable, False otherwise.
12449
12450A string is printable if all of its characters are considered printable in
12451repr() or if it is empty.
12452[clinic start generated code]*/
12453
12454static PyObject *
12455unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012456/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 Py_ssize_t i, length;
12459 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012460 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012461
12462 if (PyUnicode_READY(self) == -1)
12463 return NULL;
12464 length = PyUnicode_GET_LENGTH(self);
12465 kind = PyUnicode_KIND(self);
12466 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012467
12468 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 if (length == 1)
12470 return PyBool_FromLong(
12471 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 for (i = 0; i < length; i++) {
12474 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012475 Py_RETURN_FALSE;
12476 }
12477 }
12478 Py_RETURN_TRUE;
12479}
12480
INADA Naoki3ae20562017-01-16 20:41:20 +090012481/*[clinic input]
12482str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483
INADA Naoki3ae20562017-01-16 20:41:20 +090012484 iterable: object
12485 /
12486
12487Concatenate any number of strings.
12488
Martin Panter91a88662017-01-24 00:30:06 +000012489The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012490The result is returned as a new string.
12491
12492Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12493[clinic start generated code]*/
12494
12495static PyObject *
12496unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012497/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498{
INADA Naoki3ae20562017-01-16 20:41:20 +090012499 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500}
12501
Martin v. Löwis18e16552006-02-15 17:27:45 +000012502static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012503unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 if (PyUnicode_READY(self) == -1)
12506 return -1;
12507 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508}
12509
INADA Naoki3ae20562017-01-16 20:41:20 +090012510/*[clinic input]
12511str.ljust as unicode_ljust
12512
12513 width: Py_ssize_t
12514 fillchar: Py_UCS4 = ' '
12515 /
12516
12517Return a left-justified string of length width.
12518
12519Padding is done using the specified fill character (default is a space).
12520[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
12522static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012523unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12524/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012526 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012527 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528
Victor Stinnerc4b49542011-12-11 22:44:26 +010012529 if (PyUnicode_GET_LENGTH(self) >= width)
12530 return unicode_result_unchanged(self);
12531
12532 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533}
12534
INADA Naoki3ae20562017-01-16 20:41:20 +090012535/*[clinic input]
12536str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537
INADA Naoki3ae20562017-01-16 20:41:20 +090012538Return a copy of the string converted to lowercase.
12539[clinic start generated code]*/
12540
12541static PyObject *
12542unicode_lower_impl(PyObject *self)
12543/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012545 if (PyUnicode_READY(self) == -1)
12546 return NULL;
12547 if (PyUnicode_IS_ASCII(self))
12548 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012549 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550}
12551
/* Strip modes accepted by the strip helpers.  The values double as indices
   into stripfuncnames below, so the two must be kept in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by strip mode; used when building error messages.
   Both the strings and the table itself are read-only. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip mode `i`
   (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP).  The argument is parenthesized so
   that any expression can be passed safely. */
#define STRIPNAME(i) (stripfuncnames[(i)])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012560
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012561/* externally visible for str.strip(unicode) */
12562PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012563_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012564{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012565 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566 int kind;
12567 Py_ssize_t i, j, len;
12568 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012569 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12572 return NULL;
12573
12574 kind = PyUnicode_KIND(self);
12575 data = PyUnicode_DATA(self);
12576 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012577 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12579 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012580 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012581
Benjamin Peterson14339b62009-01-31 16:36:08 +000012582 i = 0;
12583 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012584 while (i < len) {
12585 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12586 if (!BLOOM(sepmask, ch))
12587 break;
12588 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12589 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012590 i++;
12591 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012592 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012593
Benjamin Peterson14339b62009-01-31 16:36:08 +000012594 j = len;
12595 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012596 j--;
12597 while (j >= i) {
12598 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12599 if (!BLOOM(sepmask, ch))
12600 break;
12601 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12602 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012604 }
12605
Benjamin Peterson29060642009-01-31 22:14:21 +000012606 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012607 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012608
Victor Stinner7931d9a2011-11-04 00:22:48 +010012609 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610}
12611
12612PyObject*
12613PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12614{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012615 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012617 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618
Victor Stinnerde636f32011-10-01 03:55:54 +020012619 if (PyUnicode_READY(self) == -1)
12620 return NULL;
12621
Victor Stinner684d5fd2012-05-03 02:32:34 +020012622 length = PyUnicode_GET_LENGTH(self);
12623 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012624
Victor Stinner684d5fd2012-05-03 02:32:34 +020012625 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012626 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627
Victor Stinnerde636f32011-10-01 03:55:54 +020012628 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012629 PyErr_SetString(PyExc_IndexError, "string index out of range");
12630 return NULL;
12631 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012632 if (start >= length || end < start)
12633 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012634
Victor Stinner684d5fd2012-05-03 02:32:34 +020012635 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012636 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012637 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012638 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012639 }
12640 else {
12641 kind = PyUnicode_KIND(self);
12642 data = PyUnicode_1BYTE_DATA(self);
12643 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012644 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012645 length);
12646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648
12649static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012650do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 Py_ssize_t len, i, j;
12653
12654 if (PyUnicode_READY(self) == -1)
12655 return NULL;
12656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012658
Victor Stinnercc7af722013-04-09 22:39:24 +020012659 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012660 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012661
12662 i = 0;
12663 if (striptype != RIGHTSTRIP) {
12664 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012665 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012666 if (!_Py_ascii_whitespace[ch])
12667 break;
12668 i++;
12669 }
12670 }
12671
12672 j = len;
12673 if (striptype != LEFTSTRIP) {
12674 j--;
12675 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012676 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012677 if (!_Py_ascii_whitespace[ch])
12678 break;
12679 j--;
12680 }
12681 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012682 }
12683 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012684 else {
12685 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012686 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012687
Victor Stinnercc7af722013-04-09 22:39:24 +020012688 i = 0;
12689 if (striptype != RIGHTSTRIP) {
12690 while (i < len) {
12691 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12692 if (!Py_UNICODE_ISSPACE(ch))
12693 break;
12694 i++;
12695 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012696 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012697
12698 j = len;
12699 if (striptype != LEFTSTRIP) {
12700 j--;
12701 while (j >= i) {
12702 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12703 if (!Py_UNICODE_ISSPACE(ch))
12704 break;
12705 j--;
12706 }
12707 j++;
12708 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012709 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012710
Victor Stinner7931d9a2011-11-04 00:22:48 +010012711 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012712}
12713
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012714
12715static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012716do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012717{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012718 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012719 if (PyUnicode_Check(sep))
12720 return _PyUnicode_XStrip(self, striptype, sep);
12721 else {
12722 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012723 "%s arg must be None or str",
12724 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012725 return NULL;
12726 }
12727 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012728
Benjamin Peterson14339b62009-01-31 16:36:08 +000012729 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012730}
12731
12732
INADA Naoki3ae20562017-01-16 20:41:20 +090012733/*[clinic input]
12734str.strip as unicode_strip
12735
12736 chars: object = None
12737 /
12738
Zachary Ware09895c22019-10-09 16:09:00 -050012739Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012740
12741If chars is given and not None, remove characters in chars instead.
12742[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012743
12744static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012745unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012746/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012747{
INADA Naoki3ae20562017-01-16 20:41:20 +090012748 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012749}
12750
12751
INADA Naoki3ae20562017-01-16 20:41:20 +090012752/*[clinic input]
12753str.lstrip as unicode_lstrip
12754
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012755 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012756 /
12757
12758Return a copy of the string with leading whitespace removed.
12759
12760If chars is given and not None, remove characters in chars instead.
12761[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012762
12763static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012764unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012765/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012766{
INADA Naoki3ae20562017-01-16 20:41:20 +090012767 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012768}
12769
12770
INADA Naoki3ae20562017-01-16 20:41:20 +090012771/*[clinic input]
12772str.rstrip as unicode_rstrip
12773
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012774 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012775 /
12776
12777Return a copy of the string with trailing whitespace removed.
12778
12779If chars is given and not None, remove characters in chars instead.
12780[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012781
12782static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012783unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012784/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012785{
INADA Naoki3ae20562017-01-16 20:41:20 +090012786 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012787}
12788
12789
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012791unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012793 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012794 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012795
Serhiy Storchaka05997252013-01-26 12:14:02 +020012796 if (len < 1)
12797 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798
Victor Stinnerc4b49542011-12-11 22:44:26 +010012799 /* no repeat, return original string */
12800 if (len == 1)
12801 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012802
Benjamin Petersonbac79492012-01-14 13:34:47 -050012803 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 return NULL;
12805
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012806 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012807 PyErr_SetString(PyExc_OverflowError,
12808 "repeated string is too long");
12809 return NULL;
12810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012812
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012813 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814 if (!u)
12815 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012816 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012819 int kind = PyUnicode_KIND(str);
12820 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012821 if (kind == PyUnicode_1BYTE_KIND) {
12822 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012823 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012824 }
12825 else if (kind == PyUnicode_2BYTE_KIND) {
12826 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012827 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012828 ucs2[n] = fill_char;
12829 } else {
12830 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12831 assert(kind == PyUnicode_4BYTE_KIND);
12832 for (n = 0; n < len; ++n)
12833 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835 }
12836 else {
12837 /* number of characters copied this far */
12838 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012839 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012840 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012841 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012842 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012843 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012845 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012846 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012847 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848 }
12849
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012850 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012851 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852}
12853
Alexander Belopolsky40018472011-02-26 01:02:56 +000012854PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012855PyUnicode_Replace(PyObject *str,
12856 PyObject *substr,
12857 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012858 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012860 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12861 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012863 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864}
12865
INADA Naoki3ae20562017-01-16 20:41:20 +090012866/*[clinic input]
12867str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868
INADA Naoki3ae20562017-01-16 20:41:20 +090012869 old: unicode
12870 new: unicode
12871 count: Py_ssize_t = -1
12872 Maximum number of occurrences to replace.
12873 -1 (the default value) means replace all occurrences.
12874 /
12875
12876Return a copy with all occurrences of substring old replaced by new.
12877
12878If the optional argument count is given, only the first count occurrences are
12879replaced.
12880[clinic start generated code]*/
12881
12882static PyObject *
12883unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12884 Py_ssize_t count)
12885/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012887 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012888 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012889 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012890}
12891
sweeneydea81849b2020-04-22 17:05:48 -040012892/*[clinic input]
12893str.removeprefix as unicode_removeprefix
12894
12895 prefix: unicode
12896 /
12897
12898Return a str with the given prefix string removed if present.
12899
12900If the string starts with the prefix string, return string[len(prefix):].
12901Otherwise, return a copy of the original string.
12902[clinic start generated code]*/
12903
12904static PyObject *
12905unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12906/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12907{
12908 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12909 if (match == -1) {
12910 return NULL;
12911 }
12912 if (match) {
12913 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12914 PyUnicode_GET_LENGTH(self));
12915 }
12916 return unicode_result_unchanged(self);
12917}
12918
12919/*[clinic input]
12920str.removesuffix as unicode_removesuffix
12921
12922 suffix: unicode
12923 /
12924
12925Return a str with the given suffix string removed if present.
12926
12927If the string ends with the suffix string and that suffix is not empty,
12928return string[:-len(suffix)]. Otherwise, return a copy of the original
12929string.
12930[clinic start generated code]*/
12931
12932static PyObject *
12933unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12934/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12935{
12936 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12937 if (match == -1) {
12938 return NULL;
12939 }
12940 if (match) {
12941 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12942 - PyUnicode_GET_LENGTH(suffix));
12943 }
12944 return unicode_result_unchanged(self);
12945}
12946
Alexander Belopolsky40018472011-02-26 01:02:56 +000012947static PyObject *
12948unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012950 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 Py_ssize_t isize;
12952 Py_ssize_t osize, squote, dquote, i, o;
12953 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012954 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012955 const void *idata;
12956 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012958 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012959 return NULL;
12960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012961 isize = PyUnicode_GET_LENGTH(unicode);
12962 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 /* Compute length of output, quote characters, and
12965 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012966 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 max = 127;
12968 squote = dquote = 0;
12969 ikind = PyUnicode_KIND(unicode);
12970 for (i = 0; i < isize; i++) {
12971 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012972 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012974 case '\'': squote++; break;
12975 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012977 incr = 2;
12978 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 default:
12980 /* Fast-path ASCII */
12981 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012982 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012984 ;
12985 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012987 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012988 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012989 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012990 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012992 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012994 if (osize > PY_SSIZE_T_MAX - incr) {
12995 PyErr_SetString(PyExc_OverflowError,
12996 "string is too long to generate repr");
12997 return NULL;
12998 }
12999 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 }
13001
13002 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013003 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013005 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013006 if (dquote)
13007 /* Both squote and dquote present. Use squote,
13008 and escape them */
13009 osize += squote;
13010 else
13011 quote = '"';
13012 }
Victor Stinner55c08782013-04-14 18:45:39 +020013013 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014
13015 repr = PyUnicode_New(osize, max);
13016 if (repr == NULL)
13017 return NULL;
13018 okind = PyUnicode_KIND(repr);
13019 odata = PyUnicode_DATA(repr);
13020
13021 PyUnicode_WRITE(okind, odata, 0, quote);
13022 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013023 if (unchanged) {
13024 _PyUnicode_FastCopyCharacters(repr, 1,
13025 unicode, 0,
13026 isize);
13027 }
13028 else {
13029 for (i = 0, o = 1; i < isize; i++) {
13030 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013031
Victor Stinner55c08782013-04-14 18:45:39 +020013032 /* Escape quotes and backslashes */
13033 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013034 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013036 continue;
13037 }
13038
13039 /* Map special whitespace to '\t', \n', '\r' */
13040 if (ch == '\t') {
13041 PyUnicode_WRITE(okind, odata, o++, '\\');
13042 PyUnicode_WRITE(okind, odata, o++, 't');
13043 }
13044 else if (ch == '\n') {
13045 PyUnicode_WRITE(okind, odata, o++, '\\');
13046 PyUnicode_WRITE(okind, odata, o++, 'n');
13047 }
13048 else if (ch == '\r') {
13049 PyUnicode_WRITE(okind, odata, o++, '\\');
13050 PyUnicode_WRITE(okind, odata, o++, 'r');
13051 }
13052
13053 /* Map non-printable US ASCII to '\xhh' */
13054 else if (ch < ' ' || ch == 0x7F) {
13055 PyUnicode_WRITE(okind, odata, o++, '\\');
13056 PyUnicode_WRITE(okind, odata, o++, 'x');
13057 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13058 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13059 }
13060
13061 /* Copy ASCII characters as-is */
13062 else if (ch < 0x7F) {
13063 PyUnicode_WRITE(okind, odata, o++, ch);
13064 }
13065
13066 /* Non-ASCII characters */
13067 else {
13068 /* Map Unicode whitespace and control characters
13069 (categories Z* and C* except ASCII space)
13070 */
13071 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13072 PyUnicode_WRITE(okind, odata, o++, '\\');
13073 /* Map 8-bit characters to '\xhh' */
13074 if (ch <= 0xff) {
13075 PyUnicode_WRITE(okind, odata, o++, 'x');
13076 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13077 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13078 }
13079 /* Map 16-bit characters to '\uxxxx' */
13080 else if (ch <= 0xffff) {
13081 PyUnicode_WRITE(okind, odata, o++, 'u');
13082 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13083 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13084 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13085 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13086 }
13087 /* Map 21-bit characters to '\U00xxxxxx' */
13088 else {
13089 PyUnicode_WRITE(okind, odata, o++, 'U');
13090 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13091 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13092 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13093 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13094 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13095 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13096 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13097 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13098 }
13099 }
13100 /* Copy characters as-is */
13101 else {
13102 PyUnicode_WRITE(okind, odata, o++, ch);
13103 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013104 }
13105 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013107 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013108 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013109 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110}
13111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013112PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013113 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114\n\
13115Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013116such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117arguments start and end are interpreted as in slice notation.\n\
13118\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013119Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120
13121static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013124 /* initialize variables to prevent gcc warning */
13125 PyObject *substring = NULL;
13126 Py_ssize_t start = 0;
13127 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013128 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013130 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013133 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013135
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013136 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 if (result == -2)
13139 return NULL;
13140
Christian Heimes217cfd12007-12-02 14:31:20 +000013141 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142}
13143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013144PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013145 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013147Return the highest index in S where substring sub is found,\n\
13148such that sub is contained within S[start:end]. Optional\n\
13149arguments start and end are interpreted as in slice notation.\n\
13150\n\
13151Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152
13153static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013154unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013156 /* initialize variables to prevent gcc warning */
13157 PyObject *substring = NULL;
13158 Py_ssize_t start = 0;
13159 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013160 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013162 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013163 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013164
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013165 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013166 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013167
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013168 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013170 if (result == -2)
13171 return NULL;
13172
Guido van Rossumd57fd912000-03-10 22:53:23 +000013173 if (result < 0) {
13174 PyErr_SetString(PyExc_ValueError, "substring not found");
13175 return NULL;
13176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013177
Christian Heimes217cfd12007-12-02 14:31:20 +000013178 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013179}
13180
INADA Naoki3ae20562017-01-16 20:41:20 +090013181/*[clinic input]
13182str.rjust as unicode_rjust
13183
13184 width: Py_ssize_t
13185 fillchar: Py_UCS4 = ' '
13186 /
13187
13188Return a right-justified string of length width.
13189
13190Padding is done using the specified fill character (default is a space).
13191[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013192
13193static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013194unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13195/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013196{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013197 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198 return NULL;
13199
Victor Stinnerc4b49542011-12-11 22:44:26 +010013200 if (PyUnicode_GET_LENGTH(self) >= width)
13201 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202
Victor Stinnerc4b49542011-12-11 22:44:26 +010013203 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204}
13205
Alexander Belopolsky40018472011-02-26 01:02:56 +000013206PyObject *
13207PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013208{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013209 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013210 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013212 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013213}
13214
INADA Naoki3ae20562017-01-16 20:41:20 +090013215/*[clinic input]
13216str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217
INADA Naoki3ae20562017-01-16 20:41:20 +090013218 sep: object = None
13219 The delimiter according which to split the string.
13220 None (the default value) means split according to any whitespace,
13221 and discard empty strings from the result.
13222 maxsplit: Py_ssize_t = -1
13223 Maximum number of splits to do.
13224 -1 (the default value) means no limit.
13225
13226Return a list of the words in the string, using sep as the delimiter string.
13227[clinic start generated code]*/
13228
13229static PyObject *
13230unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13231/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013232{
INADA Naoki3ae20562017-01-16 20:41:20 +090013233 if (sep == Py_None)
13234 return split(self, NULL, maxsplit);
13235 if (PyUnicode_Check(sep))
13236 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013237
Victor Stinner998b8062018-09-12 00:23:25 +020013238 PyErr_Format(PyExc_TypeError,
13239 "must be str or None, not %.100s",
13240 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013242}
13243
Thomas Wouters477c8d52006-05-27 19:21:47 +000013244PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013245PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013246{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013247 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013248 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013249 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013250 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013251
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013252 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013253 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013254
Victor Stinner14f8f022011-10-05 20:58:25 +020013255 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013256 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257 len1 = PyUnicode_GET_LENGTH(str_obj);
13258 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013259 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013260 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013261 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013262 }
13263 buf1 = PyUnicode_DATA(str_obj);
13264 buf2 = PyUnicode_DATA(sep_obj);
13265 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013266 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013267 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013268 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013270
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013271 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013272 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013273 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13274 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13275 else
13276 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277 break;
13278 case PyUnicode_2BYTE_KIND:
13279 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13280 break;
13281 case PyUnicode_4BYTE_KIND:
13282 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13283 break;
13284 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013285 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013287
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013288 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013289 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013290 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013291
13292 return out;
13293}
13294
13295
13296PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013297PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013298{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013299 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013300 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013301 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013303
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013304 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013305 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013306
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013307 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013308 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013309 len1 = PyUnicode_GET_LENGTH(str_obj);
13310 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013311 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013312 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013313 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013314 }
13315 buf1 = PyUnicode_DATA(str_obj);
13316 buf2 = PyUnicode_DATA(sep_obj);
13317 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013318 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013319 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013320 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013323 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013324 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013325 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13326 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13327 else
13328 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013329 break;
13330 case PyUnicode_2BYTE_KIND:
13331 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13332 break;
13333 case PyUnicode_4BYTE_KIND:
13334 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13335 break;
13336 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013337 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013338 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013339
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013340 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013341 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013342 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013343
13344 return out;
13345}
13346
INADA Naoki3ae20562017-01-16 20:41:20 +090013347/*[clinic input]
13348str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013349
INADA Naoki3ae20562017-01-16 20:41:20 +090013350 sep: object
13351 /
13352
13353Partition the string into three parts using the given separator.
13354
13355This will search for the separator in the string. If the separator is found,
13356returns a 3-tuple containing the part before the separator, the separator
13357itself, and the part after it.
13358
13359If the separator is not found, returns a 3-tuple containing the original string
13360and two empty strings.
13361[clinic start generated code]*/
13362
13363static PyObject *
13364unicode_partition(PyObject *self, PyObject *sep)
13365/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013366{
INADA Naoki3ae20562017-01-16 20:41:20 +090013367 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013368}
13369
INADA Naoki3ae20562017-01-16 20:41:20 +090013370/*[clinic input]
13371str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013372
INADA Naoki3ae20562017-01-16 20:41:20 +090013373Partition the string into three parts using the given separator.
13374
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013375This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013376the separator is found, returns a 3-tuple containing the part before the
13377separator, the separator itself, and the part after it.
13378
13379If the separator is not found, returns a 3-tuple containing two empty strings
13380and the original string.
13381[clinic start generated code]*/
13382
13383static PyObject *
13384unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013385/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013386{
INADA Naoki3ae20562017-01-16 20:41:20 +090013387 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013388}
13389
Alexander Belopolsky40018472011-02-26 01:02:56 +000013390PyObject *
13391PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013392{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013393 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013394 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013395
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013396 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013397}
13398
INADA Naoki3ae20562017-01-16 20:41:20 +090013399/*[clinic input]
13400str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013401
INADA Naoki3ae20562017-01-16 20:41:20 +090013402Return a list of the words in the string, using sep as the delimiter string.
13403
13404Splits are done starting at the end of the string and working to the front.
13405[clinic start generated code]*/
13406
13407static PyObject *
13408unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13409/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013410{
INADA Naoki3ae20562017-01-16 20:41:20 +090013411 if (sep == Py_None)
13412 return rsplit(self, NULL, maxsplit);
13413 if (PyUnicode_Check(sep))
13414 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013415
Victor Stinner998b8062018-09-12 00:23:25 +020013416 PyErr_Format(PyExc_TypeError,
13417 "must be str or None, not %.100s",
13418 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013419 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013420}
13421
INADA Naoki3ae20562017-01-16 20:41:20 +090013422/*[clinic input]
13423str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013424
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013425 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013426
13427Return a list of the lines in the string, breaking at line boundaries.
13428
13429Line breaks are not included in the resulting list unless keepends is given and
13430true.
13431[clinic start generated code]*/
13432
13433static PyObject *
13434unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013435/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013437 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438}
13439
13440static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013441PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013442{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013443 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444}
13445
INADA Naoki3ae20562017-01-16 20:41:20 +090013446/*[clinic input]
13447str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013448
INADA Naoki3ae20562017-01-16 20:41:20 +090013449Convert uppercase characters to lowercase and lowercase characters to uppercase.
13450[clinic start generated code]*/
13451
13452static PyObject *
13453unicode_swapcase_impl(PyObject *self)
13454/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013455{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013456 if (PyUnicode_READY(self) == -1)
13457 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013458 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013459}
13460
Larry Hastings61272b72014-01-07 12:41:53 -080013461/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013462
Larry Hastings31826802013-10-19 00:09:25 -070013463@staticmethod
13464str.maketrans as unicode_maketrans
13465
13466 x: object
13467
13468 y: unicode=NULL
13469
13470 z: unicode=NULL
13471
13472 /
13473
13474Return a translation table usable for str.translate().
13475
13476If there is only one argument, it must be a dictionary mapping Unicode
13477ordinals (integers) or characters to Unicode ordinals, strings or None.
13478Character keys will be then converted to ordinals.
13479If there are two arguments, they must be strings of equal length, and
13480in the resulting dictionary, each character in x will be mapped to the
13481character at the same position in y. If there is a third argument, it
13482must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013483[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013484
Larry Hastings31826802013-10-19 00:09:25 -070013485static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013486unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013487/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013488{
Georg Brandlceee0772007-11-27 23:48:05 +000013489 PyObject *new = NULL, *key, *value;
13490 Py_ssize_t i = 0;
13491 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013492
Georg Brandlceee0772007-11-27 23:48:05 +000013493 new = PyDict_New();
13494 if (!new)
13495 return NULL;
13496 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013497 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013498 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013499
Georg Brandlceee0772007-11-27 23:48:05 +000013500 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013501 if (!PyUnicode_Check(x)) {
13502 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13503 "be a string if there is a second argument");
13504 goto err;
13505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013506 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013507 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13508 "arguments must have equal length");
13509 goto err;
13510 }
13511 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013512 x_kind = PyUnicode_KIND(x);
13513 y_kind = PyUnicode_KIND(y);
13514 x_data = PyUnicode_DATA(x);
13515 y_data = PyUnicode_DATA(y);
13516 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13517 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013518 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013519 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013520 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013521 if (!value) {
13522 Py_DECREF(key);
13523 goto err;
13524 }
Georg Brandlceee0772007-11-27 23:48:05 +000013525 res = PyDict_SetItem(new, key, value);
13526 Py_DECREF(key);
13527 Py_DECREF(value);
13528 if (res < 0)
13529 goto err;
13530 }
13531 /* create entries for deleting chars in z */
13532 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013533 z_kind = PyUnicode_KIND(z);
13534 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013535 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013536 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013537 if (!key)
13538 goto err;
13539 res = PyDict_SetItem(new, key, Py_None);
13540 Py_DECREF(key);
13541 if (res < 0)
13542 goto err;
13543 }
13544 }
13545 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013546 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013547 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013548
Georg Brandlceee0772007-11-27 23:48:05 +000013549 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013550 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013551 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13552 "to maketrans it must be a dict");
13553 goto err;
13554 }
13555 /* copy entries into the new dict, converting string keys to int keys */
13556 while (PyDict_Next(x, &i, &key, &value)) {
13557 if (PyUnicode_Check(key)) {
13558 /* convert string keys to integer keys */
13559 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013560 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013561 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13562 "table must be of length 1");
13563 goto err;
13564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013565 kind = PyUnicode_KIND(key);
13566 data = PyUnicode_DATA(key);
13567 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013568 if (!newkey)
13569 goto err;
13570 res = PyDict_SetItem(new, newkey, value);
13571 Py_DECREF(newkey);
13572 if (res < 0)
13573 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013574 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013575 /* just keep integer keys */
13576 if (PyDict_SetItem(new, key, value) < 0)
13577 goto err;
13578 } else {
13579 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13580 "be strings or integers");
13581 goto err;
13582 }
13583 }
13584 }
13585 return new;
13586 err:
13587 Py_DECREF(new);
13588 return NULL;
13589}
13590
INADA Naoki3ae20562017-01-16 20:41:20 +090013591/*[clinic input]
13592str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013593
INADA Naoki3ae20562017-01-16 20:41:20 +090013594 table: object
13595 Translation table, which must be a mapping of Unicode ordinals to
13596 Unicode ordinals, strings, or None.
13597 /
13598
13599Replace each character in the string using the given translation table.
13600
13601The table must implement lookup/indexing via __getitem__, for instance a
13602dictionary or list. If this operation raises LookupError, the character is
13603left untouched. Characters mapped to None are deleted.
13604[clinic start generated code]*/
13605
13606static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013607unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013608/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013609{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013610 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013611}
13612
INADA Naoki3ae20562017-01-16 20:41:20 +090013613/*[clinic input]
13614str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013615
INADA Naoki3ae20562017-01-16 20:41:20 +090013616Return a copy of the string converted to uppercase.
13617[clinic start generated code]*/
13618
13619static PyObject *
13620unicode_upper_impl(PyObject *self)
13621/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013622{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013623 if (PyUnicode_READY(self) == -1)
13624 return NULL;
13625 if (PyUnicode_IS_ASCII(self))
13626 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013627 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013628}
13629
INADA Naoki3ae20562017-01-16 20:41:20 +090013630/*[clinic input]
13631str.zfill as unicode_zfill
13632
13633 width: Py_ssize_t
13634 /
13635
13636Pad a numeric string with zeros on the left, to fill a field of the given width.
13637
13638The string is never truncated.
13639[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013640
13641static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013642unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013643/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013644{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013645 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013646 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013647 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013648 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013649 Py_UCS4 chr;
13650
Benjamin Petersonbac79492012-01-14 13:34:47 -050013651 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013653
Victor Stinnerc4b49542011-12-11 22:44:26 +010013654 if (PyUnicode_GET_LENGTH(self) >= width)
13655 return unicode_result_unchanged(self);
13656
13657 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013658
13659 u = pad(self, fill, 0, '0');
13660
Walter Dörwald068325e2002-04-15 13:36:47 +000013661 if (u == NULL)
13662 return NULL;
13663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013664 kind = PyUnicode_KIND(u);
13665 data = PyUnicode_DATA(u);
13666 chr = PyUnicode_READ(kind, data, fill);
13667
13668 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013669 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013670 PyUnicode_WRITE(kind, data, 0, chr);
13671 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013672 }
13673
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013674 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013675 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013676}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013677
#if 0
/* Compiled out by the surrounding "#if 0".  Would forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013686PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013687 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013688\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013689Return True if S starts with the specified prefix, False otherwise.\n\
13690With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013691With optional end, stop comparing S at that position.\n\
13692prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013693
13694static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013695unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013696 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013697{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013698 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013699 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013700 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013701 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013702 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013703
Jesus Ceaac451502011-04-20 17:09:23 +020013704 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013706 if (PyTuple_Check(subobj)) {
13707 Py_ssize_t i;
13708 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013709 substring = PyTuple_GET_ITEM(subobj, i);
13710 if (!PyUnicode_Check(substring)) {
13711 PyErr_Format(PyExc_TypeError,
13712 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013713 "not %.100s",
13714 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013715 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013716 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013717 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013718 if (result == -1)
13719 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013720 if (result) {
13721 Py_RETURN_TRUE;
13722 }
13723 }
13724 /* nothing matched */
13725 Py_RETURN_FALSE;
13726 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013727 if (!PyUnicode_Check(subobj)) {
13728 PyErr_Format(PyExc_TypeError,
13729 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013730 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013731 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013732 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013733 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013734 if (result == -1)
13735 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013736 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013737}
13738
13739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013740PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013741 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013742\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013743Return True if S ends with the specified suffix, False otherwise.\n\
13744With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013745With optional end, stop comparing S at that position.\n\
13746suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013747
13748static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013749unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013750 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013751{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013752 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013753 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013754 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013755 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013756 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013757
Jesus Ceaac451502011-04-20 17:09:23 +020013758 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013759 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013760 if (PyTuple_Check(subobj)) {
13761 Py_ssize_t i;
13762 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013763 substring = PyTuple_GET_ITEM(subobj, i);
13764 if (!PyUnicode_Check(substring)) {
13765 PyErr_Format(PyExc_TypeError,
13766 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013767 "not %.100s",
13768 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013769 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013770 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013771 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013772 if (result == -1)
13773 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013774 if (result) {
13775 Py_RETURN_TRUE;
13776 }
13777 }
13778 Py_RETURN_FALSE;
13779 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013780 if (!PyUnicode_Check(subobj)) {
13781 PyErr_Format(PyExc_TypeError,
13782 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013783 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013784 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013785 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013786 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013787 if (result == -1)
13788 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013789 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013790}
13791
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013792static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013793_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013794{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013795 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13796 writer->data = PyUnicode_DATA(writer->buffer);
13797
13798 if (!writer->readonly) {
13799 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013800 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013801 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013802 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013803 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13804 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13805 writer->kind = PyUnicode_WCHAR_KIND;
13806 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13807
Victor Stinner8f674cc2013-04-17 23:02:17 +020013808 /* Copy-on-write mode: set buffer size to 0 so
13809 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13810 * next write. */
13811 writer->size = 0;
13812 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013813}
13814
Victor Stinnerd3f08822012-05-29 12:57:52 +020013815void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013816_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013817{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013818 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013819
13820 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013821 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013822
13823 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13824 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13825 writer->kind = PyUnicode_WCHAR_KIND;
13826 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013827}
13828
Inada Naoki770847a2019-06-24 12:30:24 +090013829// Initialize _PyUnicodeWriter with initial buffer
13830static inline void
13831_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13832{
13833 memset(writer, 0, sizeof(*writer));
13834 writer->buffer = buffer;
13835 _PyUnicodeWriter_Update(writer);
13836 writer->min_length = writer->size;
13837}
13838
Victor Stinnerd3f08822012-05-29 12:57:52 +020013839int
13840_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13841 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013842{
13843 Py_ssize_t newlen;
13844 PyObject *newbuffer;
13845
Victor Stinner2740e462016-09-06 16:58:36 -070013846 assert(maxchar <= MAX_UNICODE);
13847
Victor Stinnerca9381e2015-09-22 00:58:32 +020013848 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013849 assert((maxchar > writer->maxchar && length >= 0)
13850 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013851
Victor Stinner202fdca2012-05-07 12:47:02 +020013852 if (length > PY_SSIZE_T_MAX - writer->pos) {
13853 PyErr_NoMemory();
13854 return -1;
13855 }
13856 newlen = writer->pos + length;
13857
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013858 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013859
Victor Stinnerd3f08822012-05-29 12:57:52 +020013860 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013861 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013862 if (writer->overallocate
13863 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13864 /* overallocate to limit the number of realloc() */
13865 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013866 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013867 if (newlen < writer->min_length)
13868 newlen = writer->min_length;
13869
Victor Stinnerd3f08822012-05-29 12:57:52 +020013870 writer->buffer = PyUnicode_New(newlen, maxchar);
13871 if (writer->buffer == NULL)
13872 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013873 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013874 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013875 if (writer->overallocate
13876 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13877 /* overallocate to limit the number of realloc() */
13878 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013879 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013880 if (newlen < writer->min_length)
13881 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013882
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013883 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013884 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013885 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013886 newbuffer = PyUnicode_New(newlen, maxchar);
13887 if (newbuffer == NULL)
13888 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013889 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13890 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013891 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013892 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013893 }
13894 else {
13895 newbuffer = resize_compact(writer->buffer, newlen);
13896 if (newbuffer == NULL)
13897 return -1;
13898 }
13899 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013900 }
13901 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013902 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013903 newbuffer = PyUnicode_New(writer->size, maxchar);
13904 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013905 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013906 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13907 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013908 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013909 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013910 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013911 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013912
13913#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013914}
13915
Victor Stinnerca9381e2015-09-22 00:58:32 +020013916int
13917_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13918 enum PyUnicode_Kind kind)
13919{
13920 Py_UCS4 maxchar;
13921
13922 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13923 assert(writer->kind < kind);
13924
13925 switch (kind)
13926 {
13927 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13928 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13929 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13930 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013931 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013932 }
13933
13934 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13935}
13936
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013937static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013938_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013939{
Victor Stinner2740e462016-09-06 16:58:36 -070013940 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013941 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13942 return -1;
13943 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13944 writer->pos++;
13945 return 0;
13946}
13947
13948int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013949_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13950{
13951 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13952}
13953
13954int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013955_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13956{
13957 Py_UCS4 maxchar;
13958 Py_ssize_t len;
13959
13960 if (PyUnicode_READY(str) == -1)
13961 return -1;
13962 len = PyUnicode_GET_LENGTH(str);
13963 if (len == 0)
13964 return 0;
13965 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13966 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013967 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013968 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013969 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013970 Py_INCREF(str);
13971 writer->buffer = str;
13972 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013973 writer->pos += len;
13974 return 0;
13975 }
13976 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13977 return -1;
13978 }
13979 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13980 str, 0, len);
13981 writer->pos += len;
13982 return 0;
13983}
13984
Victor Stinnere215d962012-10-06 23:03:36 +020013985int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013986_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13987 Py_ssize_t start, Py_ssize_t end)
13988{
13989 Py_UCS4 maxchar;
13990 Py_ssize_t len;
13991
13992 if (PyUnicode_READY(str) == -1)
13993 return -1;
13994
13995 assert(0 <= start);
13996 assert(end <= PyUnicode_GET_LENGTH(str));
13997 assert(start <= end);
13998
13999 if (end == 0)
14000 return 0;
14001
14002 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14003 return _PyUnicodeWriter_WriteStr(writer, str);
14004
14005 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14006 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14007 else
14008 maxchar = writer->maxchar;
14009 len = end - start;
14010
14011 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14012 return -1;
14013
14014 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14015 str, start, len);
14016 writer->pos += len;
14017 return 0;
14018}
14019
14020int
Victor Stinner4a587072013-11-19 12:54:53 +010014021_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14022 const char *ascii, Py_ssize_t len)
14023{
14024 if (len == -1)
14025 len = strlen(ascii);
14026
Andy Lestere6be9b52020-02-11 20:28:35 -060014027 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014028
14029 if (writer->buffer == NULL && !writer->overallocate) {
14030 PyObject *str;
14031
14032 str = _PyUnicode_FromASCII(ascii, len);
14033 if (str == NULL)
14034 return -1;
14035
14036 writer->readonly = 1;
14037 writer->buffer = str;
14038 _PyUnicodeWriter_Update(writer);
14039 writer->pos += len;
14040 return 0;
14041 }
14042
14043 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14044 return -1;
14045
14046 switch (writer->kind)
14047 {
14048 case PyUnicode_1BYTE_KIND:
14049 {
14050 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14051 Py_UCS1 *data = writer->data;
14052
Christian Heimesf051e432016-09-13 20:22:02 +020014053 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014054 break;
14055 }
14056 case PyUnicode_2BYTE_KIND:
14057 {
14058 _PyUnicode_CONVERT_BYTES(
14059 Py_UCS1, Py_UCS2,
14060 ascii, ascii + len,
14061 (Py_UCS2 *)writer->data + writer->pos);
14062 break;
14063 }
14064 case PyUnicode_4BYTE_KIND:
14065 {
14066 _PyUnicode_CONVERT_BYTES(
14067 Py_UCS1, Py_UCS4,
14068 ascii, ascii + len,
14069 (Py_UCS4 *)writer->data + writer->pos);
14070 break;
14071 }
14072 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014073 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014074 }
14075
14076 writer->pos += len;
14077 return 0;
14078}
14079
14080int
14081_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14082 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014083{
14084 Py_UCS4 maxchar;
14085
Andy Lestere6be9b52020-02-11 20:28:35 -060014086 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014087 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14088 return -1;
14089 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14090 writer->pos += len;
14091 return 0;
14092}
14093
Victor Stinnerd3f08822012-05-29 12:57:52 +020014094PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014095_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014096{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014097 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014098
Victor Stinnerd3f08822012-05-29 12:57:52 +020014099 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014100 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014101 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014102 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014103
14104 str = writer->buffer;
14105 writer->buffer = NULL;
14106
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014107 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014108 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14109 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014110 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014111
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014112 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14113 PyObject *str2;
14114 str2 = resize_compact(str, writer->pos);
14115 if (str2 == NULL) {
14116 Py_DECREF(str);
14117 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014118 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014119 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014120 }
14121
Victor Stinner15a0bd32013-07-08 22:29:55 +020014122 assert(_PyUnicode_CheckConsistency(str, 1));
14123 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014124}
14125
Victor Stinnerd3f08822012-05-29 12:57:52 +020014126void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014127_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014128{
14129 Py_CLEAR(writer->buffer);
14130}
14131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014132#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014133
14134PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014135 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014136\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014137Return a formatted version of S, using substitutions from args and kwargs.\n\
14138The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014139
Eric Smith27bbca62010-11-04 17:06:58 +000014140PyDoc_STRVAR(format_map__doc__,
14141 "S.format_map(mapping) -> str\n\
14142\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014143Return a formatted version of S, using substitutions from mapping.\n\
14144The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014145
INADA Naoki3ae20562017-01-16 20:41:20 +090014146/*[clinic input]
14147str.__format__ as unicode___format__
14148
14149 format_spec: unicode
14150 /
14151
14152Return a formatted version of the string as described by format_spec.
14153[clinic start generated code]*/
14154
Eric Smith4a7d76d2008-05-30 18:10:19 +000014155static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014156unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014157/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014158{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014159 _PyUnicodeWriter writer;
14160 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014161
Victor Stinnerd3f08822012-05-29 12:57:52 +020014162 if (PyUnicode_READY(self) == -1)
14163 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014164 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014165 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14166 self, format_spec, 0,
14167 PyUnicode_GET_LENGTH(format_spec));
14168 if (ret == -1) {
14169 _PyUnicodeWriter_Dealloc(&writer);
14170 return NULL;
14171 }
14172 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014173}
14174
INADA Naoki3ae20562017-01-16 20:41:20 +090014175/*[clinic input]
14176str.__sizeof__ as unicode_sizeof
14177
14178Return the size of the string in memory, in bytes.
14179[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014180
14181static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014182unicode_sizeof_impl(PyObject *self)
14183/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014184{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014185 Py_ssize_t size;
14186
14187 /* If it's a compact object, account for base structure +
14188 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014189 if (PyUnicode_IS_COMPACT_ASCII(self))
14190 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14191 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014192 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014193 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014194 else {
14195 /* If it is a two-block object, account for base object, and
14196 for character block if present. */
14197 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014198 if (_PyUnicode_DATA_ANY(self))
14199 size += (PyUnicode_GET_LENGTH(self) + 1) *
14200 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014201 }
14202 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014203 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014204 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14205 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14206 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14207 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014208
14209 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014210}
14211
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014212static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014213unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014214{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014215 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014216 if (!copy)
14217 return NULL;
14218 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014219}
14220
Guido van Rossumd57fd912000-03-10 22:53:23 +000014221static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014222 UNICODE_ENCODE_METHODDEF
14223 UNICODE_REPLACE_METHODDEF
14224 UNICODE_SPLIT_METHODDEF
14225 UNICODE_RSPLIT_METHODDEF
14226 UNICODE_JOIN_METHODDEF
14227 UNICODE_CAPITALIZE_METHODDEF
14228 UNICODE_CASEFOLD_METHODDEF
14229 UNICODE_TITLE_METHODDEF
14230 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014231 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014232 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014233 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014234 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014235 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014236 UNICODE_LJUST_METHODDEF
14237 UNICODE_LOWER_METHODDEF
14238 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014239 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14240 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014241 UNICODE_RJUST_METHODDEF
14242 UNICODE_RSTRIP_METHODDEF
14243 UNICODE_RPARTITION_METHODDEF
14244 UNICODE_SPLITLINES_METHODDEF
14245 UNICODE_STRIP_METHODDEF
14246 UNICODE_SWAPCASE_METHODDEF
14247 UNICODE_TRANSLATE_METHODDEF
14248 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014249 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14250 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014251 UNICODE_REMOVEPREFIX_METHODDEF
14252 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014253 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014254 UNICODE_ISLOWER_METHODDEF
14255 UNICODE_ISUPPER_METHODDEF
14256 UNICODE_ISTITLE_METHODDEF
14257 UNICODE_ISSPACE_METHODDEF
14258 UNICODE_ISDECIMAL_METHODDEF
14259 UNICODE_ISDIGIT_METHODDEF
14260 UNICODE_ISNUMERIC_METHODDEF
14261 UNICODE_ISALPHA_METHODDEF
14262 UNICODE_ISALNUM_METHODDEF
14263 UNICODE_ISIDENTIFIER_METHODDEF
14264 UNICODE_ISPRINTABLE_METHODDEF
14265 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014266 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014267 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014268 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014269 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014270 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014271#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014272 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014273 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014274#endif
14275
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014276 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014277 {NULL, NULL}
14278};
14279
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014280static PyObject *
14281unicode_mod(PyObject *v, PyObject *w)
14282{
Brian Curtindfc80e32011-08-10 20:28:54 -050014283 if (!PyUnicode_Check(v))
14284 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014285 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014286}
14287
14288static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014289 0, /*nb_add*/
14290 0, /*nb_subtract*/
14291 0, /*nb_multiply*/
14292 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014293};
14294
Guido van Rossumd57fd912000-03-10 22:53:23 +000014295static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014296 (lenfunc) unicode_length, /* sq_length */
14297 PyUnicode_Concat, /* sq_concat */
14298 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14299 (ssizeargfunc) unicode_getitem, /* sq_item */
14300 0, /* sq_slice */
14301 0, /* sq_ass_item */
14302 0, /* sq_ass_slice */
14303 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014304};
14305
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014306static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014307unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014308{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014309 if (PyUnicode_READY(self) == -1)
14310 return NULL;
14311
Victor Stinnera15e2602020-04-08 02:01:56 +020014312 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014313 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014314 if (i == -1 && PyErr_Occurred())
14315 return NULL;
14316 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014317 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014318 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014319 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014320 Py_ssize_t start, stop, step, slicelength, i;
14321 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014322 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014323 const void *src_data;
14324 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014325 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014326 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014327
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014328 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014329 return NULL;
14330 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014331 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14332 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014333
14334 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014335 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014336 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014337 slicelength == PyUnicode_GET_LENGTH(self)) {
14338 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014339 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014340 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014341 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014342 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014343 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014344 src_kind = PyUnicode_KIND(self);
14345 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014346 if (!PyUnicode_IS_ASCII(self)) {
14347 kind_limit = kind_maxchar_limit(src_kind);
14348 max_char = 0;
14349 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14350 ch = PyUnicode_READ(src_kind, src_data, cur);
14351 if (ch > max_char) {
14352 max_char = ch;
14353 if (max_char >= kind_limit)
14354 break;
14355 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014356 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014357 }
Victor Stinner55c99112011-10-13 01:17:06 +020014358 else
14359 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014360 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014361 if (result == NULL)
14362 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014363 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014364 dest_data = PyUnicode_DATA(result);
14365
14366 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014367 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14368 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014369 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014370 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014371 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014372 } else {
14373 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14374 return NULL;
14375 }
14376}
14377
14378static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014379 (lenfunc)unicode_length, /* mp_length */
14380 (binaryfunc)unicode_subscript, /* mp_subscript */
14381 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014382};
14383
Guido van Rossumd57fd912000-03-10 22:53:23 +000014384
Guido van Rossumd57fd912000-03-10 22:53:23 +000014385/* Helpers for PyUnicode_Format() */
14386
Victor Stinnera47082312012-10-04 02:19:54 +020014387struct unicode_formatter_t {
14388 PyObject *args;
14389 int args_owned;
14390 Py_ssize_t arglen, argidx;
14391 PyObject *dict;
14392
14393 enum PyUnicode_Kind fmtkind;
14394 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014395 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014396 PyObject *fmtstr;
14397
14398 _PyUnicodeWriter writer;
14399};
14400
14401struct unicode_format_arg_t {
14402 Py_UCS4 ch;
14403 int flags;
14404 Py_ssize_t width;
14405 int prec;
14406 int sign;
14407};
14408
Guido van Rossumd57fd912000-03-10 22:53:23 +000014409static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014410unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014411{
Victor Stinnera47082312012-10-04 02:19:54 +020014412 Py_ssize_t argidx = ctx->argidx;
14413
14414 if (argidx < ctx->arglen) {
14415 ctx->argidx++;
14416 if (ctx->arglen < 0)
14417 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014418 else
Victor Stinnera47082312012-10-04 02:19:54 +020014419 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014420 }
14421 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014422 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014423 return NULL;
14424}
14425
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014426/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014427
Victor Stinnera47082312012-10-04 02:19:54 +020014428/* Format a float into the writer if the writer is not NULL, or into *p_output
14429 otherwise.
14430
14431 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014432static int
Victor Stinnera47082312012-10-04 02:19:54 +020014433formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14434 PyObject **p_output,
14435 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014436{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014437 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014438 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014439 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014440 int prec;
14441 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014442
Guido van Rossumd57fd912000-03-10 22:53:23 +000014443 x = PyFloat_AsDouble(v);
14444 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014445 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014446
Victor Stinnera47082312012-10-04 02:19:54 +020014447 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014448 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014449 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014450
Victor Stinnera47082312012-10-04 02:19:54 +020014451 if (arg->flags & F_ALT)
14452 dtoa_flags = Py_DTSF_ALT;
14453 else
14454 dtoa_flags = 0;
14455 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014456 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014457 return -1;
14458 len = strlen(p);
14459 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014460 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014461 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014462 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014463 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014464 }
14465 else
14466 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014467 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014468 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014469}
14470
Victor Stinnerd0880d52012-04-27 23:40:13 +020014471/* formatlong() emulates the format codes d, u, o, x and X, and
14472 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14473 * Python's regular ints.
14474 * Return value: a new PyUnicodeObject*, or NULL if error.
14475 * The output string is of the form
14476 * "-"? ("0x" | "0X")? digit+
14477 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14478 * set in flags. The case of hex digits will be correct,
14479 * There will be at least prec digits, zero-filled on the left if
14480 * necessary to get that many.
14481 * val object to be converted
14482 * flags bitmask of format flags; only F_ALT is looked at
14483 * prec minimum number of digits; 0-fill on left if needed
14484 * type a character in [duoxX]; u acts the same as d
14485 *
14486 * CAUTION: o, x and X conversions on regular ints can never
14487 * produce a '-' sign, but can for Python's unbounded ints.
14488 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014489PyObject *
14490_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014491{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014492 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014493 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014494 Py_ssize_t i;
14495 int sign; /* 1 if '-', else 0 */
14496 int len; /* number of characters */
14497 Py_ssize_t llen;
14498 int numdigits; /* len == numnondigits + numdigits */
14499 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014500
Victor Stinnerd0880d52012-04-27 23:40:13 +020014501 /* Avoid exceeding SSIZE_T_MAX */
14502 if (prec > INT_MAX-3) {
14503 PyErr_SetString(PyExc_OverflowError,
14504 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014505 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014506 }
14507
14508 assert(PyLong_Check(val));
14509
14510 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014511 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014512 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014513 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014514 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014515 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014516 /* int and int subclasses should print numerically when a numeric */
14517 /* format code is used (see issue18780) */
14518 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014519 break;
14520 case 'o':
14521 numnondigits = 2;
14522 result = PyNumber_ToBase(val, 8);
14523 break;
14524 case 'x':
14525 case 'X':
14526 numnondigits = 2;
14527 result = PyNumber_ToBase(val, 16);
14528 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014529 }
14530 if (!result)
14531 return NULL;
14532
14533 assert(unicode_modifiable(result));
14534 assert(PyUnicode_IS_READY(result));
14535 assert(PyUnicode_IS_ASCII(result));
14536
14537 /* To modify the string in-place, there can only be one reference. */
14538 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014539 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014540 PyErr_BadInternalCall();
14541 return NULL;
14542 }
14543 buf = PyUnicode_DATA(result);
14544 llen = PyUnicode_GET_LENGTH(result);
14545 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014546 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014547 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014548 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014549 return NULL;
14550 }
14551 len = (int)llen;
14552 sign = buf[0] == '-';
14553 numnondigits += sign;
14554 numdigits = len - numnondigits;
14555 assert(numdigits > 0);
14556
14557 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014558 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014559 (type == 'o' || type == 'x' || type == 'X'))) {
14560 assert(buf[sign] == '0');
14561 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14562 buf[sign+1] == 'o');
14563 numnondigits -= 2;
14564 buf += 2;
14565 len -= 2;
14566 if (sign)
14567 buf[0] = '-';
14568 assert(len == numnondigits + numdigits);
14569 assert(numdigits > 0);
14570 }
14571
14572 /* Fill with leading zeroes to meet minimum width. */
14573 if (prec > numdigits) {
14574 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14575 numnondigits + prec);
14576 char *b1;
14577 if (!r1) {
14578 Py_DECREF(result);
14579 return NULL;
14580 }
14581 b1 = PyBytes_AS_STRING(r1);
14582 for (i = 0; i < numnondigits; ++i)
14583 *b1++ = *buf++;
14584 for (i = 0; i < prec - numdigits; i++)
14585 *b1++ = '0';
14586 for (i = 0; i < numdigits; i++)
14587 *b1++ = *buf++;
14588 *b1 = '\0';
14589 Py_DECREF(result);
14590 result = r1;
14591 buf = PyBytes_AS_STRING(result);
14592 len = numnondigits + prec;
14593 }
14594
14595 /* Fix up case for hex conversions. */
14596 if (type == 'X') {
14597 /* Need to convert all lower case letters to upper case.
14598 and need to convert 0x to 0X (and -0x to -0X). */
14599 for (i = 0; i < len; i++)
14600 if (buf[i] >= 'a' && buf[i] <= 'x')
14601 buf[i] -= 'a'-'A';
14602 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014603 if (!PyUnicode_Check(result)
14604 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014605 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014606 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014607 Py_DECREF(result);
14608 result = unicode;
14609 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014610 else if (len != PyUnicode_GET_LENGTH(result)) {
14611 if (PyUnicode_Resize(&result, len) < 0)
14612 Py_CLEAR(result);
14613 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014614 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014615}
14616
Ethan Furmandf3ed242014-01-05 06:50:30 -080014617/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014618 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014619 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014620 * -1 and raise an exception on error */
14621static int
Victor Stinnera47082312012-10-04 02:19:54 +020014622mainformatlong(PyObject *v,
14623 struct unicode_format_arg_t *arg,
14624 PyObject **p_output,
14625 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014626{
14627 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014628 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014629
14630 if (!PyNumber_Check(v))
14631 goto wrongtype;
14632
Ethan Furman9ab74802014-03-21 06:38:46 -070014633 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014634 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014635 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014636 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014637 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014638 if (PyErr_ExceptionMatches(PyExc_TypeError))
14639 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014640 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014641 }
14642 }
14643 else {
14644 iobj = PyNumber_Long(v);
14645 if (iobj == NULL ) {
14646 if (PyErr_ExceptionMatches(PyExc_TypeError))
14647 goto wrongtype;
14648 return -1;
14649 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014650 }
14651 assert(PyLong_Check(iobj));
14652 }
14653 else {
14654 iobj = v;
14655 Py_INCREF(iobj);
14656 }
14657
14658 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014659 && arg->width == -1 && arg->prec == -1
14660 && !(arg->flags & (F_SIGN | F_BLANK))
14661 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014662 {
14663 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014664 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014665 int base;
14666
Victor Stinnera47082312012-10-04 02:19:54 +020014667 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014668 {
14669 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014670 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014671 case 'd':
14672 case 'i':
14673 case 'u':
14674 base = 10;
14675 break;
14676 case 'o':
14677 base = 8;
14678 break;
14679 case 'x':
14680 case 'X':
14681 base = 16;
14682 break;
14683 }
14684
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014685 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14686 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014687 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014688 }
14689 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014690 return 1;
14691 }
14692
Ethan Furmanb95b5612015-01-23 20:05:18 -080014693 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014694 Py_DECREF(iobj);
14695 if (res == NULL)
14696 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014697 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014698 return 0;
14699
14700wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014701 switch(type)
14702 {
14703 case 'o':
14704 case 'x':
14705 case 'X':
14706 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014707 "%%%c format: an integer is required, "
14708 "not %.200s",
14709 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014710 break;
14711 default:
14712 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014713 "%%%c format: a number is required, "
14714 "not %.200s",
14715 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014716 break;
14717 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014718 return -1;
14719}
14720
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014721static Py_UCS4
14722formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014723{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014724 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014725 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014726 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014727 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014728 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014729 goto onError;
14730 }
14731 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014732 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014733 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014734 /* make sure number is a type of integer */
14735 if (!PyLong_Check(v)) {
14736 iobj = PyNumber_Index(v);
14737 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014738 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014739 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014740 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014741 Py_DECREF(iobj);
14742 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014743 else {
14744 x = PyLong_AsLong(v);
14745 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014746 if (x == -1 && PyErr_Occurred())
14747 goto onError;
14748
Victor Stinner8faf8212011-12-08 22:14:11 +010014749 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014750 PyErr_SetString(PyExc_OverflowError,
14751 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014752 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014753 }
14754
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014755 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014756 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014757
Benjamin Peterson29060642009-01-31 22:14:21 +000014758 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014759 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014760 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014761 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014762}
14763
Victor Stinnera47082312012-10-04 02:19:54 +020014764/* Parse options of an argument: flags, width, precision.
14765 Handle also "%(name)" syntax.
14766
14767 Return 0 if the argument has been formatted into arg->str.
14768 Return 1 if the argument has been written into ctx->writer,
14769 Raise an exception and return -1 on error. */
14770static int
14771unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14772 struct unicode_format_arg_t *arg)
14773{
14774#define FORMAT_READ(ctx) \
14775 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14776
14777 PyObject *v;
14778
Victor Stinnera47082312012-10-04 02:19:54 +020014779 if (arg->ch == '(') {
14780 /* Get argument value from a dictionary. Example: "%(name)s". */
14781 Py_ssize_t keystart;
14782 Py_ssize_t keylen;
14783 PyObject *key;
14784 int pcount = 1;
14785
14786 if (ctx->dict == NULL) {
14787 PyErr_SetString(PyExc_TypeError,
14788 "format requires a mapping");
14789 return -1;
14790 }
14791 ++ctx->fmtpos;
14792 --ctx->fmtcnt;
14793 keystart = ctx->fmtpos;
14794 /* Skip over balanced parentheses */
14795 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14796 arg->ch = FORMAT_READ(ctx);
14797 if (arg->ch == ')')
14798 --pcount;
14799 else if (arg->ch == '(')
14800 ++pcount;
14801 ctx->fmtpos++;
14802 }
14803 keylen = ctx->fmtpos - keystart - 1;
14804 if (ctx->fmtcnt < 0 || pcount > 0) {
14805 PyErr_SetString(PyExc_ValueError,
14806 "incomplete format key");
14807 return -1;
14808 }
14809 key = PyUnicode_Substring(ctx->fmtstr,
14810 keystart, keystart + keylen);
14811 if (key == NULL)
14812 return -1;
14813 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014814 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014815 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014816 }
14817 ctx->args = PyObject_GetItem(ctx->dict, key);
14818 Py_DECREF(key);
14819 if (ctx->args == NULL)
14820 return -1;
14821 ctx->args_owned = 1;
14822 ctx->arglen = -1;
14823 ctx->argidx = -2;
14824 }
14825
14826 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014827 while (--ctx->fmtcnt >= 0) {
14828 arg->ch = FORMAT_READ(ctx);
14829 ctx->fmtpos++;
14830 switch (arg->ch) {
14831 case '-': arg->flags |= F_LJUST; continue;
14832 case '+': arg->flags |= F_SIGN; continue;
14833 case ' ': arg->flags |= F_BLANK; continue;
14834 case '#': arg->flags |= F_ALT; continue;
14835 case '0': arg->flags |= F_ZERO; continue;
14836 }
14837 break;
14838 }
14839
14840 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014841 if (arg->ch == '*') {
14842 v = unicode_format_getnextarg(ctx);
14843 if (v == NULL)
14844 return -1;
14845 if (!PyLong_Check(v)) {
14846 PyErr_SetString(PyExc_TypeError,
14847 "* wants int");
14848 return -1;
14849 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014850 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014851 if (arg->width == -1 && PyErr_Occurred())
14852 return -1;
14853 if (arg->width < 0) {
14854 arg->flags |= F_LJUST;
14855 arg->width = -arg->width;
14856 }
14857 if (--ctx->fmtcnt >= 0) {
14858 arg->ch = FORMAT_READ(ctx);
14859 ctx->fmtpos++;
14860 }
14861 }
14862 else if (arg->ch >= '0' && arg->ch <= '9') {
14863 arg->width = arg->ch - '0';
14864 while (--ctx->fmtcnt >= 0) {
14865 arg->ch = FORMAT_READ(ctx);
14866 ctx->fmtpos++;
14867 if (arg->ch < '0' || arg->ch > '9')
14868 break;
14869 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14870 mixing signed and unsigned comparison. Since arg->ch is between
14871 '0' and '9', casting to int is safe. */
14872 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14873 PyErr_SetString(PyExc_ValueError,
14874 "width too big");
14875 return -1;
14876 }
14877 arg->width = arg->width*10 + (arg->ch - '0');
14878 }
14879 }
14880
14881 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014882 if (arg->ch == '.') {
14883 arg->prec = 0;
14884 if (--ctx->fmtcnt >= 0) {
14885 arg->ch = FORMAT_READ(ctx);
14886 ctx->fmtpos++;
14887 }
14888 if (arg->ch == '*') {
14889 v = unicode_format_getnextarg(ctx);
14890 if (v == NULL)
14891 return -1;
14892 if (!PyLong_Check(v)) {
14893 PyErr_SetString(PyExc_TypeError,
14894 "* wants int");
14895 return -1;
14896 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014897 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014898 if (arg->prec == -1 && PyErr_Occurred())
14899 return -1;
14900 if (arg->prec < 0)
14901 arg->prec = 0;
14902 if (--ctx->fmtcnt >= 0) {
14903 arg->ch = FORMAT_READ(ctx);
14904 ctx->fmtpos++;
14905 }
14906 }
14907 else if (arg->ch >= '0' && arg->ch <= '9') {
14908 arg->prec = arg->ch - '0';
14909 while (--ctx->fmtcnt >= 0) {
14910 arg->ch = FORMAT_READ(ctx);
14911 ctx->fmtpos++;
14912 if (arg->ch < '0' || arg->ch > '9')
14913 break;
14914 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14915 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014916 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014917 return -1;
14918 }
14919 arg->prec = arg->prec*10 + (arg->ch - '0');
14920 }
14921 }
14922 }
14923
14924 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14925 if (ctx->fmtcnt >= 0) {
14926 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14927 if (--ctx->fmtcnt >= 0) {
14928 arg->ch = FORMAT_READ(ctx);
14929 ctx->fmtpos++;
14930 }
14931 }
14932 }
14933 if (ctx->fmtcnt < 0) {
14934 PyErr_SetString(PyExc_ValueError,
14935 "incomplete format");
14936 return -1;
14937 }
14938 return 0;
14939
14940#undef FORMAT_READ
14941}
14942
14943/* Format one argument. Supported conversion specifiers:
14944
14945 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014946 - "i", "d", "u": int or float
14947 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014948 - "e", "E", "f", "F", "g", "G": float
14949 - "c": int or str (1 character)
14950
Victor Stinner8dbd4212012-12-04 09:30:24 +010014951 When possible, the output is written directly into the Unicode writer
14952 (ctx->writer). A string is created when padding is required.
14953
Victor Stinnera47082312012-10-04 02:19:54 +020014954 Return 0 if the argument has been formatted into *p_str,
14955 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014956 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014957static int
14958unicode_format_arg_format(struct unicode_formatter_t *ctx,
14959 struct unicode_format_arg_t *arg,
14960 PyObject **p_str)
14961{
14962 PyObject *v;
14963 _PyUnicodeWriter *writer = &ctx->writer;
14964
14965 if (ctx->fmtcnt == 0)
14966 ctx->writer.overallocate = 0;
14967
Victor Stinnera47082312012-10-04 02:19:54 +020014968 v = unicode_format_getnextarg(ctx);
14969 if (v == NULL)
14970 return -1;
14971
Victor Stinnera47082312012-10-04 02:19:54 +020014972
14973 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014974 case 's':
14975 case 'r':
14976 case 'a':
14977 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14978 /* Fast path */
14979 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14980 return -1;
14981 return 1;
14982 }
14983
14984 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14985 *p_str = v;
14986 Py_INCREF(*p_str);
14987 }
14988 else {
14989 if (arg->ch == 's')
14990 *p_str = PyObject_Str(v);
14991 else if (arg->ch == 'r')
14992 *p_str = PyObject_Repr(v);
14993 else
14994 *p_str = PyObject_ASCII(v);
14995 }
14996 break;
14997
14998 case 'i':
14999 case 'd':
15000 case 'u':
15001 case 'o':
15002 case 'x':
15003 case 'X':
15004 {
15005 int ret = mainformatlong(v, arg, p_str, writer);
15006 if (ret != 0)
15007 return ret;
15008 arg->sign = 1;
15009 break;
15010 }
15011
15012 case 'e':
15013 case 'E':
15014 case 'f':
15015 case 'F':
15016 case 'g':
15017 case 'G':
15018 if (arg->width == -1 && arg->prec == -1
15019 && !(arg->flags & (F_SIGN | F_BLANK)))
15020 {
15021 /* Fast path */
15022 if (formatfloat(v, arg, NULL, writer) == -1)
15023 return -1;
15024 return 1;
15025 }
15026
15027 arg->sign = 1;
15028 if (formatfloat(v, arg, p_str, NULL) == -1)
15029 return -1;
15030 break;
15031
15032 case 'c':
15033 {
15034 Py_UCS4 ch = formatchar(v);
15035 if (ch == (Py_UCS4) -1)
15036 return -1;
15037 if (arg->width == -1 && arg->prec == -1) {
15038 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015039 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015040 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015041 return 1;
15042 }
15043 *p_str = PyUnicode_FromOrdinal(ch);
15044 break;
15045 }
15046
15047 default:
15048 PyErr_Format(PyExc_ValueError,
15049 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015050 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015051 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15052 (int)arg->ch,
15053 ctx->fmtpos - 1);
15054 return -1;
15055 }
15056 if (*p_str == NULL)
15057 return -1;
15058 assert (PyUnicode_Check(*p_str));
15059 return 0;
15060}
15061
15062static int
15063unicode_format_arg_output(struct unicode_formatter_t *ctx,
15064 struct unicode_format_arg_t *arg,
15065 PyObject *str)
15066{
15067 Py_ssize_t len;
15068 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015069 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015070 Py_ssize_t pindex;
15071 Py_UCS4 signchar;
15072 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015073 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015074 Py_ssize_t sublen;
15075 _PyUnicodeWriter *writer = &ctx->writer;
15076 Py_UCS4 fill;
15077
15078 fill = ' ';
15079 if (arg->sign && arg->flags & F_ZERO)
15080 fill = '0';
15081
15082 if (PyUnicode_READY(str) == -1)
15083 return -1;
15084
15085 len = PyUnicode_GET_LENGTH(str);
15086 if ((arg->width == -1 || arg->width <= len)
15087 && (arg->prec == -1 || arg->prec >= len)
15088 && !(arg->flags & (F_SIGN | F_BLANK)))
15089 {
15090 /* Fast path */
15091 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15092 return -1;
15093 return 0;
15094 }
15095
15096 /* Truncate the string for "s", "r" and "a" formats
15097 if the precision is set */
15098 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15099 if (arg->prec >= 0 && len > arg->prec)
15100 len = arg->prec;
15101 }
15102
15103 /* Adjust sign and width */
15104 kind = PyUnicode_KIND(str);
15105 pbuf = PyUnicode_DATA(str);
15106 pindex = 0;
15107 signchar = '\0';
15108 if (arg->sign) {
15109 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15110 if (ch == '-' || ch == '+') {
15111 signchar = ch;
15112 len--;
15113 pindex++;
15114 }
15115 else if (arg->flags & F_SIGN)
15116 signchar = '+';
15117 else if (arg->flags & F_BLANK)
15118 signchar = ' ';
15119 else
15120 arg->sign = 0;
15121 }
15122 if (arg->width < len)
15123 arg->width = len;
15124
15125 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015126 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015127 if (!(arg->flags & F_LJUST)) {
15128 if (arg->sign) {
15129 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015130 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015131 }
15132 else {
15133 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015134 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015135 }
15136 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015137 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15138 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015139 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015140 }
15141
Victor Stinnera47082312012-10-04 02:19:54 +020015142 buflen = arg->width;
15143 if (arg->sign && len == arg->width)
15144 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015145 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015146 return -1;
15147
15148 /* Write the sign if needed */
15149 if (arg->sign) {
15150 if (fill != ' ') {
15151 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15152 writer->pos += 1;
15153 }
15154 if (arg->width > len)
15155 arg->width--;
15156 }
15157
15158 /* Write the numeric prefix for "x", "X" and "o" formats
15159 if the alternate form is used.
15160 For example, write "0x" for the "%#x" format. */
15161 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15162 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15163 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15164 if (fill != ' ') {
15165 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15166 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15167 writer->pos += 2;
15168 pindex += 2;
15169 }
15170 arg->width -= 2;
15171 if (arg->width < 0)
15172 arg->width = 0;
15173 len -= 2;
15174 }
15175
15176 /* Pad left with the fill character if needed */
15177 if (arg->width > len && !(arg->flags & F_LJUST)) {
15178 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015179 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015180 writer->pos += sublen;
15181 arg->width = len;
15182 }
15183
15184 /* If padding with spaces: write sign if needed and/or numeric prefix if
15185 the alternate form is used */
15186 if (fill == ' ') {
15187 if (arg->sign) {
15188 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15189 writer->pos += 1;
15190 }
15191 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15192 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15193 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15194 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15195 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15196 writer->pos += 2;
15197 pindex += 2;
15198 }
15199 }
15200
15201 /* Write characters */
15202 if (len) {
15203 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15204 str, pindex, len);
15205 writer->pos += len;
15206 }
15207
15208 /* Pad right with the fill character if needed */
15209 if (arg->width > len) {
15210 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015211 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015212 writer->pos += sublen;
15213 }
15214 return 0;
15215}
15216
15217/* Helper of PyUnicode_Format(): format one arg.
15218 Return 0 on success, raise an exception and return -1 on error. */
15219static int
15220unicode_format_arg(struct unicode_formatter_t *ctx)
15221{
15222 struct unicode_format_arg_t arg;
15223 PyObject *str;
15224 int ret;
15225
Victor Stinner8dbd4212012-12-04 09:30:24 +010015226 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015227 if (arg.ch == '%') {
15228 ctx->fmtpos++;
15229 ctx->fmtcnt--;
15230 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15231 return -1;
15232 return 0;
15233 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015234 arg.flags = 0;
15235 arg.width = -1;
15236 arg.prec = -1;
15237 arg.sign = 0;
15238 str = NULL;
15239
Victor Stinnera47082312012-10-04 02:19:54 +020015240 ret = unicode_format_arg_parse(ctx, &arg);
15241 if (ret == -1)
15242 return -1;
15243
15244 ret = unicode_format_arg_format(ctx, &arg, &str);
15245 if (ret == -1)
15246 return -1;
15247
15248 if (ret != 1) {
15249 ret = unicode_format_arg_output(ctx, &arg, str);
15250 Py_DECREF(str);
15251 if (ret == -1)
15252 return -1;
15253 }
15254
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015255 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015256 PyErr_SetString(PyExc_TypeError,
15257 "not all arguments converted during string formatting");
15258 return -1;
15259 }
15260 return 0;
15261}
15262
Alexander Belopolsky40018472011-02-26 01:02:56 +000015263PyObject *
15264PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015265{
Victor Stinnera47082312012-10-04 02:19:54 +020015266 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015267
Guido van Rossumd57fd912000-03-10 22:53:23 +000015268 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015269 PyErr_BadInternalCall();
15270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015271 }
Victor Stinnera47082312012-10-04 02:19:54 +020015272
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015273 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015274 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015275
15276 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015277 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15278 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15279 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15280 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015281
Victor Stinner8f674cc2013-04-17 23:02:17 +020015282 _PyUnicodeWriter_Init(&ctx.writer);
15283 ctx.writer.min_length = ctx.fmtcnt + 100;
15284 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015285
Guido van Rossumd57fd912000-03-10 22:53:23 +000015286 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015287 ctx.arglen = PyTuple_Size(args);
15288 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015289 }
15290 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015291 ctx.arglen = -1;
15292 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015293 }
Victor Stinnera47082312012-10-04 02:19:54 +020015294 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015295 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015296 ctx.dict = args;
15297 else
15298 ctx.dict = NULL;
15299 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015300
Victor Stinnera47082312012-10-04 02:19:54 +020015301 while (--ctx.fmtcnt >= 0) {
15302 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015303 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015304
15305 nonfmtpos = ctx.fmtpos++;
15306 while (ctx.fmtcnt >= 0 &&
15307 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15308 ctx.fmtpos++;
15309 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015310 }
Victor Stinnera47082312012-10-04 02:19:54 +020015311 if (ctx.fmtcnt < 0) {
15312 ctx.fmtpos--;
15313 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015314 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015315
Victor Stinnercfc4c132013-04-03 01:48:39 +020015316 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15317 nonfmtpos, ctx.fmtpos) < 0)
15318 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015319 }
15320 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015321 ctx.fmtpos++;
15322 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015323 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015324 }
15325 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015326
Victor Stinnera47082312012-10-04 02:19:54 +020015327 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015328 PyErr_SetString(PyExc_TypeError,
15329 "not all arguments converted during string formatting");
15330 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015331 }
15332
Victor Stinnera47082312012-10-04 02:19:54 +020015333 if (ctx.args_owned) {
15334 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015335 }
Victor Stinnera47082312012-10-04 02:19:54 +020015336 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015337
Benjamin Peterson29060642009-01-31 22:14:21 +000015338 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015339 _PyUnicodeWriter_Dealloc(&ctx.writer);
15340 if (ctx.args_owned) {
15341 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015342 }
15343 return NULL;
15344}
15345
Jeremy Hylton938ace62002-07-17 16:30:39 +000015346static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015347unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15348
Tim Peters6d6c1a32001-08-02 04:15:00 +000015349static PyObject *
15350unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15351{
Benjamin Peterson29060642009-01-31 22:14:21 +000015352 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015353 static char *kwlist[] = {"object", "encoding", "errors", 0};
15354 char *encoding = NULL;
15355 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015356
Benjamin Peterson14339b62009-01-31 16:36:08 +000015357 if (type != &PyUnicode_Type)
15358 return unicode_subtype_new(type, args, kwds);
15359 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015360 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015361 return NULL;
15362 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015363 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015364 if (encoding == NULL && errors == NULL)
15365 return PyObject_Str(x);
15366 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015367 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015368}
15369
Guido van Rossume023fe02001-08-30 03:12:59 +000015370static PyObject *
15371unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15372{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015373 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015374 Py_ssize_t length, char_size;
15375 int share_wstr, share_utf8;
15376 unsigned int kind;
15377 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015378
Benjamin Peterson14339b62009-01-31 16:36:08 +000015379 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015380
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015381 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015382 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015383 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015384 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015385 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015386 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015387 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015388 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015389
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015390 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015391 if (self == NULL) {
15392 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015393 return NULL;
15394 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015395 kind = PyUnicode_KIND(unicode);
15396 length = PyUnicode_GET_LENGTH(unicode);
15397
15398 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015399#ifdef Py_DEBUG
15400 _PyUnicode_HASH(self) = -1;
15401#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015402 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015403#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015404 _PyUnicode_STATE(self).interned = 0;
15405 _PyUnicode_STATE(self).kind = kind;
15406 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015407 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015408 _PyUnicode_STATE(self).ready = 1;
15409 _PyUnicode_WSTR(self) = NULL;
15410 _PyUnicode_UTF8_LENGTH(self) = 0;
15411 _PyUnicode_UTF8(self) = NULL;
15412 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015413 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015414
15415 share_utf8 = 0;
15416 share_wstr = 0;
15417 if (kind == PyUnicode_1BYTE_KIND) {
15418 char_size = 1;
15419 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15420 share_utf8 = 1;
15421 }
15422 else if (kind == PyUnicode_2BYTE_KIND) {
15423 char_size = 2;
15424 if (sizeof(wchar_t) == 2)
15425 share_wstr = 1;
15426 }
15427 else {
15428 assert(kind == PyUnicode_4BYTE_KIND);
15429 char_size = 4;
15430 if (sizeof(wchar_t) == 4)
15431 share_wstr = 1;
15432 }
15433
15434 /* Ensure we won't overflow the length. */
15435 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15436 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015437 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015438 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015439 data = PyObject_MALLOC((length + 1) * char_size);
15440 if (data == NULL) {
15441 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015442 goto onError;
15443 }
15444
Victor Stinnerc3c74152011-10-02 20:39:55 +020015445 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015446 if (share_utf8) {
15447 _PyUnicode_UTF8_LENGTH(self) = length;
15448 _PyUnicode_UTF8(self) = data;
15449 }
15450 if (share_wstr) {
15451 _PyUnicode_WSTR_LENGTH(self) = length;
15452 _PyUnicode_WSTR(self) = (wchar_t *)data;
15453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015454
Christian Heimesf051e432016-09-13 20:22:02 +020015455 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015456 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015457 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015458#ifdef Py_DEBUG
15459 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15460#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015461 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015462 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015463
15464onError:
15465 Py_DECREF(unicode);
15466 Py_DECREF(self);
15467 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015468}
15469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015470PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015471"str(object='') -> str\n\
15472str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015473\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015474Create a new string object from the given object. If encoding or\n\
15475errors is specified, then the object must expose a data buffer\n\
15476that will be decoded using the given encoding and error handler.\n\
15477Otherwise, returns the result of object.__str__() (if defined)\n\
15478or repr(object).\n\
15479encoding defaults to sys.getdefaultencoding().\n\
15480errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015481
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015482static PyObject *unicode_iter(PyObject *seq);
15483
Guido van Rossumd57fd912000-03-10 22:53:23 +000015484PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015485 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015486 "str", /* tp_name */
15487 sizeof(PyUnicodeObject), /* tp_basicsize */
15488 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015489 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015490 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015491 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015492 0, /* tp_getattr */
15493 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015494 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015495 unicode_repr, /* tp_repr */
15496 &unicode_as_number, /* tp_as_number */
15497 &unicode_as_sequence, /* tp_as_sequence */
15498 &unicode_as_mapping, /* tp_as_mapping */
15499 (hashfunc) unicode_hash, /* tp_hash*/
15500 0, /* tp_call*/
15501 (reprfunc) unicode_str, /* tp_str */
15502 PyObject_GenericGetAttr, /* tp_getattro */
15503 0, /* tp_setattro */
15504 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015505 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015506 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15507 unicode_doc, /* tp_doc */
15508 0, /* tp_traverse */
15509 0, /* tp_clear */
15510 PyUnicode_RichCompare, /* tp_richcompare */
15511 0, /* tp_weaklistoffset */
15512 unicode_iter, /* tp_iter */
15513 0, /* tp_iternext */
15514 unicode_methods, /* tp_methods */
15515 0, /* tp_members */
15516 0, /* tp_getset */
15517 &PyBaseObject_Type, /* tp_base */
15518 0, /* tp_dict */
15519 0, /* tp_descr_get */
15520 0, /* tp_descr_set */
15521 0, /* tp_dictoffset */
15522 0, /* tp_init */
15523 0, /* tp_alloc */
15524 unicode_new, /* tp_new */
15525 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015526};
15527
15528/* Initialize the Unicode implementation */
15529
Victor Stinner331a6a52019-05-27 16:39:22 +020015530PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015531_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015532{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015533 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015534 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015535 0x000A, /* LINE FEED */
15536 0x000D, /* CARRIAGE RETURN */
15537 0x001C, /* FILE SEPARATOR */
15538 0x001D, /* GROUP SEPARATOR */
15539 0x001E, /* RECORD SEPARATOR */
15540 0x0085, /* NEXT LINE */
15541 0x2028, /* LINE SEPARATOR */
15542 0x2029, /* PARAGRAPH SEPARATOR */
15543 };
15544
Victor Stinner90ed8a62020-06-24 00:34:07 +020015545 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
15546 // optimized to always use state->empty without having to check if it is
15547 // NULL or not.
15548 PyObject *empty = PyUnicode_New(1, 0);
15549 if (empty == NULL) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015550 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015551 }
Victor Stinner90ed8a62020-06-24 00:34:07 +020015552 PyUnicode_1BYTE_DATA(empty)[0] = 0;
15553 _PyUnicode_LENGTH(empty) = 0;
15554 assert(_PyUnicode_CheckConsistency(empty, 1));
15555
15556 struct _Py_unicode_state *state = &tstate->interp->unicode;
15557 assert(state->empty == NULL);
15558 state->empty = empty;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015559
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015560 if (_Py_IsMainInterpreter(tstate)) {
15561 /* initialize the linebreak bloom filter */
15562 bloom_linebreak = make_bloom_mask(
15563 PyUnicode_2BYTE_KIND, linebreak,
15564 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015565
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015566 if (PyType_Ready(&PyUnicode_Type) < 0) {
15567 return _PyStatus_ERR("Can't initialize unicode type");
15568 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015569
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015570 if (PyType_Ready(&EncodingMapType) < 0) {
15571 return _PyStatus_ERR("Can't initialize encoding map type");
15572 }
15573 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15574 return _PyStatus_ERR("Can't initialize field name iterator type");
15575 }
15576 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15577 return _PyStatus_ERR("Can't initialize formatter iter type");
15578 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015579 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015580 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015581}
15582
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015583
Walter Dörwald16807132007-05-25 13:52:07 +000015584void
15585PyUnicode_InternInPlace(PyObject **p)
15586{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015587 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015588#ifdef Py_DEBUG
15589 assert(s != NULL);
15590 assert(_PyUnicode_CHECK(s));
15591#else
Victor Stinner607b1022020-05-05 18:50:30 +020015592 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015593 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015594 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015595#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015596
Benjamin Peterson14339b62009-01-31 16:36:08 +000015597 /* If it's a subclass, we don't really know what putting
15598 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015599 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015600 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015601 }
15602
15603 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015604 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015605 }
15606
15607#ifdef INTERNED_STRINGS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015608 if (interned == NULL) {
15609 interned = PyDict_New();
15610 if (interned == NULL) {
15611 PyErr_Clear(); /* Don't leave an exception */
15612 return;
15613 }
15614 }
Victor Stinner607b1022020-05-05 18:50:30 +020015615
15616 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015617 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015618 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015619 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015620
Berker Peksagced8d4c2016-07-25 04:40:39 +030015621 if (t == NULL) {
15622 PyErr_Clear();
15623 return;
15624 }
Victor Stinner607b1022020-05-05 18:50:30 +020015625
Berker Peksagced8d4c2016-07-25 04:40:39 +030015626 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015627 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015628 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015629 return;
15630 }
Victor Stinner607b1022020-05-05 18:50:30 +020015631
Benjamin Peterson14339b62009-01-31 16:36:08 +000015632 /* The two references in interned are not counted by refcnt.
15633 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015634 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015635 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015636#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015637}
15638
15639void
15640PyUnicode_InternImmortal(PyObject **p)
15641{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015642 PyUnicode_InternInPlace(p);
15643 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015644 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015645 Py_INCREF(*p);
15646 }
Walter Dörwald16807132007-05-25 13:52:07 +000015647}
15648
15649PyObject *
15650PyUnicode_InternFromString(const char *cp)
15651{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015652 PyObject *s = PyUnicode_FromString(cp);
15653 if (s == NULL)
15654 return NULL;
15655 PyUnicode_InternInPlace(&s);
15656 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015657}
15658
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015659
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo string interning so that a leak detector sees a consistent picture.

   Interned strings are not forcibly deallocated.  Instead, each key of the
   `interned` dict gets its uncounted references back (one for an immortal
   string, two for a mortal one) and is flagged SSTATE_NOT_INTERNED; the dict
   is then cleared and released.

   When INTERNED_STATS is defined, the number of strings and their cumulated
   lengths are written to stderr. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }

    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    Py_ssize_t count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %zd interned strings\n", count);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t pos = 0; pos < count; pos++) {
        PyObject *str = PyList_GET_ITEM(keys, pos);
        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }

        int state = PyUnicode_CHECK_INTERNED(str);
        if (state == SSTATE_INTERNED_IMMORTAL) {
            Py_SET_REFCNT(str, Py_REFCNT(str) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else if (state == SSTATE_INTERNED_MORTAL) {
            Py_SET_REFCNT(str, Py_REFCNT(str) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else {
            /* Every key of the dict is expected to be flagged as interned. */
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr,
            "total size of all interned strings: %zd/%zd mortal/immortal\n",
            mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015719
15720
15721/********************* Unicode Iterator **************************/
15722
15723typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015724 PyObject_HEAD
15725 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015726 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015727} unicodeiterobject;
15728
15729static void
15730unicodeiter_dealloc(unicodeiterobject *it)
15731{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015732 _PyObject_GC_UNTRACK(it);
15733 Py_XDECREF(it->it_seq);
15734 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015735}
15736
15737static int
15738unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15739{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015740 Py_VISIT(it->it_seq);
15741 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015742}
15743
15744static PyObject *
15745unicodeiter_next(unicodeiterobject *it)
15746{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015747 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015748
Benjamin Peterson14339b62009-01-31 16:36:08 +000015749 assert(it != NULL);
15750 seq = it->it_seq;
15751 if (seq == NULL)
15752 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015753 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015755 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15756 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015757 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015758 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15759 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015760 if (item != NULL)
15761 ++it->it_index;
15762 return item;
15763 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015764
Benjamin Peterson14339b62009-01-31 16:36:08 +000015765 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015766 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015767 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015768}
15769
15770static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015771unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015772{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015773 Py_ssize_t len = 0;
15774 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015775 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015776 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015777}
15778
15779PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15780
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015781static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015782unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015783{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015784 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015785 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015786 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015787 it->it_seq, it->it_index);
15788 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015789 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015790 if (u == NULL)
15791 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015792 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015793 }
15794}
15795
15796PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15797
15798static PyObject *
15799unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15800{
15801 Py_ssize_t index = PyLong_AsSsize_t(state);
15802 if (index == -1 && PyErr_Occurred())
15803 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015804 if (it->it_seq != NULL) {
15805 if (index < 0)
15806 index = 0;
15807 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15808 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15809 it->it_index = index;
15810 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015811 Py_RETURN_NONE;
15812}
15813
15814PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15815
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015816static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015817 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015818 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015819 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15820 reduce_doc},
15821 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15822 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015823 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015824};
15825
15826PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015827 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15828 "str_iterator", /* tp_name */
15829 sizeof(unicodeiterobject), /* tp_basicsize */
15830 0, /* tp_itemsize */
15831 /* methods */
15832 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015833 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015834 0, /* tp_getattr */
15835 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015836 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015837 0, /* tp_repr */
15838 0, /* tp_as_number */
15839 0, /* tp_as_sequence */
15840 0, /* tp_as_mapping */
15841 0, /* tp_hash */
15842 0, /* tp_call */
15843 0, /* tp_str */
15844 PyObject_GenericGetAttr, /* tp_getattro */
15845 0, /* tp_setattro */
15846 0, /* tp_as_buffer */
15847 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15848 0, /* tp_doc */
15849 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15850 0, /* tp_clear */
15851 0, /* tp_richcompare */
15852 0, /* tp_weaklistoffset */
15853 PyObject_SelfIter, /* tp_iter */
15854 (iternextfunc)unicodeiter_next, /* tp_iternext */
15855 unicodeiter_methods, /* tp_methods */
15856 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015857};
15858
15859static PyObject *
15860unicode_iter(PyObject *seq)
15861{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015862 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015863
Benjamin Peterson14339b62009-01-31 16:36:08 +000015864 if (!PyUnicode_Check(seq)) {
15865 PyErr_BadInternalCall();
15866 return NULL;
15867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015868 if (PyUnicode_READY(seq) == -1)
15869 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015870 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15871 if (it == NULL)
15872 return NULL;
15873 it->it_index = 0;
15874 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015875 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015876 _PyObject_GC_TRACK(it);
15877 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015878}
15879
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015880
15881size_t
15882Py_UNICODE_strlen(const Py_UNICODE *u)
15883{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015884 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015885}
15886
15887Py_UNICODE*
15888Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15889{
15890 Py_UNICODE *u = s1;
15891 while ((*u++ = *s2++));
15892 return s1;
15893}
15894
15895Py_UNICODE*
15896Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15897{
15898 Py_UNICODE *u = s1;
15899 while ((*u++ = *s2++))
15900 if (n-- == 0)
15901 break;
15902 return s1;
15903}
15904
15905Py_UNICODE*
15906Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15907{
15908 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015909 u1 += wcslen(u1);
15910 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015911 return s1;
15912}
15913
15914int
15915Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15916{
15917 while (*s1 && *s2 && *s1 == *s2)
15918 s1++, s2++;
15919 if (*s1 && *s2)
15920 return (*s1 < *s2) ? -1 : +1;
15921 if (*s1)
15922 return 1;
15923 if (*s2)
15924 return -1;
15925 return 0;
15926}
15927
15928int
15929Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15930{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015931 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015932 for (; n != 0; n--) {
15933 u1 = *s1;
15934 u2 = *s2;
15935 if (u1 != u2)
15936 return (u1 < u2) ? -1 : +1;
15937 if (u1 == '\0')
15938 return 0;
15939 s1++;
15940 s2++;
15941 }
15942 return 0;
15943}
15944
15945Py_UNICODE*
15946Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15947{
15948 const Py_UNICODE *p;
15949 for (p = s; *p; p++)
15950 if (*p == c)
15951 return (Py_UNICODE*)p;
15952 return NULL;
15953}
15954
15955Py_UNICODE*
15956Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15957{
15958 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015959 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015960 while (p != s) {
15961 p--;
15962 if (*p == c)
15963 return (Py_UNICODE*)p;
15964 }
15965 return NULL;
15966}
Victor Stinner331ea922010-08-10 16:37:20 +000015967
Victor Stinner71133ff2010-09-01 23:43:53 +000015968Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015969PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015970{
Victor Stinner577db2c2011-10-11 22:12:48 +020015971 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015972 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015974 if (!PyUnicode_Check(unicode)) {
15975 PyErr_BadArgument();
15976 return NULL;
15977 }
Inada Naoki2c4928d2020-06-17 20:09:44 +090015978_Py_COMP_DIAG_PUSH
15979_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015980 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Inada Naoki2c4928d2020-06-17 20:09:44 +090015981_Py_COMP_DIAG_POP
Victor Stinner577db2c2011-10-11 22:12:48 +020015982 if (u == NULL)
15983 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015984 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015985 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015986 PyErr_NoMemory();
15987 return NULL;
15988 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015989 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015990 size *= sizeof(Py_UNICODE);
15991 copy = PyMem_Malloc(size);
15992 if (copy == NULL) {
15993 PyErr_NoMemory();
15994 return NULL;
15995 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015996 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015997 return copy;
15998}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015999
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016000
Victor Stinner709d23d2019-05-02 14:56:30 -040016001static int
16002encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016003{
Victor Stinner709d23d2019-05-02 14:56:30 -040016004 int res;
16005 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16006 if (res == -2) {
16007 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16008 return -1;
16009 }
16010 if (res < 0) {
16011 PyErr_NoMemory();
16012 return -1;
16013 }
16014 return 0;
16015}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016016
Victor Stinner709d23d2019-05-02 14:56:30 -040016017
16018static int
16019config_get_codec_name(wchar_t **config_encoding)
16020{
16021 char *encoding;
16022 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16023 return -1;
16024 }
16025
16026 PyObject *name_obj = NULL;
16027 PyObject *codec = _PyCodec_Lookup(encoding);
16028 PyMem_RawFree(encoding);
16029
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016030 if (!codec)
16031 goto error;
16032
16033 name_obj = PyObject_GetAttrString(codec, "name");
16034 Py_CLEAR(codec);
16035 if (!name_obj) {
16036 goto error;
16037 }
16038
Victor Stinner709d23d2019-05-02 14:56:30 -040016039 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16040 Py_DECREF(name_obj);
16041 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016042 goto error;
16043 }
16044
Victor Stinner709d23d2019-05-02 14:56:30 -040016045 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16046 if (raw_wname == NULL) {
16047 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016048 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016049 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016050 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016051
16052 PyMem_RawFree(*config_encoding);
16053 *config_encoding = raw_wname;
16054
16055 PyMem_Free(wname);
16056 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016057
16058error:
16059 Py_XDECREF(codec);
16060 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016061 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016062}
16063
16064
Victor Stinner331a6a52019-05-27 16:39:22 +020016065static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016066init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016067{
Victor Stinner709d23d2019-05-02 14:56:30 -040016068 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016069 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016070 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016071 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016072 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016073 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016074 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016075}
16076
16077
Victor Stinner709d23d2019-05-02 14:56:30 -040016078static int
16079init_fs_codec(PyInterpreterState *interp)
16080{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016081 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016082
16083 _Py_error_handler error_handler;
16084 error_handler = get_error_handler_wide(config->filesystem_errors);
16085 if (error_handler == _Py_ERROR_UNKNOWN) {
16086 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16087 return -1;
16088 }
16089
16090 char *encoding, *errors;
16091 if (encode_wstr_utf8(config->filesystem_encoding,
16092 &encoding,
16093 "filesystem_encoding") < 0) {
16094 return -1;
16095 }
16096
16097 if (encode_wstr_utf8(config->filesystem_errors,
16098 &errors,
16099 "filesystem_errors") < 0) {
16100 PyMem_RawFree(encoding);
16101 return -1;
16102 }
16103
Victor Stinner3d17c042020-05-14 01:48:38 +020016104 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16105 PyMem_RawFree(fs_codec->encoding);
16106 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016107 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016108 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16109 PyMem_RawFree(fs_codec->errors);
16110 fs_codec->errors = errors;
16111 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016112
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016113#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016114 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016115#endif
16116
Victor Stinner709d23d2019-05-02 14:56:30 -040016117 /* At this point, PyUnicode_EncodeFSDefault() and
16118 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16119 the C implementation of the filesystem encoding. */
16120
16121 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16122 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016123 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16124 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016125 PyErr_NoMemory();
16126 return -1;
16127 }
16128 return 0;
16129}
16130
16131
Victor Stinner331a6a52019-05-27 16:39:22 +020016132static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016133init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016134{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016135 PyInterpreterState *interp = tstate->interp;
16136
Victor Stinner709d23d2019-05-02 14:56:30 -040016137 /* Update the filesystem encoding to the normalized Python codec name.
16138 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16139 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016140 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016141 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016142 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016143 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016144 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016145 }
16146
Victor Stinner709d23d2019-05-02 14:56:30 -040016147 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016148 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016149 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016150 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016151}
16152
16153
Victor Stinner331a6a52019-05-27 16:39:22 +020016154PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016155_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016156{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016157 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016158 if (_PyStatus_EXCEPTION(status)) {
16159 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016160 }
16161
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016162 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016163}
16164
16165
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016166static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016167_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016168{
Victor Stinner3d17c042020-05-14 01:48:38 +020016169 PyMem_RawFree(fs_codec->encoding);
16170 fs_codec->encoding = NULL;
16171 fs_codec->utf8 = 0;
16172 PyMem_RawFree(fs_codec->errors);
16173 fs_codec->errors = NULL;
16174 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016175}
16176
16177
#ifdef MS_WINDOWS
/* Set the filesystem encoding of the current interpreter to mbcs/replace
   (PEP 529) and re-initialize the filesystem codec.

   Returns 0 on success, -1 with an exception set on failure.  The
   configuration is left untouched if memory allocation fails. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both strings up front so the configuration is either fully
       updated or not modified at all. */
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");
    if (encoding != NULL && errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = errors;

        return init_fs_codec(interp);
    }

    PyMem_RawFree(encoding);
    PyMem_RawFree(errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16203
16204
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016205void
Victor Stinner3d483342019-11-22 12:27:50 +010016206_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016207{
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016208 struct _Py_unicode_state *state = &tstate->interp->unicode;
16209
16210 int is_main_interp = _Py_IsMainInterpreter(tstate);
16211 if (is_main_interp) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016212#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016213 /* Insure++ is a memory analysis tool that aids in discovering
16214 * memory leaks and other memory problems. On Python exit, the
16215 * interned string dictionaries are flagged as being in use at exit
16216 * (which it is). Under normal circumstances, this is fine because
16217 * the memory will be automatically reclaimed by the system. Under
16218 * memory debugging, it's a huge source of useless noise, so we
16219 * trade off slower shutdown for less distraction in the memory
16220 * reports. -baw
16221 */
16222 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016223#endif /* __INSURE__ */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016224 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016225
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016226 Py_CLEAR(state->empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016227
Victor Stinner2f9ada92020-06-24 02:22:21 +020016228 for (Py_ssize_t i = 0; i < 256; i++) {
16229 Py_CLEAR(state->latin1[i]);
16230 }
16231
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016232 if (is_main_interp) {
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016233 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016234 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016235
Victor Stinner3d17c042020-05-14 01:48:38 +020016236 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016237}
16238
16239
Georg Brandl66c221e2010-10-14 07:04:07 +000016240/* A _string module, to export formatter_parser and formatter_field_name_split
16241 to the string.Formatter class implemented in Python. */
16242
16243static PyMethodDef _string_methods[] = {
16244 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16245 METH_O, PyDoc_STR("split the argument as a field name")},
16246 {"formatter_parser", (PyCFunction) formatter_parser,
16247 METH_O, PyDoc_STR("parse the argument as a format string")},
16248 {NULL, NULL}
16249};
16250
16251static struct PyModuleDef _string_module = {
16252 PyModuleDef_HEAD_INIT,
16253 "_string",
16254 PyDoc_STR("string helper module"),
16255 0,
16256 _string_methods,
16257 NULL,
16258 NULL,
16259 NULL,
16260 NULL
16261};
16262
16263PyMODINIT_FUNC
16264PyInit__string(void)
16265{
16266 return PyModule_Create(&_string_module);
16267}
16268
16269
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016270#ifdef __cplusplus
16271}
16272#endif