blob: c46ba4ae57dc6b9f5f60b024303140ffb00fbf37 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner91698d82020-06-25 14:07:40 +020044#include "pycore_bytes_methods.h" // _Py_bytes_lower()
45#include "pycore_initconfig.h" // _PyStatus_OK()
Victor Stinnere5014be2020-04-14 17:52:15 +020046#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinner91698d82020-06-25 14:07:40 +020047#include "pycore_object.h" // _PyObject_GC_TRACK()
48#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
49#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
Victor Stinnere5014be2020-04-14 17:52:15 +020050#include "pycore_pystate.h" // _PyInterpreterState_GET()
Victor Stinner91698d82020-06-25 14:07:40 +020051#include "ucnhash.h" // _PyUnicode_Name_CAPI
52#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000054#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000055#include <windows.h>
56#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000057
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insecure++. */
/* #define INTERNED_STATS 1 */
61
62
Larry Hastings61272b72014-01-07 12:41:53 -080063/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090064class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080065[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090066/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
67
68/*[python input]
69class Py_UCS4_converter(CConverter):
70 type = 'Py_UCS4'
71 converter = 'convert_uc'
72
73 def converter_init(self):
74 if self.default is not unspecified:
75 self.c_default = ascii(self.default)
76 if len(self.c_default) > 4 or self.c_default[0] != "'":
77 self.c_default = hex(ord(self.default))
78
79[python start generated code]*/
80/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080081
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff
97
/* Sanity check used inside assert() by the accessor macros of this file.
   Debug builds run the structural consistency check (check_content=0, so
   the O(n) scan of the characters is skipped); other builds only test the
   object type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200103
/* Raw access to the `utf8` field of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 buffer: for a compact ASCII string this is
   the memory that directly follows the PyASCIIObject header, otherwise it
   is the `utf8` field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the `utf8_length` field (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked access to the UTF-8 length: for a compact ASCII string this is
   the `length` field of the PyASCIIObject header, otherwise it is the
   `utf8_length` field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the `wstr` field of the PyASCIIObject header (no checks). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900122
/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* Length of the wchar_t representation: the `length` field for a compact
   ASCII string, the `wstr_length` field otherwise.
   The argument is parenthesized inside the casts so that an expression
   argument (e.g. `p + 1`) is cast as a whole. */
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw field accessors. Those without an assert() perform no check at all on
   their argument and expand to the struct field itself. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Same fields as above, but validated with _PyUnicode_CHECK() first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Data pointer of a (non-compact) PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200145
/* Local replacement for the PyUnicode_READY() macro: evaluates to 0 when
   the string is already ready, otherwise to the result of
   _PyUnicode_Ready(op). The argument is validated with _PyUnicode_CHECK(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200152
/* True if the UTF-8 pointer is the same pointer as the character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer is the same pointer as the character data.
   Uses the macro argument `op` on both sides of the comparison; the macro
   does not depend on any variable of the calling scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
160
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
167
/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
174
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is converted with a plain cast to to_type. The main
   loop handles four elements per iteration; the trailing loop copies the
   remaining 0-3 elements. All locals are underscore-prefixed so they do
   not shadow names of the calling scope. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin);\
        const from_type *_end = (const from_type *)(end);\
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200198
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
206
/* bpo-40521: Interned strings are shared by all interpreters.
   INTERNED_STRINGS is therefore only defined when
   EXPERIMENTAL_ISOLATED_SUBINTERPRETERS is not. */
#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
#  define INTERNED_STRINGS
#endif
211
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
#ifdef INTERNED_STRINGS
static PyObject *interned = NULL;
#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000223
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200224static struct _Py_unicode_state*
225get_unicode_state(void)
226{
227 PyInterpreterState *interp = _PyInterpreterState_GET();
228 return &interp->unicode;
229}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200230
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200232// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200233static inline PyObject* unicode_get_empty(void)
234{
235 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200236 // unicode_get_empty() must not be called before _PyUnicode_Init()
237 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200238 assert(state->empty_string != NULL);
239 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200240}
241
Victor Stinner91698d82020-06-25 14:07:40 +0200242
243// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200244static inline PyObject* unicode_new_empty(void)
245{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200246 PyObject *empty = unicode_get_empty();
247 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200248 return empty;
249}
250
/* Return a new strong reference to the empty string singleton from the
   enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY()   \
    do {                             \
        return unicode_new_empty();  \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000255
Victor Stinner59423e32018-11-26 13:40:01 +0100256static inline void
257unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
258 Py_ssize_t start, Py_ssize_t length)
259{
260 assert(0 <= start);
261 assert(kind != PyUnicode_WCHAR_KIND);
262 switch (kind) {
263 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100264 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100265 Py_UCS1 ch = (unsigned char)value;
266 Py_UCS1 *to = (Py_UCS1 *)data + start;
267 memset(to, ch, length);
268 break;
269 }
270 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100271 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100272 Py_UCS2 ch = (Py_UCS2)value;
273 Py_UCS2 *to = (Py_UCS2 *)data + start;
274 const Py_UCS2 *end = to + length;
275 for (; to < end; ++to) *to = ch;
276 break;
277 }
278 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100279 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100280 Py_UCS4 ch = value;
281 Py_UCS4 * to = (Py_UCS4 *)data + start;
282 const Py_UCS4 *end = to + length;
283 for (; to < end; ++to) *to = ch;
284 break;
285 }
286 default: Py_UNREACHABLE();
287 }
288}
289
290
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200291/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700292static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200293_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900294static inline void
295_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400296static PyObject *
297unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
298 const char *errors);
299static PyObject *
300unicode_decode_utf8(const char *s, Py_ssize_t size,
301 _Py_error_handler error_handler, const char *errors,
302 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200303
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200304/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200305static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200306
/* Fast detection of the most frequent whitespace characters.
   Table of 128 entries indexed by ASCII code: 1 if the character is
   treated as whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
337
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200338/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200339static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200340static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100341static int unicode_modifiable(PyObject *unicode);
342
Victor Stinnerfe226c02011-10-03 03:52:20 +0200343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100345_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200346static PyObject *
347_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
348static PyObject *
349_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
350
351static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000353 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100354 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000355 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
356
Alexander Belopolsky40018472011-02-26 01:02:56 +0000357static void
358raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300359 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100360 PyObject *unicode,
361 Py_ssize_t startpos, Py_ssize_t endpos,
362 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000363
/* Same for linebreaks.
   Table of 128 entries indexed by ASCII code: 1 if the character is
   treated as a line break, 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
391
INADA Naoki3ae20562017-01-16 20:41:20 +0900392static int convert_uc(PyObject *obj, void *addr);
393
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300394#include "clinic/unicodeobject.c.h"
395
Victor Stinner3d4226a2018-08-29 22:21:32 +0200396_Py_error_handler
397_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200398{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200400 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200401 }
402 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200403 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200404 }
405 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200406 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200407 }
408 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200409 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200410 }
411 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200412 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200413 }
414 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200415 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200416 }
417 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200418 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200419 }
Victor Stinner50149202015-09-22 00:26:54 +0200420 return _Py_ERROR_OTHER;
421}
422
Victor Stinner709d23d2019-05-02 14:56:30 -0400423
424static _Py_error_handler
425get_error_handler_wide(const wchar_t *errors)
426{
427 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428 return _Py_ERROR_STRICT;
429 }
430 if (wcscmp(errors, L"surrogateescape") == 0) {
431 return _Py_ERROR_SURROGATEESCAPE;
432 }
433 if (wcscmp(errors, L"replace") == 0) {
434 return _Py_ERROR_REPLACE;
435 }
436 if (wcscmp(errors, L"ignore") == 0) {
437 return _Py_ERROR_IGNORE;
438 }
439 if (wcscmp(errors, L"backslashreplace") == 0) {
440 return _Py_ERROR_BACKSLASHREPLACE;
441 }
442 if (wcscmp(errors, L"surrogatepass") == 0) {
443 return _Py_ERROR_SURROGATEPASS;
444 }
445 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446 return _Py_ERROR_XMLCHARREFREPLACE;
447 }
448 return _Py_ERROR_OTHER;
449}
450
451
Victor Stinner22eb6892019-06-26 00:51:05 +0200452static inline int
453unicode_check_encoding_errors(const char *encoding, const char *errors)
454{
455 if (encoding == NULL && errors == NULL) {
456 return 0;
457 }
458
Victor Stinner81a7be32020-04-14 15:14:01 +0200459 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200460#ifndef Py_DEBUG
461 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200462 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200463 return 0;
464 }
465#else
466 /* Always check in debug mode */
467#endif
468
469 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200471 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200472 return 0;
473 }
474
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200475 /* Disable checks during Python finalization. For example, it allows to
476 call _PyObject_Dump() during finalization for debugging purpose. */
477 if (interp->finalizing) {
478 return 0;
479 }
480
Victor Stinner22eb6892019-06-26 00:51:05 +0200481 if (encoding != NULL) {
482 PyObject *handler = _PyCodec_Lookup(encoding);
483 if (handler == NULL) {
484 return -1;
485 }
486 Py_DECREF(handler);
487 }
488
489 if (errors != NULL) {
490 PyObject *handler = PyCodec_LookupError(errors);
491 if (handler == NULL) {
492 return -1;
493 }
494 Py_DECREF(handler);
495 }
496 return 0;
497}
498
499
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200500int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100501_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200502{
Victor Stinner68762572019-10-07 18:42:01 +0200503#define CHECK(expr) \
504 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
505
Victor Stinner910337b2011-10-03 03:20:16 +0200506 PyASCIIObject *ascii;
507 unsigned int kind;
508
Victor Stinner68762572019-10-07 18:42:01 +0200509 assert(op != NULL);
510 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200511
512 ascii = (PyASCIIObject *)op;
513 kind = ascii->state.kind;
514
Victor Stinnera3b334d2011-10-03 13:53:37 +0200515 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200516 CHECK(kind == PyUnicode_1BYTE_KIND);
517 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200518 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200519 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200520 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200521 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200522
Victor Stinnera41463c2011-10-04 01:05:08 +0200523 if (ascii->state.compact == 1) {
524 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200526 || kind == PyUnicode_2BYTE_KIND
527 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200528 CHECK(ascii->state.ascii == 0);
529 CHECK(ascii->state.ready == 1);
530 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100531 }
532 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
534
535 data = unicode->data.any;
536 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200537 CHECK(ascii->length == 0);
538 CHECK(ascii->hash == -1);
539 CHECK(ascii->state.compact == 0);
540 CHECK(ascii->state.ascii == 0);
541 CHECK(ascii->state.ready == 0);
542 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
543 CHECK(ascii->wstr != NULL);
544 CHECK(data == NULL);
545 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200546 }
547 else {
Victor Stinner68762572019-10-07 18:42:01 +0200548 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200549 || kind == PyUnicode_2BYTE_KIND
550 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200551 CHECK(ascii->state.compact == 0);
552 CHECK(ascii->state.ready == 1);
553 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200554 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200555 CHECK(compact->utf8 == data);
556 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 }
558 else
Victor Stinner68762572019-10-07 18:42:01 +0200559 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 }
561 }
562 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200563 if (
564#if SIZEOF_WCHAR_T == 2
565 kind == PyUnicode_2BYTE_KIND
566#else
567 kind == PyUnicode_4BYTE_KIND
568#endif
569 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200570 {
Victor Stinner68762572019-10-07 18:42:01 +0200571 CHECK(ascii->wstr == data);
572 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200573 } else
Victor Stinner68762572019-10-07 18:42:01 +0200574 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200575 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200576
577 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200578 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200579 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200580 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200581 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200582
583 /* check that the best kind is used: O(n) operation */
584 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200585 Py_ssize_t i;
586 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300587 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200588 Py_UCS4 ch;
589
590 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 for (i=0; i < ascii->length; i++)
592 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200593 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200594 if (ch > maxchar)
595 maxchar = ch;
596 }
597 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100598 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200599 CHECK(maxchar >= 128);
600 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100601 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200602 else
Victor Stinner68762572019-10-07 18:42:01 +0200603 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200604 }
Victor Stinner77faf692011-11-20 18:56:05 +0100605 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200606 CHECK(maxchar >= 0x100);
607 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100608 }
609 else {
Victor Stinner68762572019-10-07 18:42:01 +0200610 CHECK(maxchar >= 0x10000);
611 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100612 }
Victor Stinner68762572019-10-07 18:42:01 +0200613 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200614 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400615 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200616
617#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400618}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200619
Victor Stinner910337b2011-10-03 03:20:16 +0200620
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100621static PyObject*
622unicode_result_wchar(PyObject *unicode)
623{
624#ifndef Py_DEBUG
625 Py_ssize_t len;
626
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100627 len = _PyUnicode_WSTR_LENGTH(unicode);
628 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100629 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200630 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 }
632
633 if (len == 1) {
634 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100635 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200637 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 }
639 }
640
641 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200642 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100643 return NULL;
644 }
645#else
Victor Stinneraa771272012-10-04 02:32:58 +0200646 assert(Py_REFCNT(unicode) == 1);
647
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100648 /* don't make the result ready in debug mode to ensure that the caller
649 makes the string ready before using it */
650 assert(_PyUnicode_CheckConsistency(unicode, 1));
651#endif
652 return unicode;
653}
654
655static PyObject*
656unicode_result_ready(PyObject *unicode)
657{
658 Py_ssize_t length;
659
660 length = PyUnicode_GET_LENGTH(unicode);
661 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200662 PyObject *empty = unicode_get_empty();
663 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100664 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200665 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100666 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200667 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100668 }
669
670 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200671 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200672 if (kind == PyUnicode_1BYTE_KIND) {
673 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
674 Py_UCS1 ch = data[0];
675 struct _Py_unicode_state *state = get_unicode_state();
676 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100677 if (latin1_char != NULL) {
678 if (unicode != latin1_char) {
679 Py_INCREF(latin1_char);
680 Py_DECREF(unicode);
681 }
682 return latin1_char;
683 }
684 else {
685 assert(_PyUnicode_CheckConsistency(unicode, 1));
686 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200687 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100688 return unicode;
689 }
690 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200691 else {
692 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
693 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100694 }
695
696 assert(_PyUnicode_CheckConsistency(unicode, 1));
697 return unicode;
698}
699
700static PyObject*
701unicode_result(PyObject *unicode)
702{
703 assert(_PyUnicode_CHECK(unicode));
704 if (PyUnicode_IS_READY(unicode))
705 return unicode_result_ready(unicode);
706 else
707 return unicode_result_wchar(unicode);
708}
709
Victor Stinnerc4b49542011-12-11 22:44:26 +0100710static PyObject*
711unicode_result_unchanged(PyObject *unicode)
712{
713 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500714 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100715 return NULL;
716 Py_INCREF(unicode);
717 return unicode;
718 }
719 else
720 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100721 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100722}
723
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200724/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
725 ASCII, Latin1, UTF-8, etc. */
726static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200727backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200728 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
729{
Victor Stinnerad771582015-10-09 12:38:53 +0200730 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200731 Py_UCS4 ch;
732 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300733 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200734
735 assert(PyUnicode_IS_READY(unicode));
736 kind = PyUnicode_KIND(unicode);
737 data = PyUnicode_DATA(unicode);
738
739 size = 0;
740 /* determine replacement size */
741 for (i = collstart; i < collend; ++i) {
742 Py_ssize_t incr;
743
744 ch = PyUnicode_READ(kind, data, i);
745 if (ch < 0x100)
746 incr = 2+2;
747 else if (ch < 0x10000)
748 incr = 2+4;
749 else {
750 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200751 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200752 }
753 if (size > PY_SSIZE_T_MAX - incr) {
754 PyErr_SetString(PyExc_OverflowError,
755 "encoded result is too long for a Python string");
756 return NULL;
757 }
758 size += incr;
759 }
760
Victor Stinnerad771582015-10-09 12:38:53 +0200761 str = _PyBytesWriter_Prepare(writer, str, size);
762 if (str == NULL)
763 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200764
765 /* generate replacement */
766 for (i = collstart; i < collend; ++i) {
767 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200768 *str++ = '\\';
769 if (ch >= 0x00010000) {
770 *str++ = 'U';
771 *str++ = Py_hexdigits[(ch>>28)&0xf];
772 *str++ = Py_hexdigits[(ch>>24)&0xf];
773 *str++ = Py_hexdigits[(ch>>20)&0xf];
774 *str++ = Py_hexdigits[(ch>>16)&0xf];
775 *str++ = Py_hexdigits[(ch>>12)&0xf];
776 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777 }
Victor Stinner797485e2015-10-09 03:17:30 +0200778 else if (ch >= 0x100) {
779 *str++ = 'u';
780 *str++ = Py_hexdigits[(ch>>12)&0xf];
781 *str++ = Py_hexdigits[(ch>>8)&0xf];
782 }
783 else
784 *str++ = 'x';
785 *str++ = Py_hexdigits[(ch>>4)&0xf];
786 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200787 }
788 return str;
789}
790
791/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
792 ASCII, Latin1, UTF-8, etc. */
793static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200794xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200795 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
796{
Victor Stinnerad771582015-10-09 12:38:53 +0200797 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200798 Py_UCS4 ch;
799 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300800 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200801
802 assert(PyUnicode_IS_READY(unicode));
803 kind = PyUnicode_KIND(unicode);
804 data = PyUnicode_DATA(unicode);
805
806 size = 0;
807 /* determine replacement size */
808 for (i = collstart; i < collend; ++i) {
809 Py_ssize_t incr;
810
811 ch = PyUnicode_READ(kind, data, i);
812 if (ch < 10)
813 incr = 2+1+1;
814 else if (ch < 100)
815 incr = 2+2+1;
816 else if (ch < 1000)
817 incr = 2+3+1;
818 else if (ch < 10000)
819 incr = 2+4+1;
820 else if (ch < 100000)
821 incr = 2+5+1;
822 else if (ch < 1000000)
823 incr = 2+6+1;
824 else {
825 assert(ch <= MAX_UNICODE);
826 incr = 2+7+1;
827 }
828 if (size > PY_SSIZE_T_MAX - incr) {
829 PyErr_SetString(PyExc_OverflowError,
830 "encoded result is too long for a Python string");
831 return NULL;
832 }
833 size += incr;
834 }
835
Victor Stinnerad771582015-10-09 12:38:53 +0200836 str = _PyBytesWriter_Prepare(writer, str, size);
837 if (str == NULL)
838 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200839
840 /* generate replacement */
841 for (i = collstart; i < collend; ++i) {
842 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
843 }
844 return str;
845}
846
Thomas Wouters477c8d52006-05-27 19:21:47 +0000847/* --- Bloom Filters ----------------------------------------------------- */
848
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, a single bitmask is used: the low bits of each
   Unicode character (ch & (BLOOM_WIDTH - 1), i.e. 5, 6 or 7 bits for a
   32-, 64- or 128-bit mask) select the bit index. */
852
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200853/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000854
Antoine Pitrouf068f942010-01-13 14:19:12 +0000855#if LONG_BIT >= 128
856#define BLOOM_WIDTH 128
857#elif LONG_BIT >= 64
858#define BLOOM_WIDTH 64
859#elif LONG_BIT >= 32
860#define BLOOM_WIDTH 32
861#else
862#error "LONG_BIT is smaller than 32"
863#endif
864
Thomas Wouters477c8d52006-05-27 19:21:47 +0000865#define BLOOM_MASK unsigned long
866
Serhiy Storchaka05997252013-01-26 12:14:02 +0200867static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000868
/* Non-zero when the bit selected by the low bits of `ch`
   (ch & (BLOOM_WIDTH - 1)) is set in `mask`.  Both arguments are fully
   parenthesized so that compound expressions such as `a | b` can be
   passed as the mask. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line-break test: characters below 128 are looked up in ascii_linebreak[];
   for the others the bloom_linebreak mask is tested first and
   Py_UNICODE_ISLINEBREAK() is only evaluated when that bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000874
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700875static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300876make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877{
Victor Stinnera85af502013-04-09 21:53:54 +0200878#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
879 do { \
880 TYPE *data = (TYPE *)PTR; \
881 TYPE *end = data + LEN; \
882 Py_UCS4 ch; \
883 for (; data != end; data++) { \
884 ch = *data; \
885 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
886 } \
887 break; \
888 } while (0)
889
Thomas Wouters477c8d52006-05-27 19:21:47 +0000890 /* calculate simple bloom-style bitmask for a given unicode string */
891
Antoine Pitrouf068f942010-01-13 14:19:12 +0000892 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000893
894 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200895 switch (kind) {
896 case PyUnicode_1BYTE_KIND:
897 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
898 break;
899 case PyUnicode_2BYTE_KIND:
900 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
901 break;
902 case PyUnicode_4BYTE_KIND:
903 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
904 break;
905 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700906 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200907 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000908 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200909
910#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000911}
912
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300913static int
914ensure_unicode(PyObject *obj)
915{
916 if (!PyUnicode_Check(obj)) {
917 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200918 "must be str, not %.100s",
919 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300920 return -1;
921 }
922 return PyUnicode_READY(obj);
923}
924
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200925/* Compilation of templated routines */
926
Victor Stinner90ed8a62020-06-24 00:34:07 +0200927#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200928
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200929#include "stringlib/asciilib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
935#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/undef.h"
937
938#include "stringlib/ucs1lib.h"
939#include "stringlib/fastsearch.h"
940#include "stringlib/partition.h"
941#include "stringlib/split.h"
942#include "stringlib/count.h"
943#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300944#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200945#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/undef.h"
947
948#include "stringlib/ucs2lib.h"
949#include "stringlib/fastsearch.h"
950#include "stringlib/partition.h"
951#include "stringlib/split.h"
952#include "stringlib/count.h"
953#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300954#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200955#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200956#include "stringlib/undef.h"
957
958#include "stringlib/ucs4lib.h"
959#include "stringlib/fastsearch.h"
960#include "stringlib/partition.h"
961#include "stringlib/split.h"
962#include "stringlib/count.h"
963#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300964#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200965#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200966#include "stringlib/undef.h"
967
Inada Naoki2c4928d2020-06-17 20:09:44 +0900968_Py_COMP_DIAG_PUSH
969_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970#include "stringlib/unicodedefs.h"
971#include "stringlib/fastsearch.h"
972#include "stringlib/count.h"
973#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100974#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900975_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200976
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200977#undef STRINGLIB_GET_EMPTY
978
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979/* --- Unicode Object ----------------------------------------------------- */
980
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700981static inline Py_ssize_t
982findchar(const void *s, int kind,
983 Py_ssize_t size, Py_UCS4 ch,
984 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 switch (kind) {
987 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200988 if ((Py_UCS1) ch != ch)
989 return -1;
990 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600991 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600993 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200994 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200995 if ((Py_UCS2) ch != ch)
996 return -1;
997 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600998 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001000 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001001 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001004 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001006 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001007 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009}
1010
#ifdef Py_DEBUG
/* Overwrite the characters in [old_length, current length) of `unicode`
   with 0xff bytes so that code reading them before they are initialized
   is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int unit = PyUnicode_KIND(unicode);       /* bytes per character */
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(base + old_length * unit, 0xff,
               (new_length - old_length) * unit);
    }
}
#endif
1029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030static PyObject*
1031resize_compact(PyObject *unicode, Py_ssize_t length)
1032{
1033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035 Py_ssize_t new_size;
1036 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001037 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1040#endif
1041
Victor Stinner79891572012-05-03 13:43:07 +02001042 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001044 assert(PyUnicode_IS_COMPACT(unicode));
1045
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001046 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001047 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001048 struct_size = sizeof(PyASCIIObject);
1049 else
1050 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001051 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1054 PyErr_NoMemory();
1055 return NULL;
1056 }
1057 new_size = (struct_size + (length + 1) * char_size);
1058
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001059 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1060 PyObject_DEL(_PyUnicode_UTF8(unicode));
1061 _PyUnicode_UTF8(unicode) = NULL;
1062 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1063 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001064#ifdef Py_REF_DEBUG
1065 _Py_RefTotal--;
1066#endif
1067#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001068 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001069#endif
Victor Stinner84def372011-12-11 20:04:56 +01001070
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001071 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001072 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001073 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 PyErr_NoMemory();
1075 return NULL;
1076 }
Victor Stinner84def372011-12-11 20:04:56 +01001077 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001079
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001081 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001083 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 _PyUnicode_WSTR_LENGTH(unicode) = length;
1085 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001086 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1087 PyObject_DEL(_PyUnicode_WSTR(unicode));
1088 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001089 if (!PyUnicode_IS_ASCII(unicode))
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001091 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001092#ifdef Py_DEBUG
1093 unicode_fill_invalid(unicode, old_length);
1094#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1096 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098 return unicode;
1099}
1100
Alexander Belopolsky40018472011-02-26 01:02:56 +00001101static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103{
Victor Stinner95663112011-10-04 01:03:50 +02001104 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001108
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109 if (PyUnicode_IS_READY(unicode)) {
1110 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001113#ifdef Py_DEBUG
1114 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1115#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116
1117 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001118 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1120 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121
1122 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1123 PyErr_NoMemory();
1124 return -1;
1125 }
1126 new_size = (length + 1) * char_size;
1127
Victor Stinner7a9105a2011-12-12 00:13:42 +01001128 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1129 {
1130 PyObject_DEL(_PyUnicode_UTF8(unicode));
1131 _PyUnicode_UTF8(unicode) = NULL;
1132 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1133 }
1134
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 data = (PyObject *)PyObject_REALLOC(data, new_size);
1136 if (data == NULL) {
1137 PyErr_NoMemory();
1138 return -1;
1139 }
1140 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001141 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001142 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001143 _PyUnicode_WSTR_LENGTH(unicode) = length;
1144 }
1145 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001146 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_UTF8_LENGTH(unicode) = length;
1148 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 _PyUnicode_LENGTH(unicode) = length;
1150 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001151#ifdef Py_DEBUG
1152 unicode_fill_invalid(unicode, old_length);
1153#endif
Victor Stinner95663112011-10-04 01:03:50 +02001154 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001155 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001158 }
Victor Stinner95663112011-10-04 01:03:50 +02001159 assert(_PyUnicode_WSTR(unicode) != NULL);
1160
1161 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001162 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001166 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001167 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001169 if (!wstr) {
1170 PyErr_NoMemory();
1171 return -1;
1172 }
1173 _PyUnicode_WSTR(unicode) = wstr;
1174 _PyUnicode_WSTR(unicode)[length] = 0;
1175 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001176 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return 0;
1178}
1179
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180static PyObject*
1181resize_copy(PyObject *unicode, Py_ssize_t length)
1182{
1183 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001184 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001187 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188
1189 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1190 if (copy == NULL)
1191 return NULL;
1192
1193 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001194 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001196 }
1197 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001198 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001199
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001200 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 if (w == NULL)
1202 return NULL;
1203 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1204 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001205 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001206 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001207 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 }
1209}
1210
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001212 Ux0000 terminated; some code (e.g. new_identifier)
1213 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214
1215 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218*/
1219
Alexander Belopolsky40018472011-02-26 01:02:56 +00001220static PyUnicodeObject *
1221_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001223 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
Thomas Wouters477c8d52006-05-27 19:21:47 +00001226 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001227 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001228 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 }
1230
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001231 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001232 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001233 return (PyUnicodeObject *)PyErr_NoMemory();
1234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 if (length < 0) {
1236 PyErr_SetString(PyExc_SystemError,
1237 "Negative size passed to _PyUnicode_New");
1238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 }
1240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1242 if (unicode == NULL)
1243 return NULL;
1244 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001245
1246 _PyUnicode_WSTR_LENGTH(unicode) = length;
1247 _PyUnicode_HASH(unicode) = -1;
1248 _PyUnicode_STATE(unicode).interned = 0;
1249 _PyUnicode_STATE(unicode).kind = 0;
1250 _PyUnicode_STATE(unicode).compact = 0;
1251 _PyUnicode_STATE(unicode).ready = 0;
1252 _PyUnicode_STATE(unicode).ascii = 0;
1253 _PyUnicode_DATA_ANY(unicode) = NULL;
1254 _PyUnicode_LENGTH(unicode) = 0;
1255 _PyUnicode_UTF8(unicode) = NULL;
1256 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1259 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001260 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001261 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001262 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264
Jeremy Hyltond8082792003-09-16 19:41:39 +00001265 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001266 * the caller fails before initializing str -- unicode_resize()
1267 * reads str[0], and the Keep-Alive optimization can keep memory
1268 * allocated for str alive across a call to unicode_dealloc(unicode).
1269 * We don't want unicode_resize to read uninitialized memory in
1270 * that case.
1271 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 _PyUnicode_WSTR(unicode)[0] = 0;
1273 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001274
Victor Stinner7931d9a2011-11-04 00:22:48 +01001275 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 return unicode;
1277}
1278
Victor Stinnerf42dc442011-10-02 23:33:16 +02001279static const char*
1280unicode_kind_name(PyObject *unicode)
1281{
Victor Stinner42dfd712011-10-03 14:41:45 +02001282 /* don't check consistency: unicode_kind_name() is called from
1283 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 if (!PyUnicode_IS_COMPACT(unicode))
1285 {
1286 if (!PyUnicode_IS_READY(unicode))
1287 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001288 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 {
1290 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001291 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001292 return "legacy ascii";
1293 else
1294 return "legacy latin1";
1295 case PyUnicode_2BYTE_KIND:
1296 return "legacy UCS2";
1297 case PyUnicode_4BYTE_KIND:
1298 return "legacy UCS4";
1299 default:
1300 return "<legacy invalid kind>";
1301 }
1302 }
1303 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001304 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001305 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001306 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001307 return "ascii";
1308 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001309 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001310 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001311 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001312 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001313 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001314 default:
1315 return "<invalid compact kind>";
1316 }
1317}
1318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001321const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001322 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001323 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324}
1325
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001326const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001327 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 return _PyUnicode_COMPACT_DATA(unicode);
1329}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001330const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001331 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001332 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1334 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1335 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1336 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1337 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1338 return PyUnicode_DATA(unicode);
1339}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001340
1341void
1342_PyUnicode_Dump(PyObject *op)
1343{
1344 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001345 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1346 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001347 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001348
Victor Stinnera849a4b2011-10-03 12:12:11 +02001349 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001350 {
1351 if (ascii->state.ascii)
1352 data = (ascii + 1);
1353 else
1354 data = (compact + 1);
1355 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001356 else
1357 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001358 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001359
Victor Stinnera849a4b2011-10-03 12:12:11 +02001360 if (ascii->wstr == data)
1361 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001362 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001363
Victor Stinnera3b334d2011-10-03 13:53:37 +02001364 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001365 printf(" (%zu), ", compact->wstr_length);
1366 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001367 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001368 }
1369 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001370 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001371 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001372}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373#endif
1374
Victor Stinner91698d82020-06-25 14:07:40 +02001375static int
1376unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1377{
1378 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1379 // optimized to always use state->empty_string without having to check if
1380 // it is NULL or not.
1381 PyObject *empty = PyUnicode_New(1, 0);
1382 if (empty == NULL) {
1383 return -1;
1384 }
1385 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1386 _PyUnicode_LENGTH(empty) = 0;
1387 assert(_PyUnicode_CheckConsistency(empty, 1));
1388
1389 assert(state->empty_string == NULL);
1390 state->empty_string = empty;
1391 return 0;
1392}
1393
1394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395PyObject *
1396PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1397{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001398 /* Optimization for empty strings */
1399 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001400 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001401 }
1402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 PyObject *obj;
1404 PyCompactUnicodeObject *unicode;
1405 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001406 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001407 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 Py_ssize_t char_size;
1409 Py_ssize_t struct_size;
1410
Victor Stinner9e9d6892011-10-04 01:02:02 +02001411 is_ascii = 0;
1412 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 struct_size = sizeof(PyCompactUnicodeObject);
1414 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001415 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 char_size = 1;
1417 is_ascii = 1;
1418 struct_size = sizeof(PyASCIIObject);
1419 }
1420 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001421 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 char_size = 1;
1423 }
1424 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001425 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 char_size = 2;
1427 if (sizeof(wchar_t) == 2)
1428 is_sharing = 1;
1429 }
1430 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001431 if (maxchar > MAX_UNICODE) {
1432 PyErr_SetString(PyExc_SystemError,
1433 "invalid maximum character passed to PyUnicode_New");
1434 return NULL;
1435 }
Victor Stinner8f825062012-04-27 13:55:39 +02001436 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 char_size = 4;
1438 if (sizeof(wchar_t) == 4)
1439 is_sharing = 1;
1440 }
1441
1442 /* Ensure we won't overflow the size. */
1443 if (size < 0) {
1444 PyErr_SetString(PyExc_SystemError,
1445 "Negative size passed to PyUnicode_New");
1446 return NULL;
1447 }
1448 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1449 return PyErr_NoMemory();
1450
1451 /* Duplicated allocation code from _PyObject_New() instead of a call to
1452 * PyObject_New() so we are able to allocate space for the object and
1453 * it's data buffer.
1454 */
1455 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001456 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001458 }
1459 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460
1461 unicode = (PyCompactUnicodeObject *)obj;
1462 if (is_ascii)
1463 data = ((PyASCIIObject*)obj) + 1;
1464 else
1465 data = unicode + 1;
1466 _PyUnicode_LENGTH(unicode) = size;
1467 _PyUnicode_HASH(unicode) = -1;
1468 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001469 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 _PyUnicode_STATE(unicode).compact = 1;
1471 _PyUnicode_STATE(unicode).ready = 1;
1472 _PyUnicode_STATE(unicode).ascii = is_ascii;
1473 if (is_ascii) {
1474 ((char*)data)[size] = 0;
1475 _PyUnicode_WSTR(unicode) = NULL;
1476 }
Victor Stinner8f825062012-04-27 13:55:39 +02001477 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 ((char*)data)[size] = 0;
1479 _PyUnicode_WSTR(unicode) = NULL;
1480 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001482 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 else {
1485 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001486 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001487 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001489 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 ((Py_UCS4*)data)[size] = 0;
1491 if (is_sharing) {
1492 _PyUnicode_WSTR_LENGTH(unicode) = size;
1493 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1494 }
1495 else {
1496 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1497 _PyUnicode_WSTR(unicode) = NULL;
1498 }
1499 }
Victor Stinner8f825062012-04-27 13:55:39 +02001500#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001501 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001502#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001503 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 return obj;
1505}
1506
1507#if SIZEOF_WCHAR_T == 2
1508/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1509 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001510 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511
1512 This function assumes that unicode can hold one more code point than wstr
1513 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001514static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001516 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517{
1518 const wchar_t *iter;
1519 Py_UCS4 *ucs4_out;
1520
Victor Stinner910337b2011-10-03 03:20:16 +02001521 assert(unicode != NULL);
1522 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1524 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1525
1526 for (iter = begin; iter < end; ) {
1527 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1528 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001529 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1530 && (iter+1) < end
1531 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532 {
Victor Stinner551ac952011-11-29 22:58:13 +01001533 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 iter += 2;
1535 }
1536 else {
1537 *ucs4_out++ = *iter;
1538 iter++;
1539 }
1540 }
1541 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1542 _PyUnicode_GET_LENGTH(unicode)));
1543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544}
1545#endif
1546
Victor Stinnercd9950f2011-10-02 00:34:53 +02001547static int
Victor Stinner488fa492011-12-12 00:01:39 +01001548unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001549{
Victor Stinner488fa492011-12-12 00:01:39 +01001550 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001551 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001552 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001553 return -1;
1554 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001555 return 0;
1556}
1557
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558static int
1559_copy_characters(PyObject *to, Py_ssize_t to_start,
1560 PyObject *from, Py_ssize_t from_start,
1561 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001563 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001564 const void *from_data;
1565 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566
Victor Stinneree4544c2012-05-09 22:24:08 +02001567 assert(0 <= how_many);
1568 assert(0 <= from_start);
1569 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001571 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001572 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573
Victor Stinnerd3f08822012-05-29 12:57:52 +02001574 assert(PyUnicode_Check(to));
1575 assert(PyUnicode_IS_READY(to));
1576 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1577
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001578 if (how_many == 0)
1579 return 0;
1580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001582 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001584 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585
Victor Stinnerf1852262012-06-16 16:38:26 +02001586#ifdef Py_DEBUG
1587 if (!check_maxchar
1588 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1589 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001590 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001591 Py_UCS4 ch;
1592 Py_ssize_t i;
1593 for (i=0; i < how_many; i++) {
1594 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1595 assert(ch <= to_maxchar);
1596 }
1597 }
1598#endif
1599
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001600 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001601 if (check_maxchar
1602 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1603 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001604 /* Writing Latin-1 characters into an ASCII string requires to
1605 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001606 Py_UCS4 max_char;
1607 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001608 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001609 if (max_char >= 128)
1610 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001611 }
Christian Heimesf051e432016-09-13 20:22:02 +02001612 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001613 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001614 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001615 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001616 else if (from_kind == PyUnicode_1BYTE_KIND
1617 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001618 {
1619 _PyUnicode_CONVERT_BYTES(
1620 Py_UCS1, Py_UCS2,
1621 PyUnicode_1BYTE_DATA(from) + from_start,
1622 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1623 PyUnicode_2BYTE_DATA(to) + to_start
1624 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001625 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001626 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001627 && to_kind == PyUnicode_4BYTE_KIND)
1628 {
1629 _PyUnicode_CONVERT_BYTES(
1630 Py_UCS1, Py_UCS4,
1631 PyUnicode_1BYTE_DATA(from) + from_start,
1632 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1633 PyUnicode_4BYTE_DATA(to) + to_start
1634 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001635 }
1636 else if (from_kind == PyUnicode_2BYTE_KIND
1637 && to_kind == PyUnicode_4BYTE_KIND)
1638 {
1639 _PyUnicode_CONVERT_BYTES(
1640 Py_UCS2, Py_UCS4,
1641 PyUnicode_2BYTE_DATA(from) + from_start,
1642 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1643 PyUnicode_4BYTE_DATA(to) + to_start
1644 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001645 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001646 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001647 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1648
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001649 if (!check_maxchar) {
1650 if (from_kind == PyUnicode_2BYTE_KIND
1651 && to_kind == PyUnicode_1BYTE_KIND)
1652 {
1653 _PyUnicode_CONVERT_BYTES(
1654 Py_UCS2, Py_UCS1,
1655 PyUnicode_2BYTE_DATA(from) + from_start,
1656 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1657 PyUnicode_1BYTE_DATA(to) + to_start
1658 );
1659 }
1660 else if (from_kind == PyUnicode_4BYTE_KIND
1661 && to_kind == PyUnicode_1BYTE_KIND)
1662 {
1663 _PyUnicode_CONVERT_BYTES(
1664 Py_UCS4, Py_UCS1,
1665 PyUnicode_4BYTE_DATA(from) + from_start,
1666 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1667 PyUnicode_1BYTE_DATA(to) + to_start
1668 );
1669 }
1670 else if (from_kind == PyUnicode_4BYTE_KIND
1671 && to_kind == PyUnicode_2BYTE_KIND)
1672 {
1673 _PyUnicode_CONVERT_BYTES(
1674 Py_UCS4, Py_UCS2,
1675 PyUnicode_4BYTE_DATA(from) + from_start,
1676 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1677 PyUnicode_2BYTE_DATA(to) + to_start
1678 );
1679 }
1680 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001681 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001682 }
1683 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001684 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001685 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001686 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001687 Py_ssize_t i;
1688
Victor Stinnera0702ab2011-09-29 14:14:38 +02001689 for (i=0; i < how_many; i++) {
1690 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001691 if (ch > to_maxchar)
1692 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001693 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1694 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001695 }
1696 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 return 0;
1698}
1699
Victor Stinnerd3f08822012-05-29 12:57:52 +02001700void
1701_PyUnicode_FastCopyCharacters(
1702 PyObject *to, Py_ssize_t to_start,
1703 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001704{
1705 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1706}
1707
1708Py_ssize_t
1709PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1710 PyObject *from, Py_ssize_t from_start,
1711 Py_ssize_t how_many)
1712{
1713 int err;
1714
1715 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1716 PyErr_BadInternalCall();
1717 return -1;
1718 }
1719
Benjamin Petersonbac79492012-01-14 13:34:47 -05001720 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001721 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001722 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001723 return -1;
1724
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001725 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001726 PyErr_SetString(PyExc_IndexError, "string index out of range");
1727 return -1;
1728 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001729 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001730 PyErr_SetString(PyExc_IndexError, "string index out of range");
1731 return -1;
1732 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001733 if (how_many < 0) {
1734 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1735 return -1;
1736 }
1737 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001738 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1739 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001740 "Cannot write %zi characters at %zi "
1741 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001742 how_many, to_start, PyUnicode_GET_LENGTH(to));
1743 return -1;
1744 }
1745
1746 if (how_many == 0)
1747 return 0;
1748
Victor Stinner488fa492011-12-12 00:01:39 +01001749 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001750 return -1;
1751
1752 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1753 if (err) {
1754 PyErr_Format(PyExc_SystemError,
1755 "Cannot copy %s characters "
1756 "into a string of %s characters",
1757 unicode_kind_name(from),
1758 unicode_kind_name(to));
1759 return -1;
1760 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001761 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762}
1763
Victor Stinner17222162011-09-28 22:15:37 +02001764/* Find the maximum code point and count the number of surrogate pairs so a
1765 correct string length can be computed before converting a string to UCS4.
1766 This function counts single surrogates as a character and not as a pair.
1767
1768 Return 0 on success, or -1 on error. */
1769static int
1770find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1771 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772{
1773 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001774 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775
Victor Stinnerc53be962011-10-02 21:33:54 +02001776 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 *num_surrogates = 0;
1778 *maxchar = 0;
1779
1780 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001782 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1783 && (iter+1) < end
1784 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1785 {
1786 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1787 ++(*num_surrogates);
1788 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 }
1790 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001792 {
1793 ch = *iter;
1794 iter++;
1795 }
1796 if (ch > *maxchar) {
1797 *maxchar = ch;
1798 if (*maxchar > MAX_UNICODE) {
1799 PyErr_Format(PyExc_ValueError,
1800 "character U+%x is not in range [U+0000; U+10ffff]",
1801 ch);
1802 return -1;
1803 }
1804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 }
1806 return 0;
1807}
1808
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001809int
1810_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811{
1812 wchar_t *end;
1813 Py_UCS4 maxchar = 0;
1814 Py_ssize_t num_surrogates;
1815#if SIZEOF_WCHAR_T == 2
1816 Py_ssize_t length_wo_surrogates;
1817#endif
1818
Georg Brandl7597add2011-10-05 16:36:47 +02001819 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 strings were created using _PyObject_New() and where no canonical
1821 representation (the str field) has been set yet aka strings
1822 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001823 assert(_PyUnicode_CHECK(unicode));
1824 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001826 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001827 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001828 /* Actually, it should neither be interned nor be anything else: */
1829 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001832 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001833 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835
1836 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1838 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 PyErr_NoMemory();
1840 return -1;
1841 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001842 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843 _PyUnicode_WSTR(unicode), end,
1844 PyUnicode_1BYTE_DATA(unicode));
1845 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1846 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1847 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1848 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001849 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001850 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001851 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001852 }
1853 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001854 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001855 _PyUnicode_UTF8(unicode) = NULL;
1856 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 }
1858 PyObject_FREE(_PyUnicode_WSTR(unicode));
1859 _PyUnicode_WSTR(unicode) = NULL;
1860 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1861 }
1862 /* In this case we might have to convert down from 4-byte native
1863 wchar_t to 2-byte unicode. */
1864 else if (maxchar < 65536) {
1865 assert(num_surrogates == 0 &&
1866 "FindMaxCharAndNumSurrogatePairs() messed up");
1867
Victor Stinner506f5922011-09-28 22:34:18 +02001868#if SIZEOF_WCHAR_T == 2
1869 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001870 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001871 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1872 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1873 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001874 _PyUnicode_UTF8(unicode) = NULL;
1875 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001876#else
1877 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001878 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001879 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001880 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001881 PyErr_NoMemory();
1882 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 }
Victor Stinner506f5922011-09-28 22:34:18 +02001884 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1885 _PyUnicode_WSTR(unicode), end,
1886 PyUnicode_2BYTE_DATA(unicode));
1887 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1888 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1889 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001890 _PyUnicode_UTF8(unicode) = NULL;
1891 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001892 PyObject_FREE(_PyUnicode_WSTR(unicode));
1893 _PyUnicode_WSTR(unicode) = NULL;
1894 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1895#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 }
1897 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1898 else {
1899#if SIZEOF_WCHAR_T == 2
1900 /* in case the native representation is 2-bytes, we need to allocate a
1901 new normalized 4-byte version. */
1902 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001903 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1904 PyErr_NoMemory();
1905 return -1;
1906 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001907 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1908 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 PyErr_NoMemory();
1910 return -1;
1911 }
1912 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1913 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001914 _PyUnicode_UTF8(unicode) = NULL;
1915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001916 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1917 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001918 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 PyObject_FREE(_PyUnicode_WSTR(unicode));
1920 _PyUnicode_WSTR(unicode) = NULL;
1921 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1922#else
1923 assert(num_surrogates == 0);
1924
Victor Stinnerc3c74152011-10-02 20:39:55 +02001925 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001927 _PyUnicode_UTF8(unicode) = NULL;
1928 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1930#endif
1931 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1932 }
1933 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001934 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 return 0;
1936}
1937
Alexander Belopolsky40018472011-02-26 01:02:56 +00001938static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001939unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940{
Walter Dörwald16807132007-05-25 13:52:07 +00001941 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 case SSTATE_NOT_INTERNED:
1943 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001944
Benjamin Peterson29060642009-01-31 22:14:21 +00001945 case SSTATE_INTERNED_MORTAL:
1946 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001947 Py_SET_REFCNT(unicode, 3);
Victor Stinner607b1022020-05-05 18:50:30 +02001948#ifdef INTERNED_STRINGS
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001949 if (PyDict_DelItem(interned, unicode) != 0) {
1950 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1951 NULL);
1952 }
Victor Stinner607b1022020-05-05 18:50:30 +02001953#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001954 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001955
Benjamin Peterson29060642009-01-31 22:14:21 +00001956 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001957 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1958 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001959
Benjamin Peterson29060642009-01-31 22:14:21 +00001960 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001961 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001962 }
1963
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001965 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001966 }
1967 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001968 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001969 }
1970 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001971 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001974 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975}
1976
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the empty string stored in the
   unicode state, or a length-1 string that is the entry of the state's
   latin1 table for its character; return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;

    if (state->empty_string == unicode) {
        return 1;
    }
    /* Only ready (non-wchar) strings of length 1 can be latin1 entries. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && state->latin1[ch] == unicode) {
        return 1;
    }
    return 0;
}
#endif
1996
Alexander Belopolsky40018472011-02-26 01:02:56 +00001997static int
Victor Stinner488fa492011-12-12 00:01:39 +01001998unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999{
Victor Stinner488fa492011-12-12 00:01:39 +01002000 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002001 if (Py_REFCNT(unicode) != 1)
2002 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002003 if (_PyUnicode_HASH(unicode) != -1)
2004 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002005 if (PyUnicode_CHECK_INTERNED(unicode))
2006 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002007 if (!PyUnicode_CheckExact(unicode))
2008 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002009#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002010 /* singleton refcount is greater than 1 */
2011 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002012#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002013 return 1;
2014}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002015
Victor Stinnerfe226c02011-10-03 03:52:20 +02002016static int
2017unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2018{
2019 PyObject *unicode;
2020 Py_ssize_t old_length;
2021
2022 assert(p_unicode != NULL);
2023 unicode = *p_unicode;
2024
2025 assert(unicode != NULL);
2026 assert(PyUnicode_Check(unicode));
2027 assert(0 <= length);
2028
Victor Stinner910337b2011-10-03 03:20:16 +02002029 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002030 old_length = PyUnicode_WSTR_LENGTH(unicode);
2031 else
2032 old_length = PyUnicode_GET_LENGTH(unicode);
2033 if (old_length == length)
2034 return 0;
2035
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002036 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002037 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002038 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002039 return 0;
2040 }
2041
Victor Stinner488fa492011-12-12 00:01:39 +01002042 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002043 PyObject *copy = resize_copy(unicode, length);
2044 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002045 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002046 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002047 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002048 }
2049
Victor Stinnerfe226c02011-10-03 03:52:20 +02002050 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002051 PyObject *new_unicode = resize_compact(unicode, length);
2052 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002053 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002054 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002055 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002056 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002057 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002058}
2059
Alexander Belopolsky40018472011-02-26 01:02:56 +00002060int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002061PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002062{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002063 PyObject *unicode;
2064 if (p_unicode == NULL) {
2065 PyErr_BadInternalCall();
2066 return -1;
2067 }
2068 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002069 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002070 {
2071 PyErr_BadInternalCall();
2072 return -1;
2073 }
2074 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002075}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002076
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002077/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002078
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002079 WARNING: The function doesn't copy the terminating null character and
2080 doesn't check the maximum character (may write a latin1 character in an
2081 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002082static void
2083unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2084 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002085{
2086 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002087 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002088 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002089
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002090 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002091 switch (kind) {
2092 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002093#ifdef Py_DEBUG
2094 if (PyUnicode_IS_ASCII(unicode)) {
2095 Py_UCS4 maxchar = ucs1lib_find_max_char(
2096 (const Py_UCS1*)str,
2097 (const Py_UCS1*)str + len);
2098 assert(maxchar < 128);
2099 }
2100#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002101 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002102 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002103 }
2104 case PyUnicode_2BYTE_KIND: {
2105 Py_UCS2 *start = (Py_UCS2 *)data + index;
2106 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002107
Victor Stinner184252a2012-06-16 02:57:41 +02002108 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002109 *ucs2 = (Py_UCS2)*str;
2110
2111 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002112 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002113 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002114 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002115 Py_UCS4 *start = (Py_UCS4 *)data + index;
2116 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002117
Victor Stinner184252a2012-06-16 02:57:41 +02002118 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002119 *ucs4 = (Py_UCS4)*str;
2120
2121 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002122 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002123 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002124 default:
2125 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002126 }
2127}
2128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002130get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002132 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002133
Victor Stinner2f9ada92020-06-24 02:22:21 +02002134 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002135 if (unicode) {
2136 Py_INCREF(unicode);
2137 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 }
Victor Stinner607b1022020-05-05 18:50:30 +02002139
2140 unicode = PyUnicode_New(1, ch);
2141 if (!unicode) {
2142 return NULL;
2143 }
2144
2145 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2146 assert(_PyUnicode_CheckConsistency(unicode, 1));
2147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002149 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002150 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151}
2152
Victor Stinner985a82a2014-01-03 12:53:47 +01002153static PyObject*
2154unicode_char(Py_UCS4 ch)
2155{
2156 PyObject *unicode;
2157
2158 assert(ch <= MAX_UNICODE);
2159
Victor Stinner2f9ada92020-06-24 02:22:21 +02002160 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002161 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002162 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002163
Victor Stinner985a82a2014-01-03 12:53:47 +01002164 unicode = PyUnicode_New(1, ch);
2165 if (unicode == NULL)
2166 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002167
2168 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2169 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002170 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002171 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002172 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2173 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2174 }
2175 assert(_PyUnicode_CheckConsistency(unicode, 1));
2176 return unicode;
2177}
2178
Alexander Belopolsky40018472011-02-26 01:02:56 +00002179PyObject *
2180PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002182 if (u == NULL)
2183 return (PyObject*)_PyUnicode_New(size);
2184
2185 if (size < 0) {
2186 PyErr_BadInternalCall();
2187 return NULL;
2188 }
2189
2190 return PyUnicode_FromWideChar(u, size);
2191}
2192
2193PyObject *
2194PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2195{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002196 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 Py_UCS4 maxchar = 0;
2198 Py_ssize_t num_surrogates;
2199
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002200 if (u == NULL && size != 0) {
2201 PyErr_BadInternalCall();
2202 return NULL;
2203 }
2204
2205 if (size == -1) {
2206 size = wcslen(u);
2207 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002209 /* If the Unicode data is known at construction time, we can apply
2210 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002213 if (size == 0)
2214 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 /* Single character Unicode objects in the Latin-1 range are
2217 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002218 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 return get_latin1_char((unsigned char)*u);
2220
2221 /* If not empty and not single character, copy the Unicode data
2222 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002223 if (find_maxchar_surrogates(u, u + size,
2224 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 return NULL;
2226
Victor Stinner8faf8212011-12-08 22:14:11 +01002227 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 if (!unicode)
2229 return NULL;
2230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 switch (PyUnicode_KIND(unicode)) {
2232 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002233 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2235 break;
2236 case PyUnicode_2BYTE_KIND:
2237#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002238 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002240 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2242#endif
2243 break;
2244 case PyUnicode_4BYTE_KIND:
2245#if SIZEOF_WCHAR_T == 2
2246 /* This is the only case which has to process surrogates, thus
2247 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002248 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249#else
2250 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002251 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252#endif
2253 break;
2254 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002255 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002258 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259}
2260
Alexander Belopolsky40018472011-02-26 01:02:56 +00002261PyObject *
2262PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002263{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002264 if (size < 0) {
2265 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002266 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002267 return NULL;
2268 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002269 if (u != NULL)
2270 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2271 else
2272 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002273}
2274
Alexander Belopolsky40018472011-02-26 01:02:56 +00002275PyObject *
2276PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002277{
2278 size_t size = strlen(u);
2279 if (size > PY_SSIZE_T_MAX) {
2280 PyErr_SetString(PyExc_OverflowError, "input too long");
2281 return NULL;
2282 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002283 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002284}
2285
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002286PyObject *
2287_PyUnicode_FromId(_Py_Identifier *id)
2288{
Victor Stinner297257f2020-06-02 14:39:45 +02002289 if (id->object) {
2290 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002291 }
Victor Stinner297257f2020-06-02 14:39:45 +02002292
2293 PyObject *obj;
2294 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2295 strlen(id->string),
2296 NULL, NULL);
2297 if (!obj) {
2298 return NULL;
2299 }
2300 PyUnicode_InternInPlace(&obj);
2301
2302 assert(!id->next);
2303 id->object = obj;
2304 id->next = static_strings;
2305 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002306 return id->object;
2307}
2308
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002309static void
2310unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002311{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002312 _Py_Identifier *tmp, *s = static_strings;
2313 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002314 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002315 tmp = s->next;
2316 s->next = NULL;
2317 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002318 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002319 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002320}
2321
Benjamin Peterson0df54292012-03-26 14:50:32 -04002322/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002323
Victor Stinnerd3f08822012-05-29 12:57:52 +02002324PyObject*
2325_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002326{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002327 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002328 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002329 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002330#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002331 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002332#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002333 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002334 }
Victor Stinner785938e2011-12-11 20:09:03 +01002335 unicode = PyUnicode_New(size, 127);
2336 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002337 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002338 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2339 assert(_PyUnicode_CheckConsistency(unicode, 1));
2340 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002341}
2342
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002343static Py_UCS4
2344kind_maxchar_limit(unsigned int kind)
2345{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002346 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002347 case PyUnicode_1BYTE_KIND:
2348 return 0x80;
2349 case PyUnicode_2BYTE_KIND:
2350 return 0x100;
2351 case PyUnicode_4BYTE_KIND:
2352 return 0x10000;
2353 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002354 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002355 }
2356}
2357
Victor Stinner702c7342011-10-05 13:50:52 +02002358static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002359_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002362 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002363
Victor Stinner2f9ada92020-06-24 02:22:21 +02002364 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002365 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002366 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002367 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002368 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002369 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002370 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002371
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002372 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002373 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 if (!res)
2375 return NULL;
2376 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002377 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002379}
2380
Victor Stinnere57b1c02011-09-28 22:20:48 +02002381static PyObject*
2382_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383{
2384 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002385 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002386
Serhiy Storchaka678db842013-01-26 12:16:36 +02002387 if (size == 0)
2388 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002389 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002390 if (size == 1)
2391 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002392
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002393 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002394 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 if (!res)
2396 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002397 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002399 else {
2400 _PyUnicode_CONVERT_BYTES(
2401 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2402 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002403 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 return res;
2405}
2406
Victor Stinnere57b1c02011-09-28 22:20:48 +02002407static PyObject*
2408_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409{
2410 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002411 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002412
Serhiy Storchaka678db842013-01-26 12:16:36 +02002413 if (size == 0)
2414 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002415 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002416 if (size == 1)
2417 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002418
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002419 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002420 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 if (!res)
2422 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002423 if (max_char < 256)
2424 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2425 PyUnicode_1BYTE_DATA(res));
2426 else if (max_char < 0x10000)
2427 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2428 PyUnicode_2BYTE_DATA(res));
2429 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002430 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002431 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 return res;
2433}
2434
2435PyObject*
2436PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2437{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002438 if (size < 0) {
2439 PyErr_SetString(PyExc_ValueError, "size must be positive");
2440 return NULL;
2441 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002442 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002444 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002446 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002448 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002449 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002450 PyErr_SetString(PyExc_SystemError, "invalid kind");
2451 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453}
2454
Victor Stinnerece58de2012-04-23 23:36:38 +02002455Py_UCS4
2456_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2457{
2458 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002459 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002460
2461 assert(PyUnicode_IS_READY(unicode));
2462 assert(0 <= start);
2463 assert(end <= PyUnicode_GET_LENGTH(unicode));
2464 assert(start <= end);
2465
2466 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2467 return PyUnicode_MAX_CHAR_VALUE(unicode);
2468
2469 if (start == end)
2470 return 127;
2471
Victor Stinner94d558b2012-04-27 22:26:58 +02002472 if (PyUnicode_IS_ASCII(unicode))
2473 return 127;
2474
Victor Stinnerece58de2012-04-23 23:36:38 +02002475 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002476 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002477 endptr = (char *)startptr + end * kind;
2478 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002479 switch(kind) {
2480 case PyUnicode_1BYTE_KIND:
2481 return ucs1lib_find_max_char(startptr, endptr);
2482 case PyUnicode_2BYTE_KIND:
2483 return ucs2lib_find_max_char(startptr, endptr);
2484 case PyUnicode_4BYTE_KIND:
2485 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002486 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002487 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002488 }
2489}
2490
Victor Stinner25a4b292011-10-06 12:31:55 +02002491/* Ensure that a string uses the most efficient storage, if it is not the
2492 case: create a new string with of the right kind. Write NULL into *p_unicode
2493 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002494static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002495unicode_adjust_maxchar(PyObject **p_unicode)
2496{
2497 PyObject *unicode, *copy;
2498 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002499 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002500 unsigned int kind;
2501
2502 assert(p_unicode != NULL);
2503 unicode = *p_unicode;
2504 assert(PyUnicode_IS_READY(unicode));
2505 if (PyUnicode_IS_ASCII(unicode))
2506 return;
2507
2508 len = PyUnicode_GET_LENGTH(unicode);
2509 kind = PyUnicode_KIND(unicode);
2510 if (kind == PyUnicode_1BYTE_KIND) {
2511 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002512 max_char = ucs1lib_find_max_char(u, u + len);
2513 if (max_char >= 128)
2514 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002515 }
2516 else if (kind == PyUnicode_2BYTE_KIND) {
2517 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002518 max_char = ucs2lib_find_max_char(u, u + len);
2519 if (max_char >= 256)
2520 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002521 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002522 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002523 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002524 max_char = ucs4lib_find_max_char(u, u + len);
2525 if (max_char >= 0x10000)
2526 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002527 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002528 else
2529 Py_UNREACHABLE();
2530
Victor Stinner25a4b292011-10-06 12:31:55 +02002531 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002532 if (copy != NULL)
2533 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002534 Py_DECREF(unicode);
2535 *p_unicode = copy;
2536}
2537
Victor Stinner034f6cf2011-09-30 02:26:44 +02002538PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002539_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002540{
Victor Stinner87af4f22011-11-21 23:03:47 +01002541 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002542 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002543
Victor Stinner034f6cf2011-09-30 02:26:44 +02002544 if (!PyUnicode_Check(unicode)) {
2545 PyErr_BadInternalCall();
2546 return NULL;
2547 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002548 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002549 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002550
Victor Stinner87af4f22011-11-21 23:03:47 +01002551 length = PyUnicode_GET_LENGTH(unicode);
2552 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002553 if (!copy)
2554 return NULL;
2555 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2556
Christian Heimesf051e432016-09-13 20:22:02 +02002557 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002558 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002559 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002560 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002561}
2562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563
Victor Stinnerbc603d12011-10-02 01:00:40 +02002564/* Widen Unicode objects to larger buffers. Don't write terminating null
2565 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002567static void*
2568unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002569{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002570 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002571
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002572 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002573 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002574 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002575 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002576 if (!result)
2577 return PyErr_NoMemory();
2578 assert(skind == PyUnicode_1BYTE_KIND);
2579 _PyUnicode_CONVERT_BYTES(
2580 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002581 (const Py_UCS1 *)data,
2582 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002583 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002585 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002586 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002587 if (!result)
2588 return PyErr_NoMemory();
2589 if (skind == PyUnicode_2BYTE_KIND) {
2590 _PyUnicode_CONVERT_BYTES(
2591 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002592 (const Py_UCS2 *)data,
2593 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002594 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002596 else {
2597 assert(skind == PyUnicode_1BYTE_KIND);
2598 _PyUnicode_CONVERT_BYTES(
2599 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002600 (const Py_UCS1 *)data,
2601 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002602 result);
2603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002605 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002606 Py_UNREACHABLE();
2607 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609}
2610
2611static Py_UCS4*
2612as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2613 int copy_null)
2614{
2615 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002616 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 Py_ssize_t len, targetlen;
2618 if (PyUnicode_READY(string) == -1)
2619 return NULL;
2620 kind = PyUnicode_KIND(string);
2621 data = PyUnicode_DATA(string);
2622 len = PyUnicode_GET_LENGTH(string);
2623 targetlen = len;
2624 if (copy_null)
2625 targetlen++;
2626 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002627 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 if (!target) {
2629 PyErr_NoMemory();
2630 return NULL;
2631 }
2632 }
2633 else {
2634 if (targetsize < targetlen) {
2635 PyErr_Format(PyExc_SystemError,
2636 "string is longer than the buffer");
2637 if (copy_null && 0 < targetsize)
2638 target[0] = 0;
2639 return NULL;
2640 }
2641 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002642 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002643 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002644 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002646 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002647 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002648 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2649 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002650 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002651 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002652 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002653 else {
2654 Py_UNREACHABLE();
2655 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 if (copy_null)
2657 target[len] = 0;
2658 return target;
2659}
2660
2661Py_UCS4*
2662PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2663 int copy_null)
2664{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002665 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002666 PyErr_BadInternalCall();
2667 return NULL;
2668 }
2669 return as_ucs4(string, target, targetsize, copy_null);
2670}
2671
2672Py_UCS4*
2673PyUnicode_AsUCS4Copy(PyObject *string)
2674{
2675 return as_ucs4(string, NULL, 0, 1);
2676}
2677
/* Maximum number of characters required for the output of %lld or %p;
   used as the size of the sprintf() buffers in unicode_fromformat_arg().
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign. 53/22 is an upper bound for log10(256).
   NOTE(review): the macro adds 2, not 1; the second extra character is
   presumably the trailing null byte written by sprintf() -- confirm. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002682
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002683static int
2684unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2685 Py_ssize_t width, Py_ssize_t precision)
2686{
2687 Py_ssize_t length, fill, arglen;
2688 Py_UCS4 maxchar;
2689
2690 if (PyUnicode_READY(str) == -1)
2691 return -1;
2692
2693 length = PyUnicode_GET_LENGTH(str);
2694 if ((precision == -1 || precision >= length)
2695 && width <= length)
2696 return _PyUnicodeWriter_WriteStr(writer, str);
2697
2698 if (precision != -1)
2699 length = Py_MIN(precision, length);
2700
2701 arglen = Py_MAX(length, width);
2702 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2703 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2704 else
2705 maxchar = writer->maxchar;
2706
2707 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2708 return -1;
2709
2710 if (width > length) {
2711 fill = width - length;
2712 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2713 return -1;
2714 writer->pos += fill;
2715 }
2716
2717 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2718 str, 0, length);
2719 writer->pos += length;
2720 return 0;
2721}
2722
2723static int
Victor Stinner998b8062018-09-12 00:23:25 +02002724unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002725 Py_ssize_t width, Py_ssize_t precision)
2726{
2727 /* UTF-8 */
2728 Py_ssize_t length;
2729 PyObject *unicode;
2730 int res;
2731
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002732 if (precision == -1) {
2733 length = strlen(str);
2734 }
2735 else {
2736 length = 0;
2737 while (length < precision && str[length]) {
2738 length++;
2739 }
2740 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002741 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2742 if (unicode == NULL)
2743 return -1;
2744
2745 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2746 Py_DECREF(unicode);
2747 return res;
2748}
2749
Victor Stinner96865452011-03-01 23:44:09 +00002750static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002751unicode_fromformat_arg(_PyUnicodeWriter *writer,
2752 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002753{
Victor Stinnere215d962012-10-06 23:03:36 +02002754 const char *p;
2755 Py_ssize_t len;
2756 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002757 Py_ssize_t width;
2758 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002759 int longflag;
2760 int longlongflag;
2761 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002762 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002763
2764 p = f;
2765 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002766 zeropad = 0;
2767 if (*f == '0') {
2768 zeropad = 1;
2769 f++;
2770 }
Victor Stinner96865452011-03-01 23:44:09 +00002771
2772 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002773 width = -1;
2774 if (Py_ISDIGIT((unsigned)*f)) {
2775 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002776 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002777 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002778 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002779 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002781 return NULL;
2782 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002783 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002784 f++;
2785 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002786 }
2787 precision = -1;
2788 if (*f == '.') {
2789 f++;
2790 if (Py_ISDIGIT((unsigned)*f)) {
2791 precision = (*f - '0');
2792 f++;
2793 while (Py_ISDIGIT((unsigned)*f)) {
2794 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2795 PyErr_SetString(PyExc_ValueError,
2796 "precision too big");
2797 return NULL;
2798 }
2799 precision = (precision * 10) + (*f - '0');
2800 f++;
2801 }
2802 }
Victor Stinner96865452011-03-01 23:44:09 +00002803 if (*f == '%') {
2804 /* "%.3%s" => f points to "3" */
2805 f--;
2806 }
2807 }
2808 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002809 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002810 f--;
2811 }
Victor Stinner96865452011-03-01 23:44:09 +00002812
2813 /* Handle %ld, %lu, %lld and %llu. */
2814 longflag = 0;
2815 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002816 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002817 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002818 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002819 longflag = 1;
2820 ++f;
2821 }
Victor Stinner96865452011-03-01 23:44:09 +00002822 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002823 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002824 longlongflag = 1;
2825 f += 2;
2826 }
Victor Stinner96865452011-03-01 23:44:09 +00002827 }
2828 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002829 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002830 size_tflag = 1;
2831 ++f;
2832 }
Victor Stinnere215d962012-10-06 23:03:36 +02002833
2834 if (f[1] == '\0')
2835 writer->overallocate = 0;
2836
2837 switch (*f) {
2838 case 'c':
2839 {
2840 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002841 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002842 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002843 "character argument not in range(0x110000)");
2844 return NULL;
2845 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002846 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002847 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002848 break;
2849 }
2850
2851 case 'i':
2852 case 'd':
2853 case 'u':
2854 case 'x':
2855 {
2856 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002857 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002858 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002859
2860 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002861 if (longflag) {
2862 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2863 }
2864 else if (longlongflag) {
2865 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2866 }
2867 else if (size_tflag) {
2868 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2869 }
2870 else {
2871 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2872 }
Victor Stinnere215d962012-10-06 23:03:36 +02002873 }
2874 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002875 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002876 }
2877 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002878 if (longflag) {
2879 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2880 }
2881 else if (longlongflag) {
2882 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2883 }
2884 else if (size_tflag) {
2885 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2886 }
2887 else {
2888 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2889 }
Victor Stinnere215d962012-10-06 23:03:36 +02002890 }
2891 assert(len >= 0);
2892
Victor Stinnere215d962012-10-06 23:03:36 +02002893 if (precision < len)
2894 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002895
2896 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002897 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2898 return NULL;
2899
Victor Stinnere215d962012-10-06 23:03:36 +02002900 if (width > precision) {
2901 Py_UCS4 fillchar;
2902 fill = width - precision;
2903 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002904 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2905 return NULL;
2906 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002907 }
Victor Stinner15a11362012-10-06 23:48:20 +02002908 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002909 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002910 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2911 return NULL;
2912 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002913 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002914
Victor Stinner4a587072013-11-19 12:54:53 +01002915 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2916 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002917 break;
2918 }
2919
2920 case 'p':
2921 {
2922 char number[MAX_LONG_LONG_CHARS];
2923
2924 len = sprintf(number, "%p", va_arg(*vargs, void*));
2925 assert(len >= 0);
2926
2927 /* %p is ill-defined: ensure leading 0x. */
2928 if (number[1] == 'X')
2929 number[1] = 'x';
2930 else if (number[1] != 'x') {
2931 memmove(number + 2, number,
2932 strlen(number) + 1);
2933 number[0] = '0';
2934 number[1] = 'x';
2935 len += 2;
2936 }
2937
Victor Stinner4a587072013-11-19 12:54:53 +01002938 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002939 return NULL;
2940 break;
2941 }
2942
2943 case 's':
2944 {
2945 /* UTF-8 */
2946 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002947 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002948 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002949 break;
2950 }
2951
2952 case 'U':
2953 {
2954 PyObject *obj = va_arg(*vargs, PyObject *);
2955 assert(obj && _PyUnicode_CHECK(obj));
2956
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002957 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002958 return NULL;
2959 break;
2960 }
2961
2962 case 'V':
2963 {
2964 PyObject *obj = va_arg(*vargs, PyObject *);
2965 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002966 if (obj) {
2967 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002968 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002969 return NULL;
2970 }
2971 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002972 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002973 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002974 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002975 }
2976 break;
2977 }
2978
2979 case 'S':
2980 {
2981 PyObject *obj = va_arg(*vargs, PyObject *);
2982 PyObject *str;
2983 assert(obj);
2984 str = PyObject_Str(obj);
2985 if (!str)
2986 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002987 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002988 Py_DECREF(str);
2989 return NULL;
2990 }
2991 Py_DECREF(str);
2992 break;
2993 }
2994
2995 case 'R':
2996 {
2997 PyObject *obj = va_arg(*vargs, PyObject *);
2998 PyObject *repr;
2999 assert(obj);
3000 repr = PyObject_Repr(obj);
3001 if (!repr)
3002 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003003 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003004 Py_DECREF(repr);
3005 return NULL;
3006 }
3007 Py_DECREF(repr);
3008 break;
3009 }
3010
3011 case 'A':
3012 {
3013 PyObject *obj = va_arg(*vargs, PyObject *);
3014 PyObject *ascii;
3015 assert(obj);
3016 ascii = PyObject_ASCII(obj);
3017 if (!ascii)
3018 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003019 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003020 Py_DECREF(ascii);
3021 return NULL;
3022 }
3023 Py_DECREF(ascii);
3024 break;
3025 }
3026
3027 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003028 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003029 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003030 break;
3031
3032 default:
3033 /* if we stumble upon an unknown formatting code, copy the rest
3034 of the format string to the output string. (we cannot just
3035 skip the code, since there's no way to know what's in the
3036 argument list) */
3037 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003038 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003039 return NULL;
3040 f = p+len;
3041 return f;
3042 }
3043
3044 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003045 return f;
3046}
3047
Walter Dörwaldd2034312007-05-18 16:29:38 +00003048PyObject *
3049PyUnicode_FromFormatV(const char *format, va_list vargs)
3050{
Victor Stinnere215d962012-10-06 23:03:36 +02003051 va_list vargs2;
3052 const char *f;
3053 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003054
Victor Stinner8f674cc2013-04-17 23:02:17 +02003055 _PyUnicodeWriter_Init(&writer);
3056 writer.min_length = strlen(format) + 100;
3057 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003058
Benjamin Peterson0c212142016-09-20 20:39:33 -07003059 // Copy varags to be able to pass a reference to a subfunction.
3060 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003061
3062 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003063 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003064 f = unicode_fromformat_arg(&writer, f, &vargs2);
3065 if (f == NULL)
3066 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003067 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003068 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003069 const char *p;
3070 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003071
Victor Stinnere215d962012-10-06 23:03:36 +02003072 p = f;
3073 do
3074 {
3075 if ((unsigned char)*p > 127) {
3076 PyErr_Format(PyExc_ValueError,
3077 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3078 "string, got a non-ASCII byte: 0x%02x",
3079 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003080 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003081 }
3082 p++;
3083 }
3084 while (*p != '\0' && *p != '%');
3085 len = p - f;
3086
3087 if (*p == '\0')
3088 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003089
3090 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003091 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003092
3093 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003094 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003095 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003096 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003097 return _PyUnicodeWriter_Finish(&writer);
3098
3099 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003100 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003101 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003102 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003103}
3104
Walter Dörwaldd2034312007-05-18 16:29:38 +00003105PyObject *
3106PyUnicode_FromFormat(const char *format, ...)
3107{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003108 PyObject* ret;
3109 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003110
3111#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003112 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003113#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003114 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003115#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003116 ret = PyUnicode_FromFormatV(format, vargs);
3117 va_end(vargs);
3118 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003119}
3120
Serhiy Storchakac46db922018-10-23 22:58:24 +03003121static Py_ssize_t
3122unicode_get_widechar_size(PyObject *unicode)
3123{
3124 Py_ssize_t res;
3125
3126 assert(unicode != NULL);
3127 assert(_PyUnicode_CHECK(unicode));
3128
3129 if (_PyUnicode_WSTR(unicode) != NULL) {
3130 return PyUnicode_WSTR_LENGTH(unicode);
3131 }
3132 assert(PyUnicode_IS_READY(unicode));
3133
3134 res = _PyUnicode_LENGTH(unicode);
3135#if SIZEOF_WCHAR_T == 2
3136 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3137 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3138 const Py_UCS4 *end = s + res;
3139 for (; s < end; ++s) {
3140 if (*s > 0xFFFF) {
3141 ++res;
3142 }
3143 }
3144 }
3145#endif
3146 return res;
3147}
3148
3149static void
3150unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3151{
3152 const wchar_t *wstr;
3153
3154 assert(unicode != NULL);
3155 assert(_PyUnicode_CHECK(unicode));
3156
3157 wstr = _PyUnicode_WSTR(unicode);
3158 if (wstr != NULL) {
3159 memcpy(w, wstr, size * sizeof(wchar_t));
3160 return;
3161 }
3162 assert(PyUnicode_IS_READY(unicode));
3163
3164 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3165 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3166 for (; size--; ++s, ++w) {
3167 *w = *s;
3168 }
3169 }
3170 else {
3171#if SIZEOF_WCHAR_T == 4
3172 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3173 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3174 for (; size--; ++s, ++w) {
3175 *w = *s;
3176 }
3177#else
3178 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3179 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3180 for (; size--; ++s, ++w) {
3181 Py_UCS4 ch = *s;
3182 if (ch > 0xFFFF) {
3183 assert(ch <= MAX_UNICODE);
3184 /* encode surrogate pair in this case */
3185 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3186 if (!size--)
3187 break;
3188 *w = Py_UNICODE_LOW_SURROGATE(ch);
3189 }
3190 else {
3191 *w = ch;
3192 }
3193 }
3194#endif
3195 }
3196}
3197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003198#ifdef HAVE_WCHAR_H
3199
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003200/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003201
Victor Stinnerd88d9832011-09-06 02:00:05 +02003202 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003203 character) required to convert the unicode object. Ignore size argument.
3204
Victor Stinnerd88d9832011-09-06 02:00:05 +02003205 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003206 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003207 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003208Py_ssize_t
3209PyUnicode_AsWideChar(PyObject *unicode,
3210 wchar_t *w,
3211 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003212{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003213 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003214
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003215 if (unicode == NULL) {
3216 PyErr_BadInternalCall();
3217 return -1;
3218 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003219 if (!PyUnicode_Check(unicode)) {
3220 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003221 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003222 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003223
3224 res = unicode_get_widechar_size(unicode);
3225 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003226 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003227 }
3228
3229 if (size > res) {
3230 size = res + 1;
3231 }
3232 else {
3233 res = size;
3234 }
3235 unicode_copy_as_widechar(unicode, w, size);
3236 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003237}
3238
Victor Stinner137c34c2010-09-29 10:25:54 +00003239wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003240PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003241 Py_ssize_t *size)
3242{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003243 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003244 Py_ssize_t buflen;
3245
3246 if (unicode == NULL) {
3247 PyErr_BadInternalCall();
3248 return NULL;
3249 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003250 if (!PyUnicode_Check(unicode)) {
3251 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003252 return NULL;
3253 }
3254
Serhiy Storchakac46db922018-10-23 22:58:24 +03003255 buflen = unicode_get_widechar_size(unicode);
3256 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003257 if (buffer == NULL) {
3258 PyErr_NoMemory();
3259 return NULL;
3260 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003261 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3262 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003263 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003264 }
3265 else if (wcslen(buffer) != (size_t)buflen) {
3266 PyMem_FREE(buffer);
3267 PyErr_SetString(PyExc_ValueError,
3268 "embedded null character");
3269 return NULL;
3270 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003271 return buffer;
3272}
3273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003274#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275
Alexander Belopolsky40018472011-02-26 01:02:56 +00003276PyObject *
3277PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003278{
Victor Stinner8faf8212011-12-08 22:14:11 +01003279 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003280 PyErr_SetString(PyExc_ValueError,
3281 "chr() arg not in range(0x110000)");
3282 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003283 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003284
Victor Stinner985a82a2014-01-03 12:53:47 +01003285 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003286}
3287
Alexander Belopolsky40018472011-02-26 01:02:56 +00003288PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003289PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003291 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003292 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003293 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003294 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003295 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003296 Py_INCREF(obj);
3297 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003298 }
3299 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003300 /* For a Unicode subtype that's not a Unicode object,
3301 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003302 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003303 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003304 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003305 "Can't convert '%.100s' object to str implicitly",
3306 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003307 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003308}
3309
Alexander Belopolsky40018472011-02-26 01:02:56 +00003310PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003311PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003312 const char *encoding,
3313 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003314{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003315 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003316 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003317
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003319 PyErr_BadInternalCall();
3320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003322
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003323 /* Decoding bytes objects is the most common case and should be fast */
3324 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003325 if (PyBytes_GET_SIZE(obj) == 0) {
3326 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3327 return NULL;
3328 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003329 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003330 }
3331 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003332 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3333 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003334 }
3335
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003336 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003337 PyErr_SetString(PyExc_TypeError,
3338 "decoding str is not supported");
3339 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003340 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003341
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003342 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3343 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3344 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003345 "decoding to str: need a bytes-like object, %.80s found",
3346 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003347 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003348 }
Tim Petersced69f82003-09-16 20:30:58 +00003349
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003350 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003351 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003352 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3353 return NULL;
3354 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003355 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003357
Serhiy Storchaka05997252013-01-26 12:14:02 +02003358 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003359 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003360 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361}
3362
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters between two copied characters becomes a single '_', and
   such runs at the start or end of the name are dropped. The result is
   written to `lower` (capacity `lower_len` bytes) and null-terminated.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst;
    char *dst_last;     /* slot reserved for the terminating null byte */
    int pending_sep;    /* a separator run was seen since the last copy */

    assert(encoding != NULL);

    dst = lower;
    dst_last = &lower[lower_len - 1];
    pending_sep = 0;

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        /* Emit the collapsed separator, except at the very beginning. */
        if (pending_sep && dst != lower) {
            if (dst == dst_last) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_last) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3411
Alexander Belopolsky40018472011-02-26 01:02:56 +00003412PyObject *
3413PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003414 Py_ssize_t size,
3415 const char *encoding,
3416 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003417{
3418 PyObject *buffer = NULL, *unicode;
3419 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003420 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3421
Victor Stinner22eb6892019-06-26 00:51:05 +02003422 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3423 return NULL;
3424 }
3425
Victor Stinnered076ed2019-06-26 01:49:32 +02003426 if (size == 0) {
3427 _Py_RETURN_UNICODE_EMPTY();
3428 }
3429
Victor Stinner942889a2016-09-05 15:40:10 -07003430 if (encoding == NULL) {
3431 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3432 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003433
Fred Drakee4315f52000-05-09 19:53:39 +00003434 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003435 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3436 char *lower = buflower;
3437
3438 /* Fast paths */
3439 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3440 lower += 3;
3441 if (*lower == '_') {
3442 /* Match "utf8" and "utf_8" */
3443 lower++;
3444 }
3445
3446 if (lower[0] == '8' && lower[1] == 0) {
3447 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3448 }
3449 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3450 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3451 }
3452 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3453 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3454 }
3455 }
3456 else {
3457 if (strcmp(lower, "ascii") == 0
3458 || strcmp(lower, "us_ascii") == 0) {
3459 return PyUnicode_DecodeASCII(s, size, errors);
3460 }
Steve Dowercc16be82016-09-08 10:35:16 -07003461 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003462 else if (strcmp(lower, "mbcs") == 0) {
3463 return PyUnicode_DecodeMBCS(s, size, errors);
3464 }
3465 #endif
3466 else if (strcmp(lower, "latin1") == 0
3467 || strcmp(lower, "latin_1") == 0
3468 || strcmp(lower, "iso_8859_1") == 0
3469 || strcmp(lower, "iso8859_1") == 0) {
3470 return PyUnicode_DecodeLatin1(s, size, errors);
3471 }
3472 }
Victor Stinner37296e82010-06-10 13:36:23 +00003473 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474
3475 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003476 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003477 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003478 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003479 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003480 if (buffer == NULL)
3481 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003482 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483 if (unicode == NULL)
3484 goto onError;
3485 if (!PyUnicode_Check(unicode)) {
3486 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003487 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003488 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003489 encoding,
3490 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 Py_DECREF(unicode);
3492 goto onError;
3493 }
3494 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003495 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003496
Benjamin Peterson29060642009-01-31 22:14:21 +00003497 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498 Py_XDECREF(buffer);
3499 return NULL;
3500}
3501
Alexander Belopolsky40018472011-02-26 01:02:56 +00003502PyObject *
3503PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003504 const char *encoding,
3505 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003506{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003507 if (!PyUnicode_Check(unicode)) {
3508 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003509 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003510 }
3511
Serhiy Storchaka00939072016-10-27 21:05:49 +03003512 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3513 "PyUnicode_AsDecodedObject() is deprecated; "
3514 "use PyCodec_Decode() to decode from str", 1) < 0)
3515 return NULL;
3516
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003517 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003518 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003519
3520 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003521 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003522}
3523
Alexander Belopolsky40018472011-02-26 01:02:56 +00003524PyObject *
3525PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003526 const char *encoding,
3527 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003528{
3529 PyObject *v;
3530
3531 if (!PyUnicode_Check(unicode)) {
3532 PyErr_BadArgument();
3533 goto onError;
3534 }
3535
Serhiy Storchaka00939072016-10-27 21:05:49 +03003536 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3537 "PyUnicode_AsDecodedUnicode() is deprecated; "
3538 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3539 return NULL;
3540
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003541 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003542 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003543
3544 /* Decode via the codec registry */
3545 v = PyCodec_Decode(unicode, encoding, errors);
3546 if (v == NULL)
3547 goto onError;
3548 if (!PyUnicode_Check(v)) {
3549 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003550 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003551 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003552 encoding,
3553 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003554 Py_DECREF(v);
3555 goto onError;
3556 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003557 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003558
Benjamin Peterson29060642009-01-31 22:14:21 +00003559 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003560 return NULL;
3561}
3562
Alexander Belopolsky40018472011-02-26 01:02:56 +00003563PyObject *
3564PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003565 Py_ssize_t size,
3566 const char *encoding,
3567 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568{
3569 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003570
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003571 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3575 Py_DECREF(unicode);
3576 return v;
3577}
3578
Alexander Belopolsky40018472011-02-26 01:02:56 +00003579PyObject *
3580PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003581 const char *encoding,
3582 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003583{
3584 PyObject *v;
3585
3586 if (!PyUnicode_Check(unicode)) {
3587 PyErr_BadArgument();
3588 goto onError;
3589 }
3590
Serhiy Storchaka00939072016-10-27 21:05:49 +03003591 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3592 "PyUnicode_AsEncodedObject() is deprecated; "
3593 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3594 "or PyCodec_Encode() for generic encoding", 1) < 0)
3595 return NULL;
3596
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003597 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003598 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003599
3600 /* Encode via the codec registry */
3601 v = PyCodec_Encode(unicode, encoding, errors);
3602 if (v == NULL)
3603 goto onError;
3604 return v;
3605
Benjamin Peterson29060642009-01-31 22:14:21 +00003606 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003607 return NULL;
3608}
3609
Victor Stinner1b579672011-12-17 05:47:23 +01003610
Victor Stinner2cba6b82018-01-10 22:46:15 +01003611static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003612unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003613 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003614{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003615 Py_ssize_t wlen;
3616 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3617 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003618 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003619 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003620
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003621 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003622 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003623 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003624 return NULL;
3625 }
3626
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003627 char *str;
3628 size_t error_pos;
3629 const char *reason;
3630 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003631 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003632 PyMem_Free(wstr);
3633
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003634 if (res != 0) {
3635 if (res == -2) {
3636 PyObject *exc;
3637 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3638 "locale", unicode,
3639 (Py_ssize_t)error_pos,
3640 (Py_ssize_t)(error_pos+1),
3641 reason);
3642 if (exc != NULL) {
3643 PyCodec_StrictErrors(exc);
3644 Py_DECREF(exc);
3645 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003646 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003647 else if (res == -3) {
3648 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3649 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003650 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003651 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003652 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003653 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003654 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003655
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003656 PyObject *bytes = PyBytes_FromString(str);
3657 PyMem_RawFree(str);
3658 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003659}
3660
Victor Stinnerad158722010-10-27 00:25:46 +00003661PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003662PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3663{
Victor Stinner709d23d2019-05-02 14:56:30 -04003664 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3665 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003666}
3667
3668PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003669PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003670{
Victor Stinner81a7be32020-04-14 15:14:01 +02003671 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003672 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3673 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003674 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003675 fs_codec->error_handler,
3676 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003677 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003678#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003679 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003680 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003681 fs_codec->encoding,
3682 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003683 }
Victor Stinnerad158722010-10-27 00:25:46 +00003684#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003685 else {
3686 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3687 machinery is not ready and so cannot be used:
3688 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003689 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3690 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003691 assert(filesystem_errors != NULL);
3692 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3693 assert(errors != _Py_ERROR_UNKNOWN);
3694#ifdef _Py_FORCE_UTF8_FS_ENCODING
3695 return unicode_encode_utf8(unicode, errors, NULL);
3696#else
3697 return unicode_encode_locale(unicode, errors, 0);
3698#endif
3699 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003700}
3701
Alexander Belopolsky40018472011-02-26 01:02:56 +00003702PyObject *
3703PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003704 const char *encoding,
3705 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706{
3707 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003708 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003709
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 if (!PyUnicode_Check(unicode)) {
3711 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 }
Fred Drakee4315f52000-05-09 19:53:39 +00003714
Victor Stinner22eb6892019-06-26 00:51:05 +02003715 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3716 return NULL;
3717 }
3718
Victor Stinner942889a2016-09-05 15:40:10 -07003719 if (encoding == NULL) {
3720 return _PyUnicode_AsUTF8String(unicode, errors);
3721 }
3722
Fred Drakee4315f52000-05-09 19:53:39 +00003723 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003724 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3725 char *lower = buflower;
3726
3727 /* Fast paths */
3728 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3729 lower += 3;
3730 if (*lower == '_') {
3731 /* Match "utf8" and "utf_8" */
3732 lower++;
3733 }
3734
3735 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003736 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003737 }
3738 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3739 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3740 }
3741 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3742 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3743 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003744 }
Victor Stinner942889a2016-09-05 15:40:10 -07003745 else {
3746 if (strcmp(lower, "ascii") == 0
3747 || strcmp(lower, "us_ascii") == 0) {
3748 return _PyUnicode_AsASCIIString(unicode, errors);
3749 }
Steve Dowercc16be82016-09-08 10:35:16 -07003750#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003751 else if (strcmp(lower, "mbcs") == 0) {
3752 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3753 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003754#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003755 else if (strcmp(lower, "latin1") == 0 ||
3756 strcmp(lower, "latin_1") == 0 ||
3757 strcmp(lower, "iso_8859_1") == 0 ||
3758 strcmp(lower, "iso8859_1") == 0) {
3759 return _PyUnicode_AsLatin1String(unicode, errors);
3760 }
3761 }
Victor Stinner37296e82010-06-10 13:36:23 +00003762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763
3764 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003765 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003767 return NULL;
3768
3769 /* The normal path */
3770 if (PyBytes_Check(v))
3771 return v;
3772
3773 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003774 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003775 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003776 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003777
3778 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003779 "encoder %s returned bytearray instead of bytes; "
3780 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003781 encoding);
3782 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003783 Py_DECREF(v);
3784 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003785 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003786
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003787 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3788 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003789 Py_DECREF(v);
3790 return b;
3791 }
3792
3793 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003794 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003795 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003796 encoding,
3797 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003798 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003799 return NULL;
3800}
3801
Alexander Belopolsky40018472011-02-26 01:02:56 +00003802PyObject *
3803PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003804 const char *encoding,
3805 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003806{
3807 PyObject *v;
3808
3809 if (!PyUnicode_Check(unicode)) {
3810 PyErr_BadArgument();
3811 goto onError;
3812 }
3813
Serhiy Storchaka00939072016-10-27 21:05:49 +03003814 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3815 "PyUnicode_AsEncodedUnicode() is deprecated; "
3816 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3817 return NULL;
3818
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003819 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003820 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003821
3822 /* Encode via the codec registry */
3823 v = PyCodec_Encode(unicode, encoding, errors);
3824 if (v == NULL)
3825 goto onError;
3826 if (!PyUnicode_Check(v)) {
3827 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003828 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003829 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003830 encoding,
3831 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003832 Py_DECREF(v);
3833 goto onError;
3834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003836
Benjamin Peterson29060642009-01-31 22:14:21 +00003837 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 return NULL;
3839}
3840
Victor Stinner2cba6b82018-01-10 22:46:15 +01003841static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003842unicode_decode_locale(const char *str, Py_ssize_t len,
3843 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003844{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003845 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3846 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003847 return NULL;
3848 }
3849
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003850 wchar_t *wstr;
3851 size_t wlen;
3852 const char *reason;
3853 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003854 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003855 if (res != 0) {
3856 if (res == -2) {
3857 PyObject *exc;
3858 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3859 "locale", str, len,
3860 (Py_ssize_t)wlen,
3861 (Py_ssize_t)(wlen + 1),
3862 reason);
3863 if (exc != NULL) {
3864 PyCodec_StrictErrors(exc);
3865 Py_DECREF(exc);
3866 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003867 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003868 else if (res == -3) {
3869 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3870 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003871 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003872 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003873 }
Victor Stinner2f197072011-12-17 07:08:30 +01003874 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003875 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003876
3877 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3878 PyMem_RawFree(wstr);
3879 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003880}
3881
3882PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003883PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3884 const char *errors)
3885{
Victor Stinner709d23d2019-05-02 14:56:30 -04003886 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3887 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003888}
3889
3890PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003891PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003892{
3893 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003894 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3895 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003896}
3897
3898
3899PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003900PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003901 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003902 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3903}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003904
Christian Heimes5894ba72007-11-04 11:43:14 +00003905PyObject*
3906PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3907{
Victor Stinner81a7be32020-04-14 15:14:01 +02003908 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003909 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3910 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003911 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003912 fs_codec->error_handler,
3913 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04003914 NULL);
3915 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003916#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003917 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003918 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003919 fs_codec->encoding,
3920 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003921 }
Victor Stinnerad158722010-10-27 00:25:46 +00003922#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003923 else {
3924 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3925 machinery is not ready and so cannot be used:
3926 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003927 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3928 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003929 assert(filesystem_errors != NULL);
3930 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3931 assert(errors != _Py_ERROR_UNKNOWN);
3932#ifdef _Py_FORCE_UTF8_FS_ENCODING
3933 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3934#else
3935 return unicode_decode_locale(s, size, errors, 0);
3936#endif
3937 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003938}
3939
Martin v. Löwis011e8422009-05-05 04:43:17 +00003940
3941int
3942PyUnicode_FSConverter(PyObject* arg, void* addr)
3943{
Brett Cannonec6ce872016-09-06 15:50:29 -07003944 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003945 PyObject *output = NULL;
3946 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03003947 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003948 if (arg == NULL) {
3949 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003950 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003951 return 1;
3952 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003953 path = PyOS_FSPath(arg);
3954 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003955 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003956 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003957 if (PyBytes_Check(path)) {
3958 output = path;
3959 }
3960 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3961 output = PyUnicode_EncodeFSDefault(path);
3962 Py_DECREF(path);
3963 if (!output) {
3964 return 0;
3965 }
3966 assert(PyBytes_Check(output));
3967 }
3968
Victor Stinner0ea2a462010-04-30 00:22:08 +00003969 size = PyBytes_GET_SIZE(output);
3970 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003971 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003972 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003973 Py_DECREF(output);
3974 return 0;
3975 }
3976 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003977 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003978}
3979
3980
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003981int
3982PyUnicode_FSDecoder(PyObject* arg, void* addr)
3983{
Brett Cannona5711202016-09-06 19:36:01 -07003984 int is_buffer = 0;
3985 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003986 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003987 if (arg == NULL) {
3988 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003989 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003990 return 1;
3991 }
Brett Cannona5711202016-09-06 19:36:01 -07003992
3993 is_buffer = PyObject_CheckBuffer(arg);
3994 if (!is_buffer) {
3995 path = PyOS_FSPath(arg);
3996 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003997 return 0;
3998 }
Brett Cannona5711202016-09-06 19:36:01 -07003999 }
4000 else {
4001 path = arg;
4002 Py_INCREF(arg);
4003 }
4004
4005 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004006 output = path;
4007 }
4008 else if (PyBytes_Check(path) || is_buffer) {
4009 PyObject *path_bytes = NULL;
4010
4011 if (!PyBytes_Check(path) &&
4012 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004013 "path should be string, bytes, or os.PathLike, not %.200s",
4014 Py_TYPE(arg)->tp_name)) {
4015 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004016 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004017 }
4018 path_bytes = PyBytes_FromObject(path);
4019 Py_DECREF(path);
4020 if (!path_bytes) {
4021 return 0;
4022 }
4023 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4024 PyBytes_GET_SIZE(path_bytes));
4025 Py_DECREF(path_bytes);
4026 if (!output) {
4027 return 0;
4028 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004029 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004030 else {
4031 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004032 "path should be string, bytes, or os.PathLike, not %.200s",
4033 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004034 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004035 return 0;
4036 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004037 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004038 Py_DECREF(output);
4039 return 0;
4040 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004042 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004043 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004044 Py_DECREF(output);
4045 return 0;
4046 }
4047 *(PyObject**)addr = output;
4048 return Py_CLEANUP_SUPPORTED;
4049}
4050
4051
Inada Naoki02a4d572020-02-27 13:48:59 +09004052static int unicode_fill_utf8(PyObject *unicode);
4053
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004054const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004056{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004057 if (!PyUnicode_Check(unicode)) {
4058 PyErr_BadArgument();
4059 return NULL;
4060 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004061 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004062 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004064 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004065 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066 return NULL;
4067 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 }
4069
4070 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004071 *psize = PyUnicode_UTF8_LENGTH(unicode);
4072 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004073}
4074
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004075const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004077{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4079}
4080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004081Py_UNICODE *
4082PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004084 if (!PyUnicode_Check(unicode)) {
4085 PyErr_BadArgument();
4086 return NULL;
4087 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004088 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4089 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004090 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004091 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004092 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093
Serhiy Storchakac46db922018-10-23 22:58:24 +03004094 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4095 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4096 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004098 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004099 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4100 if (w == NULL) {
4101 PyErr_NoMemory();
4102 return NULL;
4103 }
4104 unicode_copy_as_widechar(unicode, w, wlen + 1);
4105 _PyUnicode_WSTR(unicode) = w;
4106 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4107 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004108 }
4109 }
4110 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004111 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004112 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004113}
4114
Inada Naoki2c4928d2020-06-17 20:09:44 +09004115/* Deprecated APIs */
4116
4117_Py_COMP_DIAG_PUSH
4118_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4119
Alexander Belopolsky40018472011-02-26 01:02:56 +00004120Py_UNICODE *
4121PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004123 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124}
4125
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004126const Py_UNICODE *
4127_PyUnicode_AsUnicode(PyObject *unicode)
4128{
4129 Py_ssize_t size;
4130 const Py_UNICODE *wstr;
4131
4132 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4133 if (wstr && wcslen(wstr) != (size_t)size) {
4134 PyErr_SetString(PyExc_ValueError, "embedded null character");
4135 return NULL;
4136 }
4137 return wstr;
4138}
4139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004140
Alexander Belopolsky40018472011-02-26 01:02:56 +00004141Py_ssize_t
4142PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143{
4144 if (!PyUnicode_Check(unicode)) {
4145 PyErr_BadArgument();
4146 goto onError;
4147 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004148 if (_PyUnicode_WSTR(unicode) == NULL) {
4149 if (PyUnicode_AsUnicode(unicode) == NULL)
4150 goto onError;
4151 }
4152 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153
Benjamin Peterson29060642009-01-31 22:14:21 +00004154 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155 return -1;
4156}
4157
Inada Naoki2c4928d2020-06-17 20:09:44 +09004158_Py_COMP_DIAG_POP
4159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004160Py_ssize_t
4161PyUnicode_GetLength(PyObject *unicode)
4162{
Victor Stinner07621332012-06-16 04:53:46 +02004163 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004164 PyErr_BadArgument();
4165 return -1;
4166 }
Victor Stinner07621332012-06-16 04:53:46 +02004167 if (PyUnicode_READY(unicode) == -1)
4168 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004169 return PyUnicode_GET_LENGTH(unicode);
4170}
4171
4172Py_UCS4
4173PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4174{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004175 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004176 int kind;
4177
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004178 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004179 PyErr_BadArgument();
4180 return (Py_UCS4)-1;
4181 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004182 if (PyUnicode_READY(unicode) == -1) {
4183 return (Py_UCS4)-1;
4184 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004185 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004186 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004187 return (Py_UCS4)-1;
4188 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004189 data = PyUnicode_DATA(unicode);
4190 kind = PyUnicode_KIND(unicode);
4191 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004192}
4193
4194int
4195PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4196{
4197 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004198 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004199 return -1;
4200 }
Victor Stinner488fa492011-12-12 00:01:39 +01004201 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004202 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004203 PyErr_SetString(PyExc_IndexError, "string index out of range");
4204 return -1;
4205 }
Victor Stinner488fa492011-12-12 00:01:39 +01004206 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004207 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004208 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4209 PyErr_SetString(PyExc_ValueError, "character out of range");
4210 return -1;
4211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004212 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4213 index, ch);
4214 return 0;
4215}
4216
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4222
Victor Stinner554f3f02010-06-16 23:33:54 +00004223/* create or adjust a UnicodeDecodeError */
4224static void
4225make_decode_exception(PyObject **exceptionObject,
4226 const char *encoding,
4227 const char *input, Py_ssize_t length,
4228 Py_ssize_t startpos, Py_ssize_t endpos,
4229 const char *reason)
4230{
4231 if (*exceptionObject == NULL) {
4232 *exceptionObject = PyUnicodeDecodeError_Create(
4233 encoding, input, length, startpos, endpos, reason);
4234 }
4235 else {
4236 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4237 goto onError;
4238 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4239 goto onError;
4240 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4241 goto onError;
4242 }
4243 return;
4244
4245onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004246 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004247}
4248
Steve Dowercc16be82016-09-08 10:35:16 -07004249#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004250static int
4251widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4252{
4253 if (newsize > *size) {
4254 wchar_t *newbuf = *buf;
4255 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4256 PyErr_NoMemory();
4257 return -1;
4258 }
4259 *buf = newbuf;
4260 }
4261 *size = newsize;
4262 return 0;
4263}
4264
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004265/* error handling callback helper:
4266 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004267 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268 and adjust various state variables.
4269 return 0 on success, -1 on error
4270*/
4271
Alexander Belopolsky40018472011-02-26 01:02:56 +00004272static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004273unicode_decode_call_errorhandler_wchar(
4274 const char *errors, PyObject **errorHandler,
4275 const char *encoding, const char *reason,
4276 const char **input, const char **inend, Py_ssize_t *startinpos,
4277 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004278 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004280 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004281
4282 PyObject *restuple = NULL;
4283 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004284 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004285 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004286 Py_ssize_t requiredsize;
4287 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004288 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004289 wchar_t *repwstr;
4290 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004291
4292 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004293 *errorHandler = PyCodec_LookupError(errors);
4294 if (*errorHandler == NULL)
4295 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 }
4297
Victor Stinner554f3f02010-06-16 23:33:54 +00004298 make_decode_exception(exceptionObject,
4299 encoding,
4300 *input, *inend - *input,
4301 *startinpos, *endinpos,
4302 reason);
4303 if (*exceptionObject == NULL)
4304 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305
Petr Viktorinffd97532020-02-11 17:46:57 +01004306 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004307 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004309 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004310 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004311 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004313 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004315
4316 /* Copy back the bytes variables, which might have been modified by the
4317 callback */
4318 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4319 if (!inputobj)
4320 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004321 *input = PyBytes_AS_STRING(inputobj);
4322 insize = PyBytes_GET_SIZE(inputobj);
4323 *inend = *input + insize;
4324 /* we can DECREF safely, as the exception has another reference,
4325 so the object won't go away. */
4326 Py_DECREF(inputobj);
4327
4328 if (newpos<0)
4329 newpos = insize+newpos;
4330 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004331 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004332 goto onError;
4333 }
4334
4335 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4336 if (repwstr == NULL)
4337 goto onError;
4338 /* need more space? (at least enough for what we
4339 have+the replacement+the rest of the string (starting
4340 at the new input position), so we won't have to check space
4341 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004342 requiredsize = *outpos;
4343 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4344 goto overflow;
4345 requiredsize += repwlen;
4346 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4347 goto overflow;
4348 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004349 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004350 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004351 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004352 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004353 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004354 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004355 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004356 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004357 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004358 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004359 *endinpos = newpos;
4360 *inptr = *input + newpos;
4361
4362 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004363 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004364 return 0;
4365
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004366 overflow:
4367 PyErr_SetString(PyExc_OverflowError,
4368 "decoded result is too long for a Python string");
4369
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004370 onError:
4371 Py_XDECREF(restuple);
4372 return -1;
4373}
Steve Dowercc16be82016-09-08 10:35:16 -07004374#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004375
4376static int
4377unicode_decode_call_errorhandler_writer(
4378 const char *errors, PyObject **errorHandler,
4379 const char *encoding, const char *reason,
4380 const char **input, const char **inend, Py_ssize_t *startinpos,
4381 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4382 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4383{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004384 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004385
4386 PyObject *restuple = NULL;
4387 PyObject *repunicode = NULL;
4388 Py_ssize_t insize;
4389 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004390 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004391 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004392 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004393 int need_to_grow = 0;
4394 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004395
4396 if (*errorHandler == NULL) {
4397 *errorHandler = PyCodec_LookupError(errors);
4398 if (*errorHandler == NULL)
4399 goto onError;
4400 }
4401
4402 make_decode_exception(exceptionObject,
4403 encoding,
4404 *input, *inend - *input,
4405 *startinpos, *endinpos,
4406 reason);
4407 if (*exceptionObject == NULL)
4408 goto onError;
4409
Petr Viktorinffd97532020-02-11 17:46:57 +01004410 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004411 if (restuple == NULL)
4412 goto onError;
4413 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004414 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004415 goto onError;
4416 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004417 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004418 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004419
4420 /* Copy back the bytes variables, which might have been modified by the
4421 callback */
4422 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4423 if (!inputobj)
4424 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004425 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004426 *input = PyBytes_AS_STRING(inputobj);
4427 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004428 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004429 /* we can DECREF safely, as the exception has another reference,
4430 so the object won't go away. */
4431 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004432
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004433 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004435 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004436 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004438 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004439
Victor Stinner170ca6f2013-04-18 00:25:28 +02004440 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004441 if (replen > 1) {
4442 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004443 need_to_grow = 1;
4444 }
4445 new_inptr = *input + newpos;
4446 if (*inend - new_inptr > remain) {
4447 /* We don't know the decoding algorithm here so we make the worst
4448 assumption that one byte decodes to one unicode character.
4449 If unfortunately one byte could decode to more unicode characters,
4450 the decoder may write out-of-bound then. Is it possible for the
4451 algorithms using this function? */
4452 writer->min_length += *inend - new_inptr - remain;
4453 need_to_grow = 1;
4454 }
4455 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004456 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004457 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004458 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4459 goto onError;
4460 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004461 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004462 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004463
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004464 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004465 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004466
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004468 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004469 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004473 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474}
4475
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004476/* --- UTF-7 Codec -------------------------------------------------------- */
4477
Antoine Pitrou244651a2009-05-04 18:56:13 +00004478/* See RFC2152 for details. We encode conservatively and decode liberally. */
4479
4480/* Three simple macros defining base-64. */
4481
4482/* Is c a base-64 character? */
4483
/* True if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  The argument may be evaluated more than
   once. */
#define IS_BASE64(c)                      \
    (('A' <= (c) && (c) <= 'Z') ||        \
     ('a' <= (c) && (c) <= 'z') ||        \
     ('0' <= (c) && (c) <= '9') ||        \
     '+' == (c) || '/' == (c))
4489
4490/* given that c is a base-64 character, what is its base-64 value? */
4491
/* Map a base-64 character to its 6-bit value: A-Z -> 0..25, a-z -> 26..51,
   0-9 -> 52..61, '+' -> 62 and anything else -> 63 (so the caller must
   already know that c is a base-64 character).  The argument may be
   evaluated more than once. */
#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('+' == (c)) ? 62 : 63)
4497
4498/* What is the base-64 character of the bottom 6 bits of n? */
4499
/* Base-64 character for the low 6 bits of n; higher bits are ignored. */
#define TO_BASE64(n)                    \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
      "abcdefghijklmnopqrstuvwxyz"      \
      "0123456789+/")[(n) & 0x3f])
4502
4503/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4504 * decoded as itself. We are permissive on decoding; the only ASCII
4505 * byte not decoding to itself is the + which begins a base64
4506 * string. */
4507
/* True if byte c decodes as itself: any value up to 127 except '+'.
   The argument may be evaluated more than once. */
#define DECODE_DIRECT(c) \
    ((c) < 128 && (c) != '+')
4510
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524
/* Category of each ASCII code for the UTF-7 encoder:
   0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be base-64
   encoded).  The table is only ever read, hence const. */
static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to be encoded directly; Set D characters always
 * are, whitespace only if directWS is true and Set O only if directO is
 * true.  Every argument is parenthesized in the expansion so that
 * compound expressions (e.g. `a || b`) can be passed safely; c may be
 * evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556
Alexander Belopolsky40018472011-02-26 01:02:56 +00004557PyObject *
4558PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004559 Py_ssize_t size,
4560 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004562 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4563}
4564
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565/* The decoder. The only state we preserve is our read position,
4566 * i.e. how many characters we have consumed. So if we end in the
4567 * middle of a shift sequence we have to back off the read position
4568 * and the output to the beginning of the sequence, otherwise we lose
4569 * all the shift state (seen bits, number of bits seen, high
4570 * surrogate). */
4571
Alexander Belopolsky40018472011-02-26 01:02:56 +00004572PyObject *
4573PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004574 Py_ssize_t size,
4575 const char *errors,
4576 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004577{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004579 Py_ssize_t startinpos;
4580 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004581 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004582 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004583 const char *errmsg = "";
4584 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004585 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004586 unsigned int base64bits = 0;
4587 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004588 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 PyObject *errorHandler = NULL;
4590 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004591
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004592 if (size == 0) {
4593 if (consumed)
4594 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004595 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004596 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004597
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004598 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004599 _PyUnicodeWriter_Init(&writer);
4600 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004601
4602 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004603 e = s + size;
4604
4605 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004606 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004608 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004609
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 if (inShift) { /* in a base-64 section */
4611 if (IS_BASE64(ch)) { /* consume a base-64 character */
4612 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4613 base64bits += 6;
4614 s++;
4615 if (base64bits >= 16) {
4616 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004617 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 base64bits -= 16;
4619 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004620 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004621 if (surrogate) {
4622 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004623 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4624 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004625 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004626 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004627 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004628 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004629 }
4630 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004631 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004632 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 }
4635 }
Victor Stinner551ac952011-11-29 22:58:13 +01004636 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004637 /* first surrogate */
4638 surrogate = outCh;
4639 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004640 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004641 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004642 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004643 }
4644 }
4645 }
4646 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004648 if (base64bits > 0) { /* left-over bits */
4649 if (base64bits >= 6) {
4650 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004651 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004652 errmsg = "partial character in shift sequence";
4653 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004654 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004655 else {
4656 /* Some bits remain; they should be zero */
4657 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004658 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004659 errmsg = "non-zero padding bits in shift sequence";
4660 goto utf7Error;
4661 }
4662 }
4663 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004664 if (surrogate && DECODE_DIRECT(ch)) {
4665 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4666 goto onError;
4667 }
4668 surrogate = 0;
4669 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004670 /* '-' is absorbed; other terminating
4671 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004672 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004673 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004674 }
4675 }
4676 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004677 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 s++; /* consume '+' */
4679 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004680 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004681 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004682 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004683 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004684 else if (s < e && !IS_BASE64(*s)) {
4685 s++;
4686 errmsg = "ill-formed sequence";
4687 goto utf7Error;
4688 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004689 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004690 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004691 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004692 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004693 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004694 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695 }
4696 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004698 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004699 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004700 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004701 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004702 else {
4703 startinpos = s-starts;
4704 s++;
4705 errmsg = "unexpected special character";
4706 goto utf7Error;
4707 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004708 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004709utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004710 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004711 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004712 errors, &errorHandler,
4713 "utf7", errmsg,
4714 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004715 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004716 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004717 }
4718
Antoine Pitrou244651a2009-05-04 18:56:13 +00004719 /* end of string */
4720
4721 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4722 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004723 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004724 if (surrogate ||
4725 (base64bits >= 6) ||
4726 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004727 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004728 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004729 errors, &errorHandler,
4730 "utf7", "unterminated shift sequence",
4731 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004732 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004733 goto onError;
4734 if (s < e)
4735 goto restart;
4736 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004737 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004738
4739 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004740 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004741 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004742 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004743 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004744 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004745 writer.kind, writer.data, shiftOutStart);
4746 Py_XDECREF(errorHandler);
4747 Py_XDECREF(exc);
4748 _PyUnicodeWriter_Dealloc(&writer);
4749 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004750 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004751 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004752 }
4753 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004754 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004755 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004756 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004758 Py_XDECREF(errorHandler);
4759 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004760 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004761
Benjamin Peterson29060642009-01-31 22:14:21 +00004762 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004763 Py_XDECREF(errorHandler);
4764 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004765 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004766 return NULL;
4767}
4768
4769
Alexander Belopolsky40018472011-02-26 01:02:56 +00004770PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004771_PyUnicode_EncodeUTF7(PyObject *str,
4772 int base64SetO,
4773 int base64WhiteSpace,
4774 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004775{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004776 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004777 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004778 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004779 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004780 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004781 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004782 unsigned int base64bits = 0;
4783 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004784 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004785 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004786
Benjamin Petersonbac79492012-01-14 13:34:47 -05004787 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004788 return NULL;
4789 kind = PyUnicode_KIND(str);
4790 data = PyUnicode_DATA(str);
4791 len = PyUnicode_GET_LENGTH(str);
4792
4793 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004795
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004796 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004797 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004798 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004799 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004800 if (v == NULL)
4801 return NULL;
4802
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004803 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004804 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004805 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004806
Antoine Pitrou244651a2009-05-04 18:56:13 +00004807 if (inShift) {
4808 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4809 /* shifting out */
4810 if (base64bits) { /* output remaining bits */
4811 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4812 base64buffer = 0;
4813 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004814 }
4815 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004816 /* Characters not in the BASE64 set implicitly unshift the sequence
4817 so no '-' is required, except if the character is itself a '-' */
4818 if (IS_BASE64(ch) || ch == '-') {
4819 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004820 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004821 *out++ = (char) ch;
4822 }
4823 else {
4824 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004825 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004826 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004827 else { /* not in a shift sequence */
4828 if (ch == '+') {
4829 *out++ = '+';
4830 *out++ = '-';
4831 }
4832 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4833 *out++ = (char) ch;
4834 }
4835 else {
4836 *out++ = '+';
4837 inShift = 1;
4838 goto encode_char;
4839 }
4840 }
4841 continue;
4842encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004843 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004844 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004845
Antoine Pitrou244651a2009-05-04 18:56:13 +00004846 /* code first surrogate */
4847 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004848 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004849 while (base64bits >= 6) {
4850 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4851 base64bits -= 6;
4852 }
4853 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004854 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004855 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004856 base64bits += 16;
4857 base64buffer = (base64buffer << 16) | ch;
4858 while (base64bits >= 6) {
4859 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4860 base64bits -= 6;
4861 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004862 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004863 if (base64bits)
4864 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4865 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004866 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004867 if (_PyBytes_Resize(&v, out - start) < 0)
4868 return NULL;
4869 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004870}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004871PyObject *
4872PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4873 Py_ssize_t size,
4874 int base64SetO,
4875 int base64WhiteSpace,
4876 const char *errors)
4877{
4878 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004879 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004880 if (tmp == NULL)
4881 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004882 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004883 base64WhiteSpace, errors);
4884 Py_DECREF(tmp);
4885 return result;
4886}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004887
Antoine Pitrou244651a2009-05-04 18:56:13 +00004888#undef IS_BASE64
4889#undef FROM_BASE64
4890#undef TO_BASE64
4891#undef DECODE_DIRECT
4892#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004893
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894/* --- UTF-8 Codec -------------------------------------------------------- */
4895
Alexander Belopolsky40018472011-02-26 01:02:56 +00004896PyObject *
4897PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004898 Py_ssize_t size,
4899 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900{
Walter Dörwald69652032004-09-07 20:24:22 +00004901 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4902}
4903
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004904#include "stringlib/asciilib.h"
4905#include "stringlib/codecs.h"
4906#include "stringlib/undef.h"
4907
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004908#include "stringlib/ucs1lib.h"
4909#include "stringlib/codecs.h"
4910#include "stringlib/undef.h"
4911
4912#include "stringlib/ucs2lib.h"
4913#include "stringlib/codecs.h"
4914#include "stringlib/undef.h"
4915
4916#include "stringlib/ucs4lib.h"
4917#include "stringlib/codecs.h"
4918#include "stringlib/undef.h"
4919
Antoine Pitrouab868312009-01-10 15:40:25 +00004920/* Mask to quickly check whether a C 'long' contains a
4921 non-ASCII, UTF8-encoded char. */
4922#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004923# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004924#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004925# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004926#else
4927# error C 'long' size should be either 4 or 8!
4928#endif
4929
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004930static Py_ssize_t
4931ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004932{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004933 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004934 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004935
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004936 /*
4937 * Issue #17237: m68k is a bit different from most architectures in
4938 * that objects do not use "natural alignment" - for example, int and
4939 * long are only aligned at 2-byte boundaries. Therefore the assert()
4940 * won't work; also, tests have shown that skipping the "optimised
4941 * version" will even speed up m68k.
4942 */
4943#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004944#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004945 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4946 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947 /* Fast path, see in STRINGLIB(utf8_decode) for
4948 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004949 /* Help allocation */
4950 const char *_p = p;
4951 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004952 while (_p < aligned_end) {
4953 unsigned long value = *(const unsigned long *) _p;
4954 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 *((unsigned long *)q) = value;
4957 _p += SIZEOF_LONG;
4958 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004959 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 p = _p;
4961 while (p < end) {
4962 if ((unsigned char)*p & 0x80)
4963 break;
4964 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004966 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004968#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004969#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004970 while (p < end) {
4971 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4972 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004973 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004974 /* Help allocation */
4975 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004976 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004977 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004978 if (value & ASCII_CHAR_MASK)
4979 break;
4980 _p += SIZEOF_LONG;
4981 }
4982 p = _p;
4983 if (_p == end)
4984 break;
4985 }
4986 if ((unsigned char)*p & 0x80)
4987 break;
4988 ++p;
4989 }
4990 memcpy(dest, start, p - start);
4991 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992}
Antoine Pitrouab868312009-01-10 15:40:25 +00004993
Victor Stinner709d23d2019-05-02 14:56:30 -04004994static PyObject *
4995unicode_decode_utf8(const char *s, Py_ssize_t size,
4996 _Py_error_handler error_handler, const char *errors,
4997 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004998{
Victor Stinner785938e2011-12-11 20:09:03 +01004999 if (size == 0) {
5000 if (consumed)
5001 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005002 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005003 }
5004
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5006 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005007 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005008 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005009 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005011 }
5012
Inada Naoki770847a2019-06-24 12:30:24 +09005013 const char *starts = s;
5014 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005015
Inada Naoki770847a2019-06-24 12:30:24 +09005016 // fast path: try ASCII string.
5017 PyObject *u = PyUnicode_New(size, 127);
5018 if (u == NULL) {
5019 return NULL;
5020 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005021 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005022 if (s == end) {
5023 return u;
5024 }
5025
5026 // Use _PyUnicodeWriter after fast path is failed.
5027 _PyUnicodeWriter writer;
5028 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5029 writer.pos = s - starts;
5030
5031 Py_ssize_t startinpos, endinpos;
5032 const char *errmsg = "";
5033 PyObject *error_handler_obj = NULL;
5034 PyObject *exc = NULL;
5035
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005036 while (s < end) {
5037 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005038 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005039
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005040 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005041 if (PyUnicode_IS_ASCII(writer.buffer))
5042 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005044 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005045 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005046 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005047 } else {
5048 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005049 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005050 }
5051
5052 switch (ch) {
5053 case 0:
5054 if (s == end || consumed)
5055 goto End;
5056 errmsg = "unexpected end of data";
5057 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005058 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005059 break;
5060 case 1:
5061 errmsg = "invalid start byte";
5062 startinpos = s - starts;
5063 endinpos = startinpos + 1;
5064 break;
5065 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005066 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5067 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5068 {
5069 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005070 goto End;
5071 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005072 /* fall through */
5073 case 3:
5074 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005075 errmsg = "invalid continuation byte";
5076 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005077 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005078 break;
5079 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005080 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 goto onError;
5082 continue;
5083 }
5084
Victor Stinner1d65d912015-10-05 13:43:50 +02005085 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005086 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005087
5088 switch (error_handler) {
5089 case _Py_ERROR_IGNORE:
5090 s += (endinpos - startinpos);
5091 break;
5092
5093 case _Py_ERROR_REPLACE:
5094 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5095 goto onError;
5096 s += (endinpos - startinpos);
5097 break;
5098
5099 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005100 {
5101 Py_ssize_t i;
5102
Victor Stinner1d65d912015-10-05 13:43:50 +02005103 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5104 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005105 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005106 ch = (Py_UCS4)(unsigned char)(starts[i]);
5107 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5108 ch + 0xdc00);
5109 writer.pos++;
5110 }
5111 s += (endinpos - startinpos);
5112 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005113 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005114
5115 default:
5116 if (unicode_decode_call_errorhandler_writer(
5117 errors, &error_handler_obj,
5118 "utf-8", errmsg,
5119 &starts, &end, &startinpos, &endinpos, &exc, &s,
5120 &writer))
5121 goto onError;
5122 }
Victor Stinner785938e2011-12-11 20:09:03 +01005123 }
5124
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005125End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005126 if (consumed)
5127 *consumed = s - starts;
5128
Victor Stinner1d65d912015-10-05 13:43:50 +02005129 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005130 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005131 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005132
5133onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005134 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005135 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005136 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005137 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005138}
5139
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005140
Victor Stinner709d23d2019-05-02 14:56:30 -04005141PyObject *
5142PyUnicode_DecodeUTF8Stateful(const char *s,
5143 Py_ssize_t size,
5144 const char *errors,
5145 Py_ssize_t *consumed)
5146{
5147 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5148}
5149
5150
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005151/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5152 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005153
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005154 On success, write a pointer to a newly allocated wide character string into
5155 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5156 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005157
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005158 On memory allocation failure, return -1.
5159
5160 On decoding error (if surrogateescape is zero), return -2. If wlen is
5161 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5162 is not NULL, write the decoding error message into *reason. */
5163int
5164_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005165 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005166{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005167 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005168 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005169 wchar_t *unicode;
5170 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005171
Victor Stinner3d4226a2018-08-29 22:21:32 +02005172 int surrogateescape = 0;
5173 int surrogatepass = 0;
5174 switch (errors)
5175 {
5176 case _Py_ERROR_STRICT:
5177 break;
5178 case _Py_ERROR_SURROGATEESCAPE:
5179 surrogateescape = 1;
5180 break;
5181 case _Py_ERROR_SURROGATEPASS:
5182 surrogatepass = 1;
5183 break;
5184 default:
5185 return -3;
5186 }
5187
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005188 /* Note: size will always be longer than the resulting Unicode
5189 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005190 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005191 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005192 }
5193
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005194 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005195 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005196 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005197 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005198
5199 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005200 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005201 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005202 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005203 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005204#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005205 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005206#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005207 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005208#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005209 if (ch > 0xFF) {
5210#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005211 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005212#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005213 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005214 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005215 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5216 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5217#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005218 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005219 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005220 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005221 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005222 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005223
5224 if (surrogateescape) {
5225 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5226 }
5227 else {
5228 /* Is it a valid three-byte code? */
5229 if (surrogatepass
5230 && (e - s) >= 3
5231 && (s[0] & 0xf0) == 0xe0
5232 && (s[1] & 0xc0) == 0x80
5233 && (s[2] & 0xc0) == 0x80)
5234 {
5235 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5236 s += 3;
5237 unicode[outpos++] = ch;
5238 }
5239 else {
5240 PyMem_RawFree(unicode );
5241 if (reason != NULL) {
5242 switch (ch) {
5243 case 0:
5244 *reason = "unexpected end of data";
5245 break;
5246 case 1:
5247 *reason = "invalid start byte";
5248 break;
5249 /* 2, 3, 4 */
5250 default:
5251 *reason = "invalid continuation byte";
5252 break;
5253 }
5254 }
5255 if (wlen != NULL) {
5256 *wlen = s - orig_s;
5257 }
5258 return -2;
5259 }
5260 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005261 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005262 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005263 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005264 if (wlen) {
5265 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005266 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005267 *wstr = unicode;
5268 return 0;
5269}
5270
Victor Stinner5f9cf232019-03-19 01:46:25 +01005271
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005272wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005273_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5274 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005275{
5276 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005277 int res = _Py_DecodeUTF8Ex(arg, arglen,
5278 &wstr, wlen,
5279 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005280 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005281 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5282 assert(res != -3);
5283 if (wlen) {
5284 *wlen = (size_t)res;
5285 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005286 return NULL;
5287 }
5288 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005289}
5290
Antoine Pitrouab868312009-01-10 15:40:25 +00005291
Victor Stinnere47e6982017-12-21 15:45:16 +01005292/* UTF-8 encoder using the surrogateescape error handler .
5293
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005294 On success, return 0 and write the newly allocated character string (use
5295 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005296
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005297 On encoding failure, return -2 and write the position of the invalid
5298 surrogate character into *error_pos (if error_pos is set) and the decoding
5299 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005300
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005301 On memory allocation failure, return -1. */
5302int
5303_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005304 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005305{
5306 const Py_ssize_t max_char_size = 4;
5307 Py_ssize_t len = wcslen(text);
5308
5309 assert(len >= 0);
5310
Victor Stinner3d4226a2018-08-29 22:21:32 +02005311 int surrogateescape = 0;
5312 int surrogatepass = 0;
5313 switch (errors)
5314 {
5315 case _Py_ERROR_STRICT:
5316 break;
5317 case _Py_ERROR_SURROGATEESCAPE:
5318 surrogateescape = 1;
5319 break;
5320 case _Py_ERROR_SURROGATEPASS:
5321 surrogatepass = 1;
5322 break;
5323 default:
5324 return -3;
5325 }
5326
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005327 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5328 return -1;
5329 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005330 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005331 if (raw_malloc) {
5332 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005333 }
5334 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005335 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005336 }
5337 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005338 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005339 }
5340
5341 char *p = bytes;
5342 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005343 for (i = 0; i < len; ) {
5344 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005345 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005346 i++;
5347#if Py_UNICODE_SIZE == 2
5348 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5349 && i < len
5350 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5351 {
5352 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5353 i++;
5354 }
5355#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005356
5357 if (ch < 0x80) {
5358 /* Encode ASCII */
5359 *p++ = (char) ch;
5360
5361 }
5362 else if (ch < 0x0800) {
5363 /* Encode Latin-1 */
5364 *p++ = (char)(0xc0 | (ch >> 6));
5365 *p++ = (char)(0x80 | (ch & 0x3f));
5366 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005367 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005368 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005369 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005370 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005371 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005372 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005373 if (reason != NULL) {
5374 *reason = "encoding error";
5375 }
5376 if (raw_malloc) {
5377 PyMem_RawFree(bytes);
5378 }
5379 else {
5380 PyMem_Free(bytes);
5381 }
5382 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005383 }
5384 *p++ = (char)(ch & 0xff);
5385 }
5386 else if (ch < 0x10000) {
5387 *p++ = (char)(0xe0 | (ch >> 12));
5388 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5389 *p++ = (char)(0x80 | (ch & 0x3f));
5390 }
5391 else { /* ch >= 0x10000 */
5392 assert(ch <= MAX_UNICODE);
5393 /* Encode UCS4 Unicode ordinals */
5394 *p++ = (char)(0xf0 | (ch >> 18));
5395 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5396 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5397 *p++ = (char)(0x80 | (ch & 0x3f));
5398 }
5399 }
5400 *p++ = '\0';
5401
5402 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005403 char *bytes2;
5404 if (raw_malloc) {
5405 bytes2 = PyMem_RawRealloc(bytes, final_size);
5406 }
5407 else {
5408 bytes2 = PyMem_Realloc(bytes, final_size);
5409 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005410 if (bytes2 == NULL) {
5411 if (error_pos != NULL) {
5412 *error_pos = (size_t)-1;
5413 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005414 if (raw_malloc) {
5415 PyMem_RawFree(bytes);
5416 }
5417 else {
5418 PyMem_Free(bytes);
5419 }
5420 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005421 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005422 *str = bytes2;
5423 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005424}
5425
5426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005427/* Primary internal function which creates utf8 encoded bytes objects.
5428
5429 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005430 and allocate exactly as much space needed at the end. Else allocate the
5431 maximum possible needed (4 result bytes per Unicode character), and return
5432 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005433*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005434static PyObject *
5435unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5436 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005438 if (!PyUnicode_Check(unicode)) {
5439 PyErr_BadArgument();
5440 return NULL;
5441 }
5442
5443 if (PyUnicode_READY(unicode) == -1)
5444 return NULL;
5445
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005446 if (PyUnicode_UTF8(unicode))
5447 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5448 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005449
Inada Naoki02a4d572020-02-27 13:48:59 +09005450 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005451 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005452 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5453
5454 _PyBytesWriter writer;
5455 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005456
Benjamin Petersonead6b532011-12-20 17:23:42 -06005457 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005458 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005459 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005460 case PyUnicode_1BYTE_KIND:
5461 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5462 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005463 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5464 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005465 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005466 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5467 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005468 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005469 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5470 break;
Tim Peters602f7402002-04-27 18:03:26 +00005471 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005472
5473 if (end == NULL) {
5474 _PyBytesWriter_Dealloc(&writer);
5475 return NULL;
5476 }
5477 return _PyBytesWriter_Finish(&writer, end);
5478}
5479
5480static int
5481unicode_fill_utf8(PyObject *unicode)
5482{
5483 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5484 assert(!PyUnicode_IS_ASCII(unicode));
5485
5486 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005487 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005488 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5489
5490 _PyBytesWriter writer;
5491 char *end;
5492
5493 switch (kind) {
5494 default:
5495 Py_UNREACHABLE();
5496 case PyUnicode_1BYTE_KIND:
5497 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5498 _Py_ERROR_STRICT, NULL);
5499 break;
5500 case PyUnicode_2BYTE_KIND:
5501 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5502 _Py_ERROR_STRICT, NULL);
5503 break;
5504 case PyUnicode_4BYTE_KIND:
5505 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5506 _Py_ERROR_STRICT, NULL);
5507 break;
5508 }
5509 if (end == NULL) {
5510 _PyBytesWriter_Dealloc(&writer);
5511 return -1;
5512 }
5513
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005514 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005515 PyBytes_AS_STRING(writer.buffer);
5516 Py_ssize_t len = end - start;
5517
5518 char *cache = PyObject_MALLOC(len + 1);
5519 if (cache == NULL) {
5520 _PyBytesWriter_Dealloc(&writer);
5521 PyErr_NoMemory();
5522 return -1;
5523 }
5524 _PyUnicode_UTF8(unicode) = cache;
5525 _PyUnicode_UTF8_LENGTH(unicode) = len;
5526 memcpy(cache, start, len);
5527 cache[len] = '\0';
5528 _PyBytesWriter_Dealloc(&writer);
5529 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530}
5531
Alexander Belopolsky40018472011-02-26 01:02:56 +00005532PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005533_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5534{
5535 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5536}
5537
5538
5539PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005540PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5541 Py_ssize_t size,
5542 const char *errors)
5543{
5544 PyObject *v, *unicode;
5545
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005546 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005547 if (unicode == NULL)
5548 return NULL;
5549 v = _PyUnicode_AsUTF8String(unicode, errors);
5550 Py_DECREF(unicode);
5551 return v;
5552}
5553
5554PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005555PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558}
5559
Walter Dörwald41980ca2007-08-16 21:55:45 +00005560/* --- UTF-32 Codec ------------------------------------------------------- */
5561
5562PyObject *
5563PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 Py_ssize_t size,
5565 const char *errors,
5566 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005567{
5568 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5569}
5570
5571PyObject *
5572PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 Py_ssize_t size,
5574 const char *errors,
5575 int *byteorder,
5576 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005577{
5578 const char *starts = s;
5579 Py_ssize_t startinpos;
5580 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005581 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005582 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005583 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005584 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005585 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005586 PyObject *errorHandler = NULL;
5587 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005588
Andy Lestere6be9b52020-02-11 20:28:35 -06005589 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005590 e = q + size;
5591
5592 if (byteorder)
5593 bo = *byteorder;
5594
5595 /* Check for BOM marks (U+FEFF) in the input and adjust current
5596 byte order setting accordingly. In native mode, the leading BOM
5597 mark is skipped, in all other modes, it is copied to the output
5598 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005599 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005600 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005601 if (bom == 0x0000FEFF) {
5602 bo = -1;
5603 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005605 else if (bom == 0xFFFE0000) {
5606 bo = 1;
5607 q += 4;
5608 }
5609 if (byteorder)
5610 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005611 }
5612
Victor Stinnere64322e2012-10-30 23:12:47 +01005613 if (q == e) {
5614 if (consumed)
5615 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005616 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005617 }
5618
Victor Stinnere64322e2012-10-30 23:12:47 +01005619#ifdef WORDS_BIGENDIAN
5620 le = bo < 0;
5621#else
5622 le = bo <= 0;
5623#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005624 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005625
Victor Stinner8f674cc2013-04-17 23:02:17 +02005626 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005627 writer.min_length = (e - q + 3) / 4;
5628 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005630
Victor Stinnere64322e2012-10-30 23:12:47 +01005631 while (1) {
5632 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005634
Victor Stinnere64322e2012-10-30 23:12:47 +01005635 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005636 enum PyUnicode_Kind kind = writer.kind;
5637 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005638 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005639 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005640 if (le) {
5641 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005642 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005643 if (ch > maxch)
5644 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005645 if (kind != PyUnicode_1BYTE_KIND &&
5646 Py_UNICODE_IS_SURROGATE(ch))
5647 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005648 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005649 q += 4;
5650 } while (q <= last);
5651 }
5652 else {
5653 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005654 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005655 if (ch > maxch)
5656 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005657 if (kind != PyUnicode_1BYTE_KIND &&
5658 Py_UNICODE_IS_SURROGATE(ch))
5659 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005660 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005661 q += 4;
5662 } while (q <= last);
5663 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005664 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005665 }
5666
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005667 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005668 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005669 startinpos = ((const char *)q) - starts;
5670 endinpos = startinpos + 4;
5671 }
5672 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005673 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005675 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005677 startinpos = ((const char *)q) - starts;
5678 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005680 else {
5681 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005682 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005683 goto onError;
5684 q += 4;
5685 continue;
5686 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005687 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005688 startinpos = ((const char *)q) - starts;
5689 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005691
5692 /* The remaining input chars are ignored if the callback
5693 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005694 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005696 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005698 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005700 }
5701
Walter Dörwald41980ca2007-08-16 21:55:45 +00005702 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005704
Walter Dörwald41980ca2007-08-16 21:55:45 +00005705 Py_XDECREF(errorHandler);
5706 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005708
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005710 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005711 Py_XDECREF(errorHandler);
5712 Py_XDECREF(exc);
5713 return NULL;
5714}
5715
5716PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005717_PyUnicode_EncodeUTF32(PyObject *str,
5718 const char *errors,
5719 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005720{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005721 enum PyUnicode_Kind kind;
5722 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005723 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005724 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005725 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005726#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005727 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005728#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005729 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005730#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005731 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005732 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005733 PyObject *errorHandler = NULL;
5734 PyObject *exc = NULL;
5735 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005736
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005737 if (!PyUnicode_Check(str)) {
5738 PyErr_BadArgument();
5739 return NULL;
5740 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005741 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005742 return NULL;
5743 kind = PyUnicode_KIND(str);
5744 data = PyUnicode_DATA(str);
5745 len = PyUnicode_GET_LENGTH(str);
5746
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005747 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005748 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005749 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005750 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005751 if (v == NULL)
5752 return NULL;
5753
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005754 /* output buffer is 4-bytes aligned */
5755 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005756 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005757 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005758 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005759 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005760 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005761
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005762 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005763 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005764 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005765 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005766 else
5767 encoding = "utf-32";
5768
5769 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005770 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5771 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005772 }
5773
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005774 pos = 0;
5775 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005776 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005777
5778 if (kind == PyUnicode_2BYTE_KIND) {
5779 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5780 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005781 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005782 else {
5783 assert(kind == PyUnicode_4BYTE_KIND);
5784 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5785 &out, native_ordering);
5786 }
5787 if (pos == len)
5788 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005789
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005790 rep = unicode_encode_call_errorhandler(
5791 errors, &errorHandler,
5792 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005793 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005794 if (!rep)
5795 goto error;
5796
5797 if (PyBytes_Check(rep)) {
5798 repsize = PyBytes_GET_SIZE(rep);
5799 if (repsize & 3) {
5800 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005801 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005802 "surrogates not allowed");
5803 goto error;
5804 }
5805 moreunits = repsize / 4;
5806 }
5807 else {
5808 assert(PyUnicode_Check(rep));
5809 if (PyUnicode_READY(rep) < 0)
5810 goto error;
5811 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5812 if (!PyUnicode_IS_ASCII(rep)) {
5813 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005814 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005815 "surrogates not allowed");
5816 goto error;
5817 }
5818 }
5819
5820 /* four bytes are reserved for each surrogate */
5821 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005822 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005823 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005824 /* integer overflow */
5825 PyErr_NoMemory();
5826 goto error;
5827 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005828 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005829 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005830 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005831 }
5832
5833 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005834 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005835 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005836 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005837 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005838 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5839 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005840 }
5841
5842 Py_CLEAR(rep);
5843 }
5844
5845 /* Cut back to size actually needed. This is necessary for, for example,
5846 encoding of a string containing isolated surrogates and the 'ignore'
5847 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005848 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005849 if (nsize != PyBytes_GET_SIZE(v))
5850 _PyBytes_Resize(&v, nsize);
5851 Py_XDECREF(errorHandler);
5852 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005853 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005854 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005855 error:
5856 Py_XDECREF(rep);
5857 Py_XDECREF(errorHandler);
5858 Py_XDECREF(exc);
5859 Py_XDECREF(v);
5860 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005861}
5862
Alexander Belopolsky40018472011-02-26 01:02:56 +00005863PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005864PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5865 Py_ssize_t size,
5866 const char *errors,
5867 int byteorder)
5868{
5869 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005870 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005871 if (tmp == NULL)
5872 return NULL;
5873 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5874 Py_DECREF(tmp);
5875 return result;
5876}
5877
5878PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005879PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005880{
Victor Stinnerb960b342011-11-20 19:12:52 +01005881 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005882}
5883
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884/* --- UTF-16 Codec ------------------------------------------------------- */
5885
Tim Peters772747b2001-08-09 22:21:55 +00005886PyObject *
5887PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005888 Py_ssize_t size,
5889 const char *errors,
5890 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891{
Walter Dörwald69652032004-09-07 20:24:22 +00005892 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5893}
5894
5895PyObject *
5896PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 Py_ssize_t size,
5898 const char *errors,
5899 int *byteorder,
5900 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005901{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005902 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005903 Py_ssize_t startinpos;
5904 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005905 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005906 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005907 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005908 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005909 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910 PyObject *errorHandler = NULL;
5911 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005912 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913
Andy Lestere6be9b52020-02-11 20:28:35 -06005914 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005915 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916
5917 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005918 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005920 /* Check for BOM marks (U+FEFF) in the input and adjust current
5921 byte order setting accordingly. In native mode, the leading BOM
5922 mark is skipped, in all other modes, it is copied to the output
5923 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005924 if (bo == 0 && size >= 2) {
5925 const Py_UCS4 bom = (q[1] << 8) | q[0];
5926 if (bom == 0xFEFF) {
5927 q += 2;
5928 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005930 else if (bom == 0xFFFE) {
5931 q += 2;
5932 bo = 1;
5933 }
5934 if (byteorder)
5935 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005936 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937
Antoine Pitrou63065d72012-05-15 23:48:04 +02005938 if (q == e) {
5939 if (consumed)
5940 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005941 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005942 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005943
Christian Heimes743e0cd2012-10-17 23:52:17 +02005944#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005945 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005946 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005947#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005948 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005949 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005950#endif
Tim Peters772747b2001-08-09 22:21:55 +00005951
Antoine Pitrou63065d72012-05-15 23:48:04 +02005952 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005953 character count normally. Error handler will take care of
5954 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005955 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005956 writer.min_length = (e - q + 1) / 2;
5957 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005958 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005959
Antoine Pitrou63065d72012-05-15 23:48:04 +02005960 while (1) {
5961 Py_UCS4 ch = 0;
5962 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005963 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005964 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005965 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005966 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005967 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005968 native_ordering);
5969 else
5970 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005971 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005972 native_ordering);
5973 } else if (kind == PyUnicode_2BYTE_KIND) {
5974 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005975 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005976 native_ordering);
5977 } else {
5978 assert(kind == PyUnicode_4BYTE_KIND);
5979 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005980 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005981 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005982 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005983 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005984
Antoine Pitrou63065d72012-05-15 23:48:04 +02005985 switch (ch)
5986 {
5987 case 0:
5988 /* remaining byte at the end? (size should be even) */
5989 if (q == e || consumed)
5990 goto End;
5991 errmsg = "truncated data";
5992 startinpos = ((const char *)q) - starts;
5993 endinpos = ((const char *)e) - starts;
5994 break;
5995 /* The remaining input chars are ignored if the callback
5996 chooses to skip the input */
5997 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005998 q -= 2;
5999 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006000 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006001 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006002 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006003 endinpos = ((const char *)e) - starts;
6004 break;
6005 case 2:
6006 errmsg = "illegal encoding";
6007 startinpos = ((const char *)q) - 2 - starts;
6008 endinpos = startinpos + 2;
6009 break;
6010 case 3:
6011 errmsg = "illegal UTF-16 surrogate";
6012 startinpos = ((const char *)q) - 4 - starts;
6013 endinpos = startinpos + 2;
6014 break;
6015 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006016 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006017 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 continue;
6019 }
6020
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006021 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006022 errors,
6023 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006024 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006025 &starts,
6026 (const char **)&e,
6027 &startinpos,
6028 &endinpos,
6029 &exc,
6030 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006031 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 }
6034
Antoine Pitrou63065d72012-05-15 23:48:04 +02006035End:
Walter Dörwald69652032004-09-07 20:24:22 +00006036 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006039 Py_XDECREF(errorHandler);
6040 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006041 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006044 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 Py_XDECREF(errorHandler);
6046 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 return NULL;
6048}
6049
Tim Peters772747b2001-08-09 22:21:55 +00006050PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006051_PyUnicode_EncodeUTF16(PyObject *str,
6052 const char *errors,
6053 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006055 enum PyUnicode_Kind kind;
6056 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006057 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006058 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006059 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006060 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006061#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006062 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006063#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006064 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006065#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006066 const char *encoding;
6067 Py_ssize_t nsize, pos;
6068 PyObject *errorHandler = NULL;
6069 PyObject *exc = NULL;
6070 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006071
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006072 if (!PyUnicode_Check(str)) {
6073 PyErr_BadArgument();
6074 return NULL;
6075 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006076 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006077 return NULL;
6078 kind = PyUnicode_KIND(str);
6079 data = PyUnicode_DATA(str);
6080 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006081
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006082 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006083 if (kind == PyUnicode_4BYTE_KIND) {
6084 const Py_UCS4 *in = (const Py_UCS4 *)data;
6085 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006086 while (in < end) {
6087 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006088 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006089 }
6090 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006091 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006092 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006094 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006095 nsize = len + pairs + (byteorder == 0);
6096 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006097 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006099 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006101 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006102 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006103 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006104 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006105 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006106 }
6107 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006108 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006109 }
Tim Peters772747b2001-08-09 22:21:55 +00006110
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006111 if (kind == PyUnicode_1BYTE_KIND) {
6112 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6113 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006114 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006115
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006116 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006117 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006118 }
6119 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006120 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006121 }
6122 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006123 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006124 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006125
6126 pos = 0;
6127 while (pos < len) {
6128 Py_ssize_t repsize, moreunits;
6129
6130 if (kind == PyUnicode_2BYTE_KIND) {
6131 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6132 &out, native_ordering);
6133 }
6134 else {
6135 assert(kind == PyUnicode_4BYTE_KIND);
6136 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6137 &out, native_ordering);
6138 }
6139 if (pos == len)
6140 break;
6141
6142 rep = unicode_encode_call_errorhandler(
6143 errors, &errorHandler,
6144 encoding, "surrogates not allowed",
6145 str, &exc, pos, pos + 1, &pos);
6146 if (!rep)
6147 goto error;
6148
6149 if (PyBytes_Check(rep)) {
6150 repsize = PyBytes_GET_SIZE(rep);
6151 if (repsize & 1) {
6152 raise_encode_exception(&exc, encoding,
6153 str, pos - 1, pos,
6154 "surrogates not allowed");
6155 goto error;
6156 }
6157 moreunits = repsize / 2;
6158 }
6159 else {
6160 assert(PyUnicode_Check(rep));
6161 if (PyUnicode_READY(rep) < 0)
6162 goto error;
6163 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6164 if (!PyUnicode_IS_ASCII(rep)) {
6165 raise_encode_exception(&exc, encoding,
6166 str, pos - 1, pos,
6167 "surrogates not allowed");
6168 goto error;
6169 }
6170 }
6171
6172 /* two bytes are reserved for each surrogate */
6173 if (moreunits > 1) {
6174 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006175 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006176 /* integer overflow */
6177 PyErr_NoMemory();
6178 goto error;
6179 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006180 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006181 goto error;
6182 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6183 }
6184
6185 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006186 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006187 out += moreunits;
6188 } else /* rep is unicode */ {
6189 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6190 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6191 &out, native_ordering);
6192 }
6193
6194 Py_CLEAR(rep);
6195 }
6196
6197 /* Cut back to size actually needed. This is necessary for, for example,
6198 encoding of a string containing isolated surrogates and the 'ignore' handler
6199 is used. */
6200 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6201 if (nsize != PyBytes_GET_SIZE(v))
6202 _PyBytes_Resize(&v, nsize);
6203 Py_XDECREF(errorHandler);
6204 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006205 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006206 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006207 error:
6208 Py_XDECREF(rep);
6209 Py_XDECREF(errorHandler);
6210 Py_XDECREF(exc);
6211 Py_XDECREF(v);
6212 return NULL;
6213#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214}
6215
Alexander Belopolsky40018472011-02-26 01:02:56 +00006216PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006217PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6218 Py_ssize_t size,
6219 const char *errors,
6220 int byteorder)
6221{
6222 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006223 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224 if (tmp == NULL)
6225 return NULL;
6226 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6227 Py_DECREF(tmp);
6228 return result;
6229}
6230
6231PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006232PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006234 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235}
6236
6237/* --- Unicode Escape Codec ----------------------------------------------- */
6238
/* Pointer to the name-lookup C API obtained from the capsule named by
   PyUnicodeData_CAPSULE_NAME.  It starts out NULL; the unicode-escape
   decoder imports the capsule the first time it meets a \N escape and
   stores the result here so later lookups reuse it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00006239static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006240
Alexander Belopolsky40018472011-02-26 01:02:56 +00006241PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006242_PyUnicode_DecodeUnicodeEscape(const char *s,
6243 Py_ssize_t size,
6244 const char *errors,
6245 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006247 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006248 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006250 PyObject *errorHandler = NULL;
6251 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006252
Eric V. Smith42454af2016-10-31 09:22:08 -04006253 // so we can remember if we've seen an invalid escape char or not
6254 *first_invalid_escape = NULL;
6255
Victor Stinner62ec3312016-09-06 17:04:34 -07006256 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006257 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006258 }
6259 /* Escaped strings will always be longer than the resulting
6260 Unicode string, so we start with size here and then reduce the
6261 length after conversion to the true value.
6262 (but if the error callback returns a long replacement string
6263 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006264 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 writer.min_length = size;
6266 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6267 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006268 }
6269
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 end = s + size;
6271 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006272 unsigned char c = (unsigned char) *s++;
6273 Py_UCS4 ch;
6274 int count;
6275 Py_ssize_t startinpos;
6276 Py_ssize_t endinpos;
6277 const char *message;
6278
6279#define WRITE_ASCII_CHAR(ch) \
6280 do { \
6281 assert(ch <= 127); \
6282 assert(writer.pos < writer.size); \
6283 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6284 } while(0)
6285
6286#define WRITE_CHAR(ch) \
6287 do { \
6288 if (ch <= writer.maxchar) { \
6289 assert(writer.pos < writer.size); \
6290 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6291 } \
6292 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6293 goto onError; \
6294 } \
6295 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296
6297 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 if (c != '\\') {
6299 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 continue;
6301 }
6302
Victor Stinner62ec3312016-09-06 17:04:34 -07006303 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 if (s >= end) {
6306 message = "\\ at end of string";
6307 goto error;
6308 }
6309 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006310
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006312 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006315 case '\n': continue;
6316 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6317 case '\'': WRITE_ASCII_CHAR('\''); continue;
6318 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6319 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006320 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006321 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6322 case 't': WRITE_ASCII_CHAR('\t'); continue;
6323 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6324 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006325 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006326 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006327 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006328 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331 case '0': case '1': case '2': case '3':
6332 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006333 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006334 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 ch = (ch<<3) + *s++ - '0';
6336 if (s < end && '0' <= *s && *s <= '7') {
6337 ch = (ch<<3) + *s++ - '0';
6338 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 WRITE_CHAR(ch);
6341 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 /* hex escapes */
6344 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006346 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006347 message = "truncated \\xXX escape";
6348 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006352 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006353 message = "truncated \\uXXXX escape";
6354 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355
Benjamin Peterson29060642009-01-31 22:14:21 +00006356 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006357 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006358 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006359 message = "truncated \\UXXXXXXXX escape";
6360 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006361 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006362 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006363 ch <<= 4;
6364 if (c >= '0' && c <= '9') {
6365 ch += c - '0';
6366 }
6367 else if (c >= 'a' && c <= 'f') {
6368 ch += c - ('a' - 10);
6369 }
6370 else if (c >= 'A' && c <= 'F') {
6371 ch += c - ('A' - 10);
6372 }
6373 else {
6374 break;
6375 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006376 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006377 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006378 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006379 }
6380
6381 /* when we get here, ch is a 32-bit unicode character */
6382 if (ch > MAX_UNICODE) {
6383 message = "illegal Unicode character";
6384 goto error;
6385 }
6386
6387 WRITE_CHAR(ch);
6388 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006389
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006391 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006392 if (ucnhash_CAPI == NULL) {
6393 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006394 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6395 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006396 if (ucnhash_CAPI == NULL) {
6397 PyErr_SetString(
6398 PyExc_UnicodeError,
6399 "\\N escapes not supported (can't load unicodedata module)"
6400 );
6401 goto onError;
6402 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006403 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006404
6405 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006406 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006407 const char *start = ++s;
6408 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006409 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006411 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 namelen = s - start;
6413 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006414 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006415 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006416 ch = 0xffffffff; /* in case 'getcode' messes up */
6417 if (namelen <= INT_MAX &&
6418 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6419 &ch, 0)) {
6420 assert(ch <= MAX_UNICODE);
6421 WRITE_CHAR(ch);
6422 continue;
6423 }
6424 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006425 }
6426 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006427 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006428
6429 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006430 if (*first_invalid_escape == NULL) {
6431 *first_invalid_escape = s-1; /* Back up one char, since we've
6432 already incremented s. */
6433 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006434 WRITE_ASCII_CHAR('\\');
6435 WRITE_CHAR(c);
6436 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006438
6439 error:
6440 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006441 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006442 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006443 errors, &errorHandler,
6444 "unicodeescape", message,
6445 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006446 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006447 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006448 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006449 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006450
6451#undef WRITE_ASCII_CHAR
6452#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006454
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006455 Py_XDECREF(errorHandler);
6456 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006457 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006458
Benjamin Peterson29060642009-01-31 22:14:21 +00006459 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006460 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006461 Py_XDECREF(errorHandler);
6462 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 return NULL;
6464}
6465
Eric V. Smith42454af2016-10-31 09:22:08 -04006466PyObject *
6467PyUnicode_DecodeUnicodeEscape(const char *s,
6468 Py_ssize_t size,
6469 const char *errors)
6470{
6471 const char *first_invalid_escape;
6472 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6473 &first_invalid_escape);
6474 if (result == NULL)
6475 return NULL;
6476 if (first_invalid_escape != NULL) {
6477 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6478 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006479 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006480 Py_DECREF(result);
6481 return NULL;
6482 }
6483 }
6484 return result;
6485}
6486
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006487/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488
Alexander Belopolsky40018472011-02-26 01:02:56 +00006489PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006490PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006493 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006495 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006496 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006497 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498
Ezio Melottie7f90372012-10-05 03:33:31 +03006499 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006500 escape.
6501
Ezio Melottie7f90372012-10-05 03:33:31 +03006502 For UCS1 strings it's '\xxx', 4 bytes per source character.
6503 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6504 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006505 */
6506
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006507 if (!PyUnicode_Check(unicode)) {
6508 PyErr_BadArgument();
6509 return NULL;
6510 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006511 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006512 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006513 }
Victor Stinner358af132015-10-12 22:36:57 +02006514
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006515 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006516 if (len == 0) {
6517 return PyBytes_FromStringAndSize(NULL, 0);
6518 }
6519
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006520 kind = PyUnicode_KIND(unicode);
6521 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006522 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6523 bytes, and 1 byte characters 4. */
6524 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006525 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006526 return PyErr_NoMemory();
6527 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006528 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006529 if (repr == NULL) {
6530 return NULL;
6531 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006532
Victor Stinner62ec3312016-09-06 17:04:34 -07006533 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006534 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006535 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006536
Victor Stinner62ec3312016-09-06 17:04:34 -07006537 /* U+0000-U+00ff range */
6538 if (ch < 0x100) {
6539 if (ch >= ' ' && ch < 127) {
6540 if (ch != '\\') {
6541 /* Copy printable US ASCII as-is */
6542 *p++ = (char) ch;
6543 }
6544 /* Escape backslashes */
6545 else {
6546 *p++ = '\\';
6547 *p++ = '\\';
6548 }
6549 }
Victor Stinner358af132015-10-12 22:36:57 +02006550
Victor Stinner62ec3312016-09-06 17:04:34 -07006551 /* Map special whitespace to '\t', \n', '\r' */
6552 else if (ch == '\t') {
6553 *p++ = '\\';
6554 *p++ = 't';
6555 }
6556 else if (ch == '\n') {
6557 *p++ = '\\';
6558 *p++ = 'n';
6559 }
6560 else if (ch == '\r') {
6561 *p++ = '\\';
6562 *p++ = 'r';
6563 }
6564
6565 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6566 else {
6567 *p++ = '\\';
6568 *p++ = 'x';
6569 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6570 *p++ = Py_hexdigits[ch & 0x000F];
6571 }
Tim Petersced69f82003-09-16 20:30:58 +00006572 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006573 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006574 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 *p++ = '\\';
6576 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006577 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6578 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6579 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6580 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006582 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6583 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006584
Victor Stinner62ec3312016-09-06 17:04:34 -07006585 /* Make sure that the first two digits are zero */
6586 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006587 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006588 *p++ = 'U';
6589 *p++ = '0';
6590 *p++ = '0';
6591 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6592 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6593 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6594 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6595 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6596 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006597 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599
Victor Stinner62ec3312016-09-06 17:04:34 -07006600 assert(p - PyBytes_AS_STRING(repr) > 0);
6601 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6602 return NULL;
6603 }
6604 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605}
6606
Alexander Belopolsky40018472011-02-26 01:02:56 +00006607PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006608PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6609 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006611 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006612 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006613 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006615 }
6616
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006617 result = PyUnicode_AsUnicodeEscapeString(tmp);
6618 Py_DECREF(tmp);
6619 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620}
6621
6622/* --- Raw Unicode Escape Codec ------------------------------------------- */
6623
Alexander Belopolsky40018472011-02-26 01:02:56 +00006624PyObject *
6625PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006626 Py_ssize_t size,
6627 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006629 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006630 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006632 PyObject *errorHandler = NULL;
6633 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006634
Victor Stinner62ec3312016-09-06 17:04:34 -07006635 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006636 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006637 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006638
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 /* Escaped strings will always be longer than the resulting
6640 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641 length after conversion to the true value. (But decoding error
6642 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006643 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006644 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006645 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6646 goto onError;
6647 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006648
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 end = s + size;
6650 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006651 unsigned char c = (unsigned char) *s++;
6652 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006653 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006654 Py_ssize_t startinpos;
6655 Py_ssize_t endinpos;
6656 const char *message;
6657
6658#define WRITE_CHAR(ch) \
6659 do { \
6660 if (ch <= writer.maxchar) { \
6661 assert(writer.pos < writer.size); \
6662 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6663 } \
6664 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6665 goto onError; \
6666 } \
6667 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006670 if (c != '\\' || s >= end) {
6671 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006673 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006674
Victor Stinner62ec3312016-09-06 17:04:34 -07006675 c = (unsigned char) *s++;
6676 if (c == 'u') {
6677 count = 4;
6678 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006679 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006680 else if (c == 'U') {
6681 count = 8;
6682 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006683 }
6684 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006685 assert(writer.pos < writer.size);
6686 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6687 WRITE_CHAR(c);
6688 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006689 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006690 startinpos = s - starts - 2;
6691
6692 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6693 for (ch = 0; count && s < end; ++s, --count) {
6694 c = (unsigned char)*s;
6695 ch <<= 4;
6696 if (c >= '0' && c <= '9') {
6697 ch += c - '0';
6698 }
6699 else if (c >= 'a' && c <= 'f') {
6700 ch += c - ('a' - 10);
6701 }
6702 else if (c >= 'A' && c <= 'F') {
6703 ch += c - ('A' - 10);
6704 }
6705 else {
6706 break;
6707 }
6708 }
6709 if (!count) {
6710 if (ch <= MAX_UNICODE) {
6711 WRITE_CHAR(ch);
6712 continue;
6713 }
6714 message = "\\Uxxxxxxxx out of range";
6715 }
6716
6717 endinpos = s-starts;
6718 writer.min_length = end - s + writer.pos;
6719 if (unicode_decode_call_errorhandler_writer(
6720 errors, &errorHandler,
6721 "rawunicodeescape", message,
6722 &starts, &end, &startinpos, &endinpos, &exc, &s,
6723 &writer)) {
6724 goto onError;
6725 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006726 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006727
6728#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730 Py_XDECREF(errorHandler);
6731 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006732 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006733
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006735 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736 Py_XDECREF(errorHandler);
6737 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006739
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740}
6741
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006742
Alexander Belopolsky40018472011-02-26 01:02:56 +00006743PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006744PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745{
Victor Stinner62ec3312016-09-06 17:04:34 -07006746 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006748 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006749 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006750 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006751 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006753 if (!PyUnicode_Check(unicode)) {
6754 PyErr_BadArgument();
6755 return NULL;
6756 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006757 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006758 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006759 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006760 kind = PyUnicode_KIND(unicode);
6761 data = PyUnicode_DATA(unicode);
6762 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006763 if (kind == PyUnicode_1BYTE_KIND) {
6764 return PyBytes_FromStringAndSize(data, len);
6765 }
Victor Stinner0e368262011-11-10 20:12:49 +01006766
Victor Stinner62ec3312016-09-06 17:04:34 -07006767 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6768 bytes, and 1 byte characters 4. */
6769 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006770
Victor Stinner62ec3312016-09-06 17:04:34 -07006771 if (len > PY_SSIZE_T_MAX / expandsize) {
6772 return PyErr_NoMemory();
6773 }
6774 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6775 if (repr == NULL) {
6776 return NULL;
6777 }
6778 if (len == 0) {
6779 return repr;
6780 }
6781
6782 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006783 for (pos = 0; pos < len; pos++) {
6784 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006785
Victor Stinner62ec3312016-09-06 17:04:34 -07006786 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6787 if (ch < 0x100) {
6788 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006789 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006790 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006791 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 *p++ = '\\';
6793 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006794 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6795 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6796 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6797 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006798 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006799 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6800 else {
6801 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6802 *p++ = '\\';
6803 *p++ = 'U';
6804 *p++ = '0';
6805 *p++ = '0';
6806 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6807 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6808 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6809 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6810 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6811 *p++ = Py_hexdigits[ch & 15];
6812 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006814
Victor Stinner62ec3312016-09-06 17:04:34 -07006815 assert(p > PyBytes_AS_STRING(repr));
6816 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6817 return NULL;
6818 }
6819 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820}
6821
Alexander Belopolsky40018472011-02-26 01:02:56 +00006822PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006823PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6824 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006826 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006827 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006828 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006829 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006830 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6831 Py_DECREF(tmp);
6832 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833}
6834
6835/* --- Latin-1 Codec ------------------------------------------------------ */
6836
Alexander Belopolsky40018472011-02-26 01:02:56 +00006837PyObject *
6838PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006839 Py_ssize_t size,
6840 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006843 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844}
6845
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006846/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006847static void
6848make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006849 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006850 PyObject *unicode,
6851 Py_ssize_t startpos, Py_ssize_t endpos,
6852 const char *reason)
6853{
6854 if (*exceptionObject == NULL) {
6855 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006856 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006857 encoding, unicode, startpos, endpos, reason);
6858 }
6859 else {
6860 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6861 goto onError;
6862 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6863 goto onError;
6864 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6865 goto onError;
6866 return;
6867 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006868 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006869 }
6870}
6871
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006873static void
6874raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006875 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006876 PyObject *unicode,
6877 Py_ssize_t startpos, Py_ssize_t endpos,
6878 const char *reason)
6879{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006880 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006881 encoding, unicode, startpos, endpos, reason);
6882 if (*exceptionObject != NULL)
6883 PyCodec_StrictErrors(*exceptionObject);
6884}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006885
6886/* error handling callback helper:
6887 build arguments, call the callback and check the arguments,
6888 put the result into newpos and return the replacement string, which
6889 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006890static PyObject *
6891unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006892 PyObject **errorHandler,
6893 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006894 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006895 Py_ssize_t startpos, Py_ssize_t endpos,
6896 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006897{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006898 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006899 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006900 PyObject *restuple;
6901 PyObject *resunicode;
6902
6903 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006905 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006907 }
6908
Benjamin Petersonbac79492012-01-14 13:34:47 -05006909 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006910 return NULL;
6911 len = PyUnicode_GET_LENGTH(unicode);
6912
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006913 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006914 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006915 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006916 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006917
Petr Viktorinffd97532020-02-11 17:46:57 +01006918 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006921 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006922 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 Py_DECREF(restuple);
6924 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006925 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006926 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 &resunicode, newpos)) {
6928 Py_DECREF(restuple);
6929 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006930 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006931 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6932 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6933 Py_DECREF(restuple);
6934 return NULL;
6935 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006936 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006937 *newpos = len + *newpos;
6938 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006939 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 Py_DECREF(restuple);
6941 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006942 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006943 Py_INCREF(resunicode);
6944 Py_DECREF(restuple);
6945 return resunicode;
6946}
6947
Alexander Belopolsky40018472011-02-26 01:02:56 +00006948static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006949unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006950 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006951 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006952{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006953 /* input state */
6954 Py_ssize_t pos=0, size;
6955 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006956 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006957 /* pointer into the output */
6958 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006959 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6960 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006961 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006962 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006963 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006964 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006965 /* output object */
6966 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006967
Benjamin Petersonbac79492012-01-14 13:34:47 -05006968 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006969 return NULL;
6970 size = PyUnicode_GET_LENGTH(unicode);
6971 kind = PyUnicode_KIND(unicode);
6972 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006973 /* allocate enough for a simple encoding without
6974 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006975 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006976 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006977
6978 _PyBytesWriter_Init(&writer);
6979 str = _PyBytesWriter_Alloc(&writer, size);
6980 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006981 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006982
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006983 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006984 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006985
Benjamin Peterson29060642009-01-31 22:14:21 +00006986 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006987 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006989 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006990 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006991 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006992 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006993 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006995 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006996 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006998
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006999 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007001
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007002 /* Only overallocate the buffer if it's not the last write */
7003 writer.overallocate = (collend < size);
7004
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007006 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007007 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007008
7009 switch (error_handler) {
7010 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007011 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007013
7014 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007015 memset(str, '?', collend - collstart);
7016 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007017 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007018 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007019 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 break;
Victor Stinner50149202015-09-22 00:26:54 +02007021
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007022 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007023 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007024 writer.min_size -= (collend - collstart);
7025 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007026 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007027 if (str == NULL)
7028 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007029 pos = collend;
7030 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007031
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007032 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007033 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007034 writer.min_size -= (collend - collstart);
7035 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007036 unicode, collstart, collend);
7037 if (str == NULL)
7038 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007039 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 break;
Victor Stinner50149202015-09-22 00:26:54 +02007041
Victor Stinnerc3713e92015-09-29 12:32:13 +02007042 case _Py_ERROR_SURROGATEESCAPE:
7043 for (i = collstart; i < collend; ++i) {
7044 ch = PyUnicode_READ(kind, data, i);
7045 if (ch < 0xdc80 || 0xdcff < ch) {
7046 /* Not a UTF-8b surrogate */
7047 break;
7048 }
7049 *str++ = (char)(ch - 0xdc00);
7050 ++pos;
7051 }
7052 if (i >= collend)
7053 break;
7054 collstart = pos;
7055 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007056 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007057
Benjamin Peterson29060642009-01-31 22:14:21 +00007058 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007059 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7060 encoding, reason, unicode, &exc,
7061 collstart, collend, &newpos);
7062 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007064
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007065 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007066 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007067
Victor Stinner6bd525b2015-10-09 13:10:05 +02007068 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007069 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007070 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007071 PyBytes_AS_STRING(rep),
7072 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007073 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007074 else {
7075 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007076
Victor Stinner6bd525b2015-10-09 13:10:05 +02007077 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007079
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007080 if (limit == 256 ?
7081 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7082 !PyUnicode_IS_ASCII(rep))
7083 {
7084 /* Not all characters are smaller than limit */
7085 raise_encode_exception(&exc, encoding, unicode,
7086 collstart, collend, reason);
7087 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007089 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7090 str = _PyBytesWriter_WriteBytes(&writer, str,
7091 PyUnicode_DATA(rep),
7092 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007093 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007094 if (str == NULL)
7095 goto onError;
7096
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007097 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007098 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007099 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007100
7101 /* If overallocation was disabled, ensure that it was the last
7102 write. Otherwise, we missed an optimization */
7103 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007104 }
7105 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007106
Victor Stinner50149202015-09-22 00:26:54 +02007107 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007108 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007109 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007110
7111 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007112 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007113 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007114 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007115 Py_XDECREF(exc);
7116 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007117}
7118
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007119/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007120PyObject *
7121PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007122 Py_ssize_t size,
7123 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007125 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007126 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007127 if (unicode == NULL)
7128 return NULL;
7129 result = unicode_encode_ucs1(unicode, errors, 256);
7130 Py_DECREF(unicode);
7131 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132}
7133
Alexander Belopolsky40018472011-02-26 01:02:56 +00007134PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007135_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136{
7137 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 PyErr_BadArgument();
7139 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007141 if (PyUnicode_READY(unicode) == -1)
7142 return NULL;
7143 /* Fast path: if it is a one-byte string, construct
7144 bytes object directly. */
7145 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7146 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7147 PyUnicode_GET_LENGTH(unicode));
7148 /* Non-Latin-1 characters present. Defer to above function to
7149 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007150 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007151}
7152
7153PyObject*
7154PyUnicode_AsLatin1String(PyObject *unicode)
7155{
7156 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157}
7158
7159/* --- 7-bit ASCII Codec -------------------------------------------------- */
7160
Alexander Belopolsky40018472011-02-26 01:02:56 +00007161PyObject *
7162PyUnicode_DecodeASCII(const char *s,
7163 Py_ssize_t size,
7164 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007166 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007167 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007168 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007169 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007170 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007171
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007173 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007174
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007176 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007177 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007178 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007179
Inada Naoki770847a2019-06-24 12:30:24 +09007180 // Shortcut for simple case
7181 PyObject *u = PyUnicode_New(size, 127);
7182 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007183 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007184 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007185 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007186 if (outpos == size) {
7187 return u;
7188 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007189
Inada Naoki770847a2019-06-24 12:30:24 +09007190 _PyUnicodeWriter writer;
7191 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007192 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007193
Inada Naoki770847a2019-06-24 12:30:24 +09007194 s += outpos;
7195 int kind = writer.kind;
7196 void *data = writer.data;
7197 Py_ssize_t startinpos, endinpos;
7198
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007200 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007202 PyUnicode_WRITE(kind, data, writer.pos, c);
7203 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007205 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007207
7208 /* byte outsize range 0x00..0x7f: call the error handler */
7209
7210 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007211 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007212
7213 switch (error_handler)
7214 {
7215 case _Py_ERROR_REPLACE:
7216 case _Py_ERROR_SURROGATEESCAPE:
7217 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007218 but we may switch to UCS2 at the first write */
7219 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7220 goto onError;
7221 kind = writer.kind;
7222 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007223
7224 if (error_handler == _Py_ERROR_REPLACE)
7225 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7226 else
7227 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7228 writer.pos++;
7229 ++s;
7230 break;
7231
7232 case _Py_ERROR_IGNORE:
7233 ++s;
7234 break;
7235
7236 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 startinpos = s-starts;
7238 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007239 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007240 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007241 "ascii", "ordinal not in range(128)",
7242 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007243 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007245 kind = writer.kind;
7246 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007249 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007250 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007251 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007252
Benjamin Peterson29060642009-01-31 22:14:21 +00007253 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007254 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007255 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007256 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 return NULL;
7258}
7259
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007260/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007261PyObject *
7262PyUnicode_EncodeASCII(const Py_UNICODE *p,
7263 Py_ssize_t size,
7264 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007266 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007267 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007268 if (unicode == NULL)
7269 return NULL;
7270 result = unicode_encode_ucs1(unicode, errors, 128);
7271 Py_DECREF(unicode);
7272 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273}
7274
Alexander Belopolsky40018472011-02-26 01:02:56 +00007275PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007276_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277{
7278 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 PyErr_BadArgument();
7280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007282 if (PyUnicode_READY(unicode) == -1)
7283 return NULL;
7284 /* Fast path: if it is an ASCII-only string, construct bytes object
7285 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007286 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007287 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7288 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007289 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007290}
7291
7292PyObject *
7293PyUnicode_AsASCIIString(PyObject *unicode)
7294{
7295 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296}
7297
Steve Dowercc16be82016-09-08 10:35:16 -07007298#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007299
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007300/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007301
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007302#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007303#define NEED_RETRY
7304#endif
7305
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7310#define DECODING_CHUNK_SIZE (INT_MAX/4)
7311
Victor Stinner3a50e702011-10-18 21:21:00 +02007312#ifndef WC_ERR_INVALID_CHARS
7313# define WC_ERR_INVALID_CHARS 0x0080
7314#endif
7315
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007316static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007317code_page_name(UINT code_page, PyObject **obj)
7318{
7319 *obj = NULL;
7320 if (code_page == CP_ACP)
7321 return "mbcs";
7322 if (code_page == CP_UTF7)
7323 return "CP_UTF7";
7324 if (code_page == CP_UTF8)
7325 return "CP_UTF8";
7326
7327 *obj = PyBytes_FromFormat("cp%u", code_page);
7328 if (*obj == NULL)
7329 return NULL;
7330 return PyBytes_AS_STRING(*obj);
7331}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007332
Victor Stinner3a50e702011-10-18 21:21:00 +02007333static DWORD
7334decode_code_page_flags(UINT code_page)
7335{
7336 if (code_page == CP_UTF7) {
7337 /* The CP_UTF7 decoder only supports flags=0 */
7338 return 0;
7339 }
7340 else
7341 return MB_ERR_INVALID_CHARS;
7342}
7343
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007344/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007345 * Decode a byte string from a Windows code page into unicode object in strict
7346 * mode.
7347 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007348 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7349 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007350 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007351static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007352decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007353 wchar_t **buf,
7354 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007355 const char *in,
7356 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007357{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007358 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007359 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007360 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007361
7362 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007363 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007364 while ((outsize = MultiByteToWideChar(code_page, flags,
7365 in, insize, NULL, 0)) <= 0)
7366 {
7367 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7368 goto error;
7369 }
7370 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7371 flags = 0;
7372 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007373
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007374 /* Extend a wchar_t* buffer */
7375 Py_ssize_t n = *bufsize; /* Get the current length */
7376 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7377 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007378 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007379 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007380
7381 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007382 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7383 if (outsize <= 0)
7384 goto error;
7385 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007386
Victor Stinner3a50e702011-10-18 21:21:00 +02007387error:
7388 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7389 return -2;
7390 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007391 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007392}
7393
Victor Stinner3a50e702011-10-18 21:21:00 +02007394/*
7395 * Decode a byte string from a code page into unicode object with an error
7396 * handler.
7397 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007398 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 * UnicodeDecodeError exception and returns -1 on error.
7400 */
7401static int
7402decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007403 wchar_t **buf,
7404 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007405 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007406 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007407{
7408 const char *startin = in;
7409 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007410 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007411 /* Ideally, we should get reason from FormatMessage. This is the Windows
7412 2000 English version of the message. */
7413 const char *reason = "No mapping for the Unicode character exists "
7414 "in the target code page.";
7415 /* each step cannot decode more than 1 character, but a character can be
7416 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007417 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007418 int insize;
7419 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007420 PyObject *errorHandler = NULL;
7421 PyObject *exc = NULL;
7422 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007423 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 DWORD err;
7425 int ret = -1;
7426
7427 assert(size > 0);
7428
7429 encoding = code_page_name(code_page, &encoding_obj);
7430 if (encoding == NULL)
7431 return -1;
7432
Victor Stinner7d00cc12014-03-17 23:08:06 +01007433 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7435 UnicodeDecodeError. */
7436 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7437 if (exc != NULL) {
7438 PyCodec_StrictErrors(exc);
7439 Py_CLEAR(exc);
7440 }
7441 goto error;
7442 }
7443
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007444 /* Extend a wchar_t* buffer */
7445 Py_ssize_t n = *bufsize; /* Get the current length */
7446 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7447 PyErr_NoMemory();
7448 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007450 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7451 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007453 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007454
7455 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 while (in < endin)
7457 {
7458 /* Decode a character */
7459 insize = 1;
7460 do
7461 {
7462 outsize = MultiByteToWideChar(code_page, flags,
7463 in, insize,
7464 buffer, Py_ARRAY_LENGTH(buffer));
7465 if (outsize > 0)
7466 break;
7467 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007468 if (err == ERROR_INVALID_FLAGS && flags) {
7469 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7470 flags = 0;
7471 continue;
7472 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 if (err != ERROR_NO_UNICODE_TRANSLATION
7474 && err != ERROR_INSUFFICIENT_BUFFER)
7475 {
7476 PyErr_SetFromWindowsErr(0);
7477 goto error;
7478 }
7479 insize++;
7480 }
7481 /* 4=maximum length of a UTF-8 sequence */
7482 while (insize <= 4 && (in + insize) <= endin);
7483
7484 if (outsize <= 0) {
7485 Py_ssize_t startinpos, endinpos, outpos;
7486
Victor Stinner7d00cc12014-03-17 23:08:06 +01007487 /* last character in partial decode? */
7488 if (in + insize >= endin && !final)
7489 break;
7490
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 startinpos = in - startin;
7492 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007493 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007494 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 errors, &errorHandler,
7496 encoding, reason,
7497 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007498 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 {
7500 goto error;
7501 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007502 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 }
7504 else {
7505 in += insize;
7506 memcpy(out, buffer, outsize * sizeof(wchar_t));
7507 out += outsize;
7508 }
7509 }
7510
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007511 /* Shrink the buffer */
7512 assert(out - *buf <= *bufsize);
7513 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007514 /* (in - startin) <= size and size is an int */
7515 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007516
7517error:
7518 Py_XDECREF(encoding_obj);
7519 Py_XDECREF(errorHandler);
7520 Py_XDECREF(exc);
7521 return ret;
7522}
7523
Victor Stinner3a50e702011-10-18 21:21:00 +02007524static PyObject *
7525decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007526 const char *s, Py_ssize_t size,
7527 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007528{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007529 wchar_t *buf = NULL;
7530 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007531 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007532
Victor Stinner3a50e702011-10-18 21:21:00 +02007533 if (code_page < 0) {
7534 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7535 return NULL;
7536 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007537 if (size < 0) {
7538 PyErr_BadInternalCall();
7539 return NULL;
7540 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007541
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007542 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007544
Victor Stinner76a31a62011-11-04 00:05:13 +01007545 do
7546 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007547#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007548 if (size > DECODING_CHUNK_SIZE) {
7549 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007550 final = 0;
7551 done = 0;
7552 }
7553 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007554#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007555 {
7556 chunk_size = (int)size;
7557 final = (consumed == NULL);
7558 done = 1;
7559 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007560
Victor Stinner76a31a62011-11-04 00:05:13 +01007561 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007562 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007563 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007564 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007565 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007566
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007567 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007568 s, chunk_size);
7569 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007570 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007571 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007572 errors, final);
7573 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007574
7575 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007576 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007577 return NULL;
7578 }
7579
7580 if (consumed)
7581 *consumed += converted;
7582
7583 s += converted;
7584 size -= converted;
7585 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007586
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007587 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7588 PyMem_Free(buf);
7589 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007590}
7591
Alexander Belopolsky40018472011-02-26 01:02:56 +00007592PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007593PyUnicode_DecodeCodePageStateful(int code_page,
7594 const char *s,
7595 Py_ssize_t size,
7596 const char *errors,
7597 Py_ssize_t *consumed)
7598{
7599 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7600}
7601
7602PyObject *
7603PyUnicode_DecodeMBCSStateful(const char *s,
7604 Py_ssize_t size,
7605 const char *errors,
7606 Py_ssize_t *consumed)
7607{
7608 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7609}
7610
7611PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007612PyUnicode_DecodeMBCS(const char *s,
7613 Py_ssize_t size,
7614 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007615{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007616 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7617}
7618
Victor Stinner3a50e702011-10-18 21:21:00 +02007619static DWORD
7620encode_code_page_flags(UINT code_page, const char *errors)
7621{
7622 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007623 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 }
7625 else if (code_page == CP_UTF7) {
7626 /* CP_UTF7 only supports flags=0 */
7627 return 0;
7628 }
7629 else {
7630 if (errors != NULL && strcmp(errors, "replace") == 0)
7631 return 0;
7632 else
7633 return WC_NO_BEST_FIT_CHARS;
7634 }
7635}
7636
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007637/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007638 * Encode a Unicode string to a Windows code page into a byte string in strict
7639 * mode.
7640 *
7641 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007642 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007643 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007644static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007645encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007646 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007648{
Victor Stinner554f3f02010-06-16 23:33:54 +00007649 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007650 BOOL *pusedDefaultChar = &usedDefaultChar;
7651 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007652 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007653 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007654 const DWORD flags = encode_code_page_flags(code_page, NULL);
7655 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007656 /* Create a substring so that we can get the UTF-16 representation
7657 of just the slice under consideration. */
7658 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007659
Martin v. Löwis3d325192011-11-04 18:23:06 +01007660 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007661
Victor Stinner3a50e702011-10-18 21:21:00 +02007662 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007663 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007664 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007665 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007666
Victor Stinner2fc507f2011-11-04 20:06:39 +01007667 substring = PyUnicode_Substring(unicode, offset, offset+len);
7668 if (substring == NULL)
7669 return -1;
7670 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7671 if (p == NULL) {
7672 Py_DECREF(substring);
7673 return -1;
7674 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007675 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007676
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007677 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007678 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007679 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007680 NULL, 0,
7681 NULL, pusedDefaultChar);
7682 if (outsize <= 0)
7683 goto error;
7684 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007685 if (pusedDefaultChar && *pusedDefaultChar) {
7686 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007687 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007688 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007689
Victor Stinner3a50e702011-10-18 21:21:00 +02007690 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007691 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007692 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007693 if (*outbytes == NULL) {
7694 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007695 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007696 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007697 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007698 }
7699 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007700 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007701 const Py_ssize_t n = PyBytes_Size(*outbytes);
7702 if (outsize > PY_SSIZE_T_MAX - n) {
7703 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007704 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007706 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007707 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7708 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007709 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007710 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007711 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007712 }
7713
7714 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007715 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007716 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007717 out, outsize,
7718 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007719 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007720 if (outsize <= 0)
7721 goto error;
7722 if (pusedDefaultChar && *pusedDefaultChar)
7723 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007724 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007725
Victor Stinner3a50e702011-10-18 21:21:00 +02007726error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007727 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007728 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7729 return -2;
7730 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007731 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007732}
7733
Victor Stinner3a50e702011-10-18 21:21:00 +02007734/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007735 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007736 * error handler.
7737 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007738 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007739 * -1 on other error.
7740 */
7741static int
7742encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007743 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007744 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007745{
Victor Stinner3a50e702011-10-18 21:21:00 +02007746 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007747 Py_ssize_t pos = unicode_offset;
7748 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007749 /* Ideally, we should get reason from FormatMessage. This is the Windows
7750 2000 English version of the message. */
7751 const char *reason = "invalid character";
7752 /* 4=maximum length of a UTF-8 sequence */
7753 char buffer[4];
7754 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7755 Py_ssize_t outsize;
7756 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007757 PyObject *errorHandler = NULL;
7758 PyObject *exc = NULL;
7759 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007760 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007761 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007762 PyObject *rep;
7763 int ret = -1;
7764
7765 assert(insize > 0);
7766
7767 encoding = code_page_name(code_page, &encoding_obj);
7768 if (encoding == NULL)
7769 return -1;
7770
7771 if (errors == NULL || strcmp(errors, "strict") == 0) {
7772 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7773 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007774 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007775 if (exc != NULL) {
7776 PyCodec_StrictErrors(exc);
7777 Py_DECREF(exc);
7778 }
7779 Py_XDECREF(encoding_obj);
7780 return -1;
7781 }
7782
7783 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7784 pusedDefaultChar = &usedDefaultChar;
7785 else
7786 pusedDefaultChar = NULL;
7787
7788 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7789 PyErr_NoMemory();
7790 goto error;
7791 }
7792 outsize = insize * Py_ARRAY_LENGTH(buffer);
7793
7794 if (*outbytes == NULL) {
7795 /* Create string object */
7796 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7797 if (*outbytes == NULL)
7798 goto error;
7799 out = PyBytes_AS_STRING(*outbytes);
7800 }
7801 else {
7802 /* Extend string object */
7803 Py_ssize_t n = PyBytes_Size(*outbytes);
7804 if (n > PY_SSIZE_T_MAX - outsize) {
7805 PyErr_NoMemory();
7806 goto error;
7807 }
7808 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7809 goto error;
7810 out = PyBytes_AS_STRING(*outbytes) + n;
7811 }
7812
7813 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007814 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007815 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007816 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7817 wchar_t chars[2];
7818 int charsize;
7819 if (ch < 0x10000) {
7820 chars[0] = (wchar_t)ch;
7821 charsize = 1;
7822 }
7823 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007824 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7825 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007826 charsize = 2;
7827 }
7828
Victor Stinner3a50e702011-10-18 21:21:00 +02007829 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007830 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007831 buffer, Py_ARRAY_LENGTH(buffer),
7832 NULL, pusedDefaultChar);
7833 if (outsize > 0) {
7834 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7835 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007836 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007837 memcpy(out, buffer, outsize);
7838 out += outsize;
7839 continue;
7840 }
7841 }
7842 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7843 PyErr_SetFromWindowsErr(0);
7844 goto error;
7845 }
7846
Victor Stinner3a50e702011-10-18 21:21:00 +02007847 rep = unicode_encode_call_errorhandler(
7848 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007849 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007850 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007851 if (rep == NULL)
7852 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007853 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007854
7855 if (PyBytes_Check(rep)) {
7856 outsize = PyBytes_GET_SIZE(rep);
7857 if (outsize != 1) {
7858 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7859 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7860 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7861 Py_DECREF(rep);
7862 goto error;
7863 }
7864 out = PyBytes_AS_STRING(*outbytes) + offset;
7865 }
7866 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7867 out += outsize;
7868 }
7869 else {
7870 Py_ssize_t i;
7871 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007872 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007873
Benjamin Petersonbac79492012-01-14 13:34:47 -05007874 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007875 Py_DECREF(rep);
7876 goto error;
7877 }
7878
7879 outsize = PyUnicode_GET_LENGTH(rep);
7880 if (outsize != 1) {
7881 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7882 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7883 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7884 Py_DECREF(rep);
7885 goto error;
7886 }
7887 out = PyBytes_AS_STRING(*outbytes) + offset;
7888 }
7889 kind = PyUnicode_KIND(rep);
7890 data = PyUnicode_DATA(rep);
7891 for (i=0; i < outsize; i++) {
7892 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7893 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007894 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007895 encoding, unicode,
7896 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007897 "unable to encode error handler result to ASCII");
7898 Py_DECREF(rep);
7899 goto error;
7900 }
7901 *out = (unsigned char)ch;
7902 out++;
7903 }
7904 }
7905 Py_DECREF(rep);
7906 }
7907 /* write a NUL byte */
7908 *out = 0;
7909 outsize = out - PyBytes_AS_STRING(*outbytes);
7910 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7911 if (_PyBytes_Resize(outbytes, outsize) < 0)
7912 goto error;
7913 ret = 0;
7914
7915error:
7916 Py_XDECREF(encoding_obj);
7917 Py_XDECREF(errorHandler);
7918 Py_XDECREF(exc);
7919 return ret;
7920}
7921
Victor Stinner3a50e702011-10-18 21:21:00 +02007922static PyObject *
7923encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007924 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007925 const char *errors)
7926{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007927 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007928 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007929 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007930 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007931
Victor Stinner29dacf22015-01-26 16:41:32 +01007932 if (!PyUnicode_Check(unicode)) {
7933 PyErr_BadArgument();
7934 return NULL;
7935 }
7936
Benjamin Petersonbac79492012-01-14 13:34:47 -05007937 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007938 return NULL;
7939 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007940
Victor Stinner3a50e702011-10-18 21:21:00 +02007941 if (code_page < 0) {
7942 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7943 return NULL;
7944 }
7945
Martin v. Löwis3d325192011-11-04 18:23:06 +01007946 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007947 return PyBytes_FromStringAndSize(NULL, 0);
7948
Victor Stinner7581cef2011-11-03 22:32:33 +01007949 offset = 0;
7950 do
7951 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007952#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007953 if (len > DECODING_CHUNK_SIZE) {
7954 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007955 done = 0;
7956 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007957 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007958#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007959 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007960 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007961 done = 1;
7962 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007963
Victor Stinner76a31a62011-11-04 00:05:13 +01007964 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007965 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007966 errors);
7967 if (ret == -2)
7968 ret = encode_code_page_errors(code_page, &outbytes,
7969 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007970 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007971 if (ret < 0) {
7972 Py_XDECREF(outbytes);
7973 return NULL;
7974 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007975
Victor Stinner7581cef2011-11-03 22:32:33 +01007976 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007977 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007978 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007979
Victor Stinner3a50e702011-10-18 21:21:00 +02007980 return outbytes;
7981}
7982
7983PyObject *
7984PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7985 Py_ssize_t size,
7986 const char *errors)
7987{
Victor Stinner7581cef2011-11-03 22:32:33 +01007988 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007989 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007990 if (unicode == NULL)
7991 return NULL;
7992 res = encode_code_page(CP_ACP, unicode, errors);
7993 Py_DECREF(unicode);
7994 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007995}
7996
7997PyObject *
7998PyUnicode_EncodeCodePage(int code_page,
7999 PyObject *unicode,
8000 const char *errors)
8001{
Victor Stinner7581cef2011-11-03 22:32:33 +01008002 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008003}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008004
Alexander Belopolsky40018472011-02-26 01:02:56 +00008005PyObject *
8006PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008007{
Victor Stinner7581cef2011-11-03 22:32:33 +01008008 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008009}
8010
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008011#undef NEED_RETRY
8012
Steve Dowercc16be82016-09-08 10:35:16 -07008013#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008014
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015/* --- Character Mapping Codec -------------------------------------------- */
8016
Victor Stinnerfb161b12013-04-18 01:44:27 +02008017static int
8018charmap_decode_string(const char *s,
8019 Py_ssize_t size,
8020 PyObject *mapping,
8021 const char *errors,
8022 _PyUnicodeWriter *writer)
8023{
8024 const char *starts = s;
8025 const char *e;
8026 Py_ssize_t startinpos, endinpos;
8027 PyObject *errorHandler = NULL, *exc = NULL;
8028 Py_ssize_t maplen;
8029 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008030 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008031 Py_UCS4 x;
8032 unsigned char ch;
8033
8034 if (PyUnicode_READY(mapping) == -1)
8035 return -1;
8036
8037 maplen = PyUnicode_GET_LENGTH(mapping);
8038 mapdata = PyUnicode_DATA(mapping);
8039 mapkind = PyUnicode_KIND(mapping);
8040
8041 e = s + size;
8042
8043 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8044 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8045 * is disabled in encoding aliases, latin1 is preferred because
8046 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008047 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008048 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8049 Py_UCS4 maxchar = writer->maxchar;
8050
8051 assert (writer->kind == PyUnicode_1BYTE_KIND);
8052 while (s < e) {
8053 ch = *s;
8054 x = mapdata_ucs1[ch];
8055 if (x > maxchar) {
8056 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8057 goto onError;
8058 maxchar = writer->maxchar;
8059 outdata = (Py_UCS1 *)writer->data;
8060 }
8061 outdata[writer->pos] = x;
8062 writer->pos++;
8063 ++s;
8064 }
8065 return 0;
8066 }
8067
8068 while (s < e) {
8069 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8070 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008071 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008072 if (outkind == PyUnicode_1BYTE_KIND) {
8073 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8074 Py_UCS4 maxchar = writer->maxchar;
8075 while (s < e) {
8076 ch = *s;
8077 x = mapdata_ucs2[ch];
8078 if (x > maxchar)
8079 goto Error;
8080 outdata[writer->pos] = x;
8081 writer->pos++;
8082 ++s;
8083 }
8084 break;
8085 }
8086 else if (outkind == PyUnicode_2BYTE_KIND) {
8087 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8088 while (s < e) {
8089 ch = *s;
8090 x = mapdata_ucs2[ch];
8091 if (x == 0xFFFE)
8092 goto Error;
8093 outdata[writer->pos] = x;
8094 writer->pos++;
8095 ++s;
8096 }
8097 break;
8098 }
8099 }
8100 ch = *s;
8101
8102 if (ch < maplen)
8103 x = PyUnicode_READ(mapkind, mapdata, ch);
8104 else
8105 x = 0xfffe; /* invalid value */
8106Error:
8107 if (x == 0xfffe)
8108 {
8109 /* undefined mapping */
8110 startinpos = s-starts;
8111 endinpos = startinpos+1;
8112 if (unicode_decode_call_errorhandler_writer(
8113 errors, &errorHandler,
8114 "charmap", "character maps to <undefined>",
8115 &starts, &e, &startinpos, &endinpos, &exc, &s,
8116 writer)) {
8117 goto onError;
8118 }
8119 continue;
8120 }
8121
8122 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8123 goto onError;
8124 ++s;
8125 }
8126 Py_XDECREF(errorHandler);
8127 Py_XDECREF(exc);
8128 return 0;
8129
8130onError:
8131 Py_XDECREF(errorHandler);
8132 Py_XDECREF(exc);
8133 return -1;
8134}
8135
8136static int
8137charmap_decode_mapping(const char *s,
8138 Py_ssize_t size,
8139 PyObject *mapping,
8140 const char *errors,
8141 _PyUnicodeWriter *writer)
8142{
8143 const char *starts = s;
8144 const char *e;
8145 Py_ssize_t startinpos, endinpos;
8146 PyObject *errorHandler = NULL, *exc = NULL;
8147 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008148 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008149
8150 e = s + size;
8151
8152 while (s < e) {
8153 ch = *s;
8154
8155 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8156 key = PyLong_FromLong((long)ch);
8157 if (key == NULL)
8158 goto onError;
8159
8160 item = PyObject_GetItem(mapping, key);
8161 Py_DECREF(key);
8162 if (item == NULL) {
8163 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8164 /* No mapping found means: mapping is undefined. */
8165 PyErr_Clear();
8166 goto Undefined;
8167 } else
8168 goto onError;
8169 }
8170
8171 /* Apply mapping */
8172 if (item == Py_None)
8173 goto Undefined;
8174 if (PyLong_Check(item)) {
8175 long value = PyLong_AS_LONG(item);
8176 if (value == 0xFFFE)
8177 goto Undefined;
8178 if (value < 0 || value > MAX_UNICODE) {
8179 PyErr_Format(PyExc_TypeError,
8180 "character mapping must be in range(0x%lx)",
8181 (unsigned long)MAX_UNICODE + 1);
8182 goto onError;
8183 }
8184
8185 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8186 goto onError;
8187 }
8188 else if (PyUnicode_Check(item)) {
8189 if (PyUnicode_READY(item) == -1)
8190 goto onError;
8191 if (PyUnicode_GET_LENGTH(item) == 1) {
8192 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8193 if (value == 0xFFFE)
8194 goto Undefined;
8195 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8196 goto onError;
8197 }
8198 else {
8199 writer->overallocate = 1;
8200 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8201 goto onError;
8202 }
8203 }
8204 else {
8205 /* wrong return value */
8206 PyErr_SetString(PyExc_TypeError,
8207 "character mapping must return integer, None or str");
8208 goto onError;
8209 }
8210 Py_CLEAR(item);
8211 ++s;
8212 continue;
8213
8214Undefined:
8215 /* undefined mapping */
8216 Py_CLEAR(item);
8217 startinpos = s-starts;
8218 endinpos = startinpos+1;
8219 if (unicode_decode_call_errorhandler_writer(
8220 errors, &errorHandler,
8221 "charmap", "character maps to <undefined>",
8222 &starts, &e, &startinpos, &endinpos, &exc, &s,
8223 writer)) {
8224 goto onError;
8225 }
8226 }
8227 Py_XDECREF(errorHandler);
8228 Py_XDECREF(exc);
8229 return 0;
8230
8231onError:
8232 Py_XDECREF(item);
8233 Py_XDECREF(errorHandler);
8234 Py_XDECREF(exc);
8235 return -1;
8236}
8237
Alexander Belopolsky40018472011-02-26 01:02:56 +00008238PyObject *
8239PyUnicode_DecodeCharmap(const char *s,
8240 Py_ssize_t size,
8241 PyObject *mapping,
8242 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008244 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008245
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 /* Default to Latin-1 */
8247 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008251 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008252 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008253 writer.min_length = size;
8254 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008256
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008257 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008258 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8259 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008260 }
8261 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008262 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8263 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008265 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008266
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008268 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 return NULL;
8270}
8271
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008272/* Charmap encoding: the lookup table */
8273
Alexander Belopolsky40018472011-02-26 01:02:56 +00008274struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 PyObject_HEAD
8276 unsigned char level1[32];
8277 int count2, count3;
8278 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008279};
8280
8281static PyObject*
8282encoding_map_size(PyObject *obj, PyObject* args)
8283{
8284 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008285 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008287}
8288
8289static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008290 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 PyDoc_STR("Return the size (in bytes) of this object") },
8292 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008293};
8294
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008295static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008296 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 "EncodingMap", /*tp_name*/
8298 sizeof(struct encoding_map), /*tp_basicsize*/
8299 0, /*tp_itemsize*/
8300 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008301 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008302 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 0, /*tp_getattr*/
8304 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008305 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 0, /*tp_repr*/
8307 0, /*tp_as_number*/
8308 0, /*tp_as_sequence*/
8309 0, /*tp_as_mapping*/
8310 0, /*tp_hash*/
8311 0, /*tp_call*/
8312 0, /*tp_str*/
8313 0, /*tp_getattro*/
8314 0, /*tp_setattro*/
8315 0, /*tp_as_buffer*/
8316 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8317 0, /*tp_doc*/
8318 0, /*tp_traverse*/
8319 0, /*tp_clear*/
8320 0, /*tp_richcompare*/
8321 0, /*tp_weaklistoffset*/
8322 0, /*tp_iter*/
8323 0, /*tp_iternext*/
8324 encoding_map_methods, /*tp_methods*/
8325 0, /*tp_members*/
8326 0, /*tp_getset*/
8327 0, /*tp_base*/
8328 0, /*tp_dict*/
8329 0, /*tp_descr_get*/
8330 0, /*tp_descr_set*/
8331 0, /*tp_dictoffset*/
8332 0, /*tp_init*/
8333 0, /*tp_alloc*/
8334 0, /*tp_new*/
8335 0, /*tp_free*/
8336 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337};
8338
8339PyObject*
8340PyUnicode_BuildEncodingMap(PyObject* string)
8341{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342 PyObject *result;
8343 struct encoding_map *mresult;
8344 int i;
8345 int need_dict = 0;
8346 unsigned char level1[32];
8347 unsigned char level2[512];
8348 unsigned char *mlevel1, *mlevel2, *mlevel3;
8349 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008350 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008351 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008352 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008354
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008355 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008356 PyErr_BadArgument();
8357 return NULL;
8358 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008359 kind = PyUnicode_KIND(string);
8360 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008361 length = PyUnicode_GET_LENGTH(string);
8362 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008363 memset(level1, 0xFF, sizeof level1);
8364 memset(level2, 0xFF, sizeof level2);
8365
8366 /* If there isn't a one-to-one mapping of NULL to \0,
8367 or if there are non-BMP characters, we need to use
8368 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008369 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008370 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008371 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008372 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373 ch = PyUnicode_READ(kind, data, i);
8374 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008375 need_dict = 1;
8376 break;
8377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008379 /* unmapped character */
8380 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 l1 = ch >> 11;
8382 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008383 if (level1[l1] == 0xFF)
8384 level1[l1] = count2++;
8385 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008386 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008387 }
8388
8389 if (count2 >= 0xFF || count3 >= 0xFF)
8390 need_dict = 1;
8391
8392 if (need_dict) {
8393 PyObject *result = PyDict_New();
8394 PyObject *key, *value;
8395 if (!result)
8396 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008397 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008399 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008400 if (!key || !value)
8401 goto failed1;
8402 if (PyDict_SetItem(result, key, value) == -1)
8403 goto failed1;
8404 Py_DECREF(key);
8405 Py_DECREF(value);
8406 }
8407 return result;
8408 failed1:
8409 Py_XDECREF(key);
8410 Py_XDECREF(value);
8411 Py_DECREF(result);
8412 return NULL;
8413 }
8414
8415 /* Create a three-level trie */
8416 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8417 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008418 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008419 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008420 }
8421
8422 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008423 mresult = (struct encoding_map*)result;
8424 mresult->count2 = count2;
8425 mresult->count3 = count3;
8426 mlevel1 = mresult->level1;
8427 mlevel2 = mresult->level23;
8428 mlevel3 = mresult->level23 + 16*count2;
8429 memcpy(mlevel1, level1, 32);
8430 memset(mlevel2, 0xFF, 16*count2);
8431 memset(mlevel3, 0, 128*count3);
8432 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008433 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008434 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008435 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8436 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008437 /* unmapped character */
8438 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008439 o1 = ch>>11;
8440 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008441 i2 = 16*mlevel1[o1] + o2;
8442 if (mlevel2[i2] == 0xFF)
8443 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008444 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008445 i3 = 128*mlevel2[i2] + o3;
8446 mlevel3[i3] = i;
8447 }
8448 return result;
8449}
8450
8451static int
Victor Stinner22168992011-11-20 17:09:18 +01008452encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008453{
8454 struct encoding_map *map = (struct encoding_map*)mapping;
8455 int l1 = c>>11;
8456 int l2 = (c>>7) & 0xF;
8457 int l3 = c & 0x7F;
8458 int i;
8459
Victor Stinner22168992011-11-20 17:09:18 +01008460 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008462 if (c == 0)
8463 return 0;
8464 /* level 1*/
8465 i = map->level1[l1];
8466 if (i == 0xFF) {
8467 return -1;
8468 }
8469 /* level 2*/
8470 i = map->level23[16*i+l2];
8471 if (i == 0xFF) {
8472 return -1;
8473 }
8474 /* level 3 */
8475 i = map->level23[16*map->count2 + 128*i + l3];
8476 if (i == 0) {
8477 return -1;
8478 }
8479 return i;
8480}
8481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482/* Lookup the character ch in the mapping. If the character
8483 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008484 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008485static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008486charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487{
Christian Heimes217cfd12007-12-02 14:31:20 +00008488 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 PyObject *x;
8490
8491 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493 x = PyObject_GetItem(mapping, w);
8494 Py_DECREF(w);
8495 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8497 /* No mapping found means: mapping is undefined. */
8498 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008499 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 } else
8501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008503 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008505 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 long value = PyLong_AS_LONG(x);
8507 if (value < 0 || value > 255) {
8508 PyErr_SetString(PyExc_TypeError,
8509 "character mapping must be in range(256)");
8510 Py_DECREF(x);
8511 return NULL;
8512 }
8513 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008514 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008515 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 /* wrong return value */
8519 PyErr_Format(PyExc_TypeError,
8520 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008521 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 Py_DECREF(x);
8523 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524 }
8525}
8526
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008527static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008528charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008529{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008530 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8531 /* exponentially overallocate to minimize reallocations */
8532 if (requiredsize < 2*outsize)
8533 requiredsize = 2*outsize;
8534 if (_PyBytes_Resize(outobj, requiredsize))
8535 return -1;
8536 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008537}
8538
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008542/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008543 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544 space is available. Return a new reference to the object that
8545 was put in the output buffer, or Py_None, if the mapping was undefined
8546 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008547 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008548static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008549charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008550 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008552 PyObject *rep;
8553 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008554 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555
Andy Lesterdffe4c02020-03-04 07:15:20 -06008556 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008557 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008559 if (res == -1)
8560 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 if (outsize<requiredsize)
8562 if (charmapencode_resize(outobj, outpos, requiredsize))
8563 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008564 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 outstart[(*outpos)++] = (char)res;
8566 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008567 }
8568
8569 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008572 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 Py_DECREF(rep);
8574 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008575 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 if (PyLong_Check(rep)) {
8577 Py_ssize_t requiredsize = *outpos+1;
8578 if (outsize<requiredsize)
8579 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8580 Py_DECREF(rep);
8581 return enc_EXCEPTION;
8582 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008583 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008585 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 else {
8587 const char *repchars = PyBytes_AS_STRING(rep);
8588 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8589 Py_ssize_t requiredsize = *outpos+repsize;
8590 if (outsize<requiredsize)
8591 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8592 Py_DECREF(rep);
8593 return enc_EXCEPTION;
8594 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008595 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 memcpy(outstart + *outpos, repchars, repsize);
8597 *outpos += repsize;
8598 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008600 Py_DECREF(rep);
8601 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008602}
8603
8604/* handle an error in PyUnicode_EncodeCharmap
8605 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008606static int
8607charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008608 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008609 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008610 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008611 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008612{
8613 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008614 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008615 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008616 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008617 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008618 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008619 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008620 Py_ssize_t collstartpos = *inpos;
8621 Py_ssize_t collendpos = *inpos+1;
8622 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008623 const char *encoding = "charmap";
8624 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008625 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008626 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008627 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628
Benjamin Petersonbac79492012-01-14 13:34:47 -05008629 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008630 return -1;
8631 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008632 /* find all unencodable characters */
8633 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008634 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008635 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008636 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008637 val = encoding_map_lookup(ch, mapping);
8638 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 break;
8640 ++collendpos;
8641 continue;
8642 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008643
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008644 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8645 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 if (rep==NULL)
8647 return -1;
8648 else if (rep!=Py_None) {
8649 Py_DECREF(rep);
8650 break;
8651 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008652 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654 }
8655 /* cache callback name lookup
8656 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008657 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008658 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008659
8660 switch (*error_handler) {
8661 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008662 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008663 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008664
8665 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008666 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 x = charmapencode_output('?', mapping, res, respos);
8668 if (x==enc_EXCEPTION) {
8669 return -1;
8670 }
8671 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008672 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 return -1;
8674 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008675 }
8676 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008677 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008678 *inpos = collendpos;
8679 break;
Victor Stinner50149202015-09-22 00:26:54 +02008680
8681 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008682 /* generate replacement (temporarily (mis)uses p) */
8683 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 char buffer[2+29+1+1];
8685 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008686 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 for (cp = buffer; *cp; ++cp) {
8688 x = charmapencode_output(*cp, mapping, res, respos);
8689 if (x==enc_EXCEPTION)
8690 return -1;
8691 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008692 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 return -1;
8694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008695 }
8696 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008697 *inpos = collendpos;
8698 break;
Victor Stinner50149202015-09-22 00:26:54 +02008699
Benjamin Peterson14339b62009-01-31 16:36:08 +00008700 default:
Victor Stinner50149202015-09-22 00:26:54 +02008701 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008702 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008704 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008706 if (PyBytes_Check(repunicode)) {
8707 /* Directly copy bytes result to output. */
8708 Py_ssize_t outsize = PyBytes_Size(*res);
8709 Py_ssize_t requiredsize;
8710 repsize = PyBytes_Size(repunicode);
8711 requiredsize = *respos + repsize;
8712 if (requiredsize > outsize)
8713 /* Make room for all additional bytes. */
8714 if (charmapencode_resize(res, respos, requiredsize)) {
8715 Py_DECREF(repunicode);
8716 return -1;
8717 }
8718 memcpy(PyBytes_AsString(*res) + *respos,
8719 PyBytes_AsString(repunicode), repsize);
8720 *respos += repsize;
8721 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008722 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008723 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008724 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008725 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008726 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008727 Py_DECREF(repunicode);
8728 return -1;
8729 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008730 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008731 data = PyUnicode_DATA(repunicode);
8732 kind = PyUnicode_KIND(repunicode);
8733 for (index = 0; index < repsize; index++) {
8734 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8735 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008737 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 return -1;
8739 }
8740 else if (x==enc_FAILED) {
8741 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008742 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 return -1;
8744 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008745 }
8746 *inpos = newpos;
8747 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008748 }
8749 return 0;
8750}
8751
Alexander Belopolsky40018472011-02-26 01:02:56 +00008752PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008753_PyUnicode_EncodeCharmap(PyObject *unicode,
8754 PyObject *mapping,
8755 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757 /* output object */
8758 PyObject *res = NULL;
8759 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008760 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008761 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008762 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008763 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008764 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008765 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008766 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008767 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008768 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769
Benjamin Petersonbac79492012-01-14 13:34:47 -05008770 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008771 return NULL;
8772 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008773 data = PyUnicode_DATA(unicode);
8774 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008775
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776 /* Default to Latin-1 */
8777 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008778 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008780 /* allocate enough for a simple encoding without
8781 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008782 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783 if (res == NULL)
8784 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008785 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008786 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008788 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008789 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008791 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 if (x==enc_EXCEPTION) /* error */
8793 goto onError;
8794 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008795 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008797 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 &res, &respos)) {
8799 goto onError;
8800 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008801 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 else
8803 /* done with this character => adjust input position */
8804 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008806
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008807 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008808 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008809 if (_PyBytes_Resize(&res, respos) < 0)
8810 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008811
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008812 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008813 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008814 return res;
8815
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008817 Py_XDECREF(res);
8818 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008819 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008820 return NULL;
8821}
8822
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008823/* Deprecated */
8824PyObject *
8825PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8826 Py_ssize_t size,
8827 PyObject *mapping,
8828 const char *errors)
8829{
8830 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008831 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008832 if (unicode == NULL)
8833 return NULL;
8834 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8835 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008836 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008837}
8838
Alexander Belopolsky40018472011-02-26 01:02:56 +00008839PyObject *
8840PyUnicode_AsCharmapString(PyObject *unicode,
8841 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842{
8843 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 PyErr_BadArgument();
8845 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008847 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848}
8849
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008850/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008851static void
8852make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008854 Py_ssize_t startpos, Py_ssize_t endpos,
8855 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008857 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 *exceptionObject = _PyUnicodeTranslateError_Create(
8859 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860 }
8861 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008862 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8863 goto onError;
8864 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8865 goto onError;
8866 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8867 goto onError;
8868 return;
8869 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008870 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 }
8872}
8873
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008874/* error handling callback helper:
8875 build arguments, call the callback and check the arguments,
8876 put the result into newpos and return the replacement string, which
8877 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008878static PyObject *
8879unicode_translate_call_errorhandler(const char *errors,
8880 PyObject **errorHandler,
8881 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008883 Py_ssize_t startpos, Py_ssize_t endpos,
8884 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008885{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008886 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008887
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008888 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008889 PyObject *restuple;
8890 PyObject *resunicode;
8891
8892 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008894 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008896 }
8897
8898 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008900 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008902
Petr Viktorinffd97532020-02-11 17:46:57 +01008903 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008904 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008905 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008906 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008907 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 Py_DECREF(restuple);
8909 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008910 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008911 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 &resunicode, &i_newpos)) {
8913 Py_DECREF(restuple);
8914 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008915 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008916 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008918 else
8919 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008921 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 Py_DECREF(restuple);
8923 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008924 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008925 Py_INCREF(resunicode);
8926 Py_DECREF(restuple);
8927 return resunicode;
8928}
8929
8930/* Lookup the character ch in the mapping and put the result in result,
8931 which must be decrefed by the caller.
8932 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008933static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008935{
Christian Heimes217cfd12007-12-02 14:31:20 +00008936 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008937 PyObject *x;
8938
8939 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008940 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008941 x = PyObject_GetItem(mapping, w);
8942 Py_DECREF(w);
8943 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8945 /* No mapping found means: use 1:1 mapping. */
8946 PyErr_Clear();
8947 *result = NULL;
8948 return 0;
8949 } else
8950 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008951 }
8952 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 *result = x;
8954 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008955 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008956 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008958 if (value < 0 || value > MAX_UNICODE) {
8959 PyErr_Format(PyExc_ValueError,
8960 "character mapping must be in range(0x%x)",
8961 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 Py_DECREF(x);
8963 return -1;
8964 }
8965 *result = x;
8966 return 0;
8967 }
8968 else if (PyUnicode_Check(x)) {
8969 *result = x;
8970 return 0;
8971 }
8972 else {
8973 /* wrong return value */
8974 PyErr_SetString(PyExc_TypeError,
8975 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008976 Py_DECREF(x);
8977 return -1;
8978 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008979}
Victor Stinner1194ea02014-04-04 19:37:40 +02008980
8981/* lookup the character, write the result into the writer.
8982 Return 1 if the result was written into the writer, return 0 if the mapping
8983 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008984static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008985charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8986 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008987{
Victor Stinner1194ea02014-04-04 19:37:40 +02008988 PyObject *item;
8989
8990 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008992
8993 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008994 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008995 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008996 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008998 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008999 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009000
9001 if (item == Py_None) {
9002 Py_DECREF(item);
9003 return 0;
9004 }
9005
9006 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009007 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9008 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9009 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009010 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9011 Py_DECREF(item);
9012 return -1;
9013 }
9014 Py_DECREF(item);
9015 return 1;
9016 }
9017
9018 if (!PyUnicode_Check(item)) {
9019 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009021 }
9022
9023 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9024 Py_DECREF(item);
9025 return -1;
9026 }
9027
9028 Py_DECREF(item);
9029 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009030}
9031
Victor Stinner89a76ab2014-04-05 11:44:04 +02009032static int
9033unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9034 Py_UCS1 *translate)
9035{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009036 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009037 int ret = 0;
9038
Victor Stinner89a76ab2014-04-05 11:44:04 +02009039 if (charmaptranslate_lookup(ch, mapping, &item)) {
9040 return -1;
9041 }
9042
9043 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009044 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009045 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009046 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009047 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009048 /* not found => default to 1:1 mapping */
9049 translate[ch] = ch;
9050 return 1;
9051 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009052 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009053 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009054 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9055 used it */
9056 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009057 /* invalid character or character outside ASCII:
9058 skip the fast translate */
9059 goto exit;
9060 }
9061 translate[ch] = (Py_UCS1)replace;
9062 }
9063 else if (PyUnicode_Check(item)) {
9064 Py_UCS4 replace;
9065
9066 if (PyUnicode_READY(item) == -1) {
9067 Py_DECREF(item);
9068 return -1;
9069 }
9070 if (PyUnicode_GET_LENGTH(item) != 1)
9071 goto exit;
9072
9073 replace = PyUnicode_READ_CHAR(item, 0);
9074 if (replace > 127)
9075 goto exit;
9076 translate[ch] = (Py_UCS1)replace;
9077 }
9078 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009079 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009080 goto exit;
9081 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009082 ret = 1;
9083
Benjamin Peterson1365de72014-04-07 20:15:41 -04009084 exit:
9085 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009086 return ret;
9087}
9088
9089/* Fast path for ascii => ascii translation. Return 1 if the whole string
9090 was translated into writer, return 0 if the input string was partially
9091 translated into writer, raise an exception and return -1 on error. */
9092static int
9093unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009094 _PyUnicodeWriter *writer, int ignore,
9095 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009096{
Victor Stinner872b2912014-04-05 14:27:07 +02009097 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009098 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009099 const Py_UCS1 *in, *end;
9100 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009101 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009102
Victor Stinner89a76ab2014-04-05 11:44:04 +02009103 len = PyUnicode_GET_LENGTH(input);
9104
Victor Stinner872b2912014-04-05 14:27:07 +02009105 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009106
9107 in = PyUnicode_1BYTE_DATA(input);
9108 end = in + len;
9109
9110 assert(PyUnicode_IS_ASCII(writer->buffer));
9111 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9112 out = PyUnicode_1BYTE_DATA(writer->buffer);
9113
Victor Stinner872b2912014-04-05 14:27:07 +02009114 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009115 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009116 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009117 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009118 int translate = unicode_fast_translate_lookup(mapping, ch,
9119 ascii_table);
9120 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009121 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009122 if (translate == 0)
9123 goto exit;
9124 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009125 }
Victor Stinner872b2912014-04-05 14:27:07 +02009126 if (ch2 == 0xfe) {
9127 if (ignore)
9128 continue;
9129 goto exit;
9130 }
9131 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009132 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009133 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009134 }
Victor Stinner872b2912014-04-05 14:27:07 +02009135 res = 1;
9136
9137exit:
9138 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009139 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009140 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009141}
9142
Victor Stinner3222da22015-10-01 22:07:32 +02009143static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144_PyUnicode_TranslateCharmap(PyObject *input,
9145 PyObject *mapping,
9146 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009149 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 Py_ssize_t size, i;
9151 int kind;
9152 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009153 _PyUnicodeWriter writer;
9154 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009155 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009156 PyObject *errorHandler = NULL;
9157 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009158 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009159 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009160
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009162 PyErr_BadArgument();
9163 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009164 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166 if (PyUnicode_READY(input) == -1)
9167 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009168 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009169 kind = PyUnicode_KIND(input);
9170 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009172 if (size == 0)
9173 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009175 /* allocate enough for a simple 1:1 translation without
9176 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009177 _PyUnicodeWriter_Init(&writer);
9178 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009179 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180
Victor Stinner872b2912014-04-05 14:27:07 +02009181 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9182
Victor Stinner33798672016-03-01 21:59:58 +01009183 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009184 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009185 if (PyUnicode_IS_ASCII(input)) {
9186 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9187 if (res < 0) {
9188 _PyUnicodeWriter_Dealloc(&writer);
9189 return NULL;
9190 }
9191 if (res == 1)
9192 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009193 }
Victor Stinner33798672016-03-01 21:59:58 +01009194 else {
9195 i = 0;
9196 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009198 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009200 int translate;
9201 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9202 Py_ssize_t newpos;
9203 /* startpos for collecting untranslatable chars */
9204 Py_ssize_t collstart;
9205 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009206 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207
Victor Stinner1194ea02014-04-04 19:37:40 +02009208 ch = PyUnicode_READ(kind, data, i);
9209 translate = charmaptranslate_output(ch, mapping, &writer);
9210 if (translate < 0)
9211 goto onError;
9212
9213 if (translate != 0) {
9214 /* it worked => adjust input pointer */
9215 ++i;
9216 continue;
9217 }
9218
9219 /* untranslatable character */
9220 collstart = i;
9221 collend = i+1;
9222
9223 /* find all untranslatable characters */
9224 while (collend < size) {
9225 PyObject *x;
9226 ch = PyUnicode_READ(kind, data, collend);
9227 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009228 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009229 Py_XDECREF(x);
9230 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009231 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009232 ++collend;
9233 }
9234
9235 if (ignore) {
9236 i = collend;
9237 }
9238 else {
9239 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9240 reason, input, &exc,
9241 collstart, collend, &newpos);
9242 if (repunicode == NULL)
9243 goto onError;
9244 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009245 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009246 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009247 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009248 Py_DECREF(repunicode);
9249 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009250 }
9251 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009252 Py_XDECREF(exc);
9253 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009254 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255
Benjamin Peterson29060642009-01-31 22:14:21 +00009256 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009257 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009258 Py_XDECREF(exc);
9259 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009260 return NULL;
9261}
9262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263/* Deprecated. Use PyUnicode_Translate instead. */
9264PyObject *
9265PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9266 Py_ssize_t size,
9267 PyObject *mapping,
9268 const char *errors)
9269{
Christian Heimes5f520f42012-09-11 14:03:25 +02009270 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009271 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272 if (!unicode)
9273 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009274 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9275 Py_DECREF(unicode);
9276 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277}
9278
Alexander Belopolsky40018472011-02-26 01:02:56 +00009279PyObject *
9280PyUnicode_Translate(PyObject *str,
9281 PyObject *mapping,
9282 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009283{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009284 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009285 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009286 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287}
Tim Petersced69f82003-09-16 20:30:58 +00009288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289PyObject *
9290_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9291{
9292 if (!PyUnicode_Check(unicode)) {
9293 PyErr_BadInternalCall();
9294 return NULL;
9295 }
9296 if (PyUnicode_READY(unicode) == -1)
9297 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009298 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 /* If the string is already ASCII, just return the same string */
9300 Py_INCREF(unicode);
9301 return unicode;
9302 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009303
9304 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9305 PyObject *result = PyUnicode_New(len, 127);
9306 if (result == NULL) {
9307 return NULL;
9308 }
9309
9310 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9311 int kind = PyUnicode_KIND(unicode);
9312 const void *data = PyUnicode_DATA(unicode);
9313 Py_ssize_t i;
9314 for (i = 0; i < len; ++i) {
9315 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9316 if (ch < 127) {
9317 out[i] = ch;
9318 }
9319 else if (Py_UNICODE_ISSPACE(ch)) {
9320 out[i] = ' ';
9321 }
9322 else {
9323 int decimal = Py_UNICODE_TODECIMAL(ch);
9324 if (decimal < 0) {
9325 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009326 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009327 _PyUnicode_LENGTH(result) = i + 1;
9328 break;
9329 }
9330 out[i] = '0' + decimal;
9331 }
9332 }
9333
INADA Naoki16dfca42018-07-14 12:06:43 +09009334 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009335 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336}
9337
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009338PyObject *
9339PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9340 Py_ssize_t length)
9341{
Victor Stinnerf0124502011-11-21 23:12:56 +01009342 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009343 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009344 Py_UCS4 maxchar;
9345 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009346 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009347
Victor Stinner99d7ad02012-02-22 13:37:39 +01009348 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009349 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009350 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009351 if (ch > 127) {
9352 int decimal = Py_UNICODE_TODECIMAL(ch);
9353 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009354 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009355 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009356 }
9357 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009358
9359 /* Copy to a new string */
9360 decimal = PyUnicode_New(length, maxchar);
9361 if (decimal == NULL)
9362 return decimal;
9363 kind = PyUnicode_KIND(decimal);
9364 data = PyUnicode_DATA(decimal);
9365 /* Iterate over code points */
9366 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009367 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009368 if (ch > 127) {
9369 int decimal = Py_UNICODE_TODECIMAL(ch);
9370 if (decimal >= 0)
9371 ch = '0' + decimal;
9372 }
9373 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009375 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009376}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009377/* --- Decimal Encoder ---------------------------------------------------- */
9378
Alexander Belopolsky40018472011-02-26 01:02:56 +00009379int
9380PyUnicode_EncodeDecimal(Py_UNICODE *s,
9381 Py_ssize_t length,
9382 char *output,
9383 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009384{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009385 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009386 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009387 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009388 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009389
9390 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 PyErr_BadArgument();
9392 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009393 }
9394
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009395 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009396 if (unicode == NULL)
9397 return -1;
9398
Victor Stinner42bf7752011-11-21 22:52:58 +01009399 kind = PyUnicode_KIND(unicode);
9400 data = PyUnicode_DATA(unicode);
9401
Victor Stinnerb84d7232011-11-22 01:50:07 +01009402 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009403 PyObject *exc;
9404 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009405 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009406 Py_ssize_t startpos;
9407
9408 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009409
Benjamin Peterson29060642009-01-31 22:14:21 +00009410 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009411 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009412 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009413 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009414 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009415 decimal = Py_UNICODE_TODECIMAL(ch);
9416 if (decimal >= 0) {
9417 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009418 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009419 continue;
9420 }
9421 if (0 < ch && ch < 256) {
9422 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009423 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009424 continue;
9425 }
Victor Stinner6345be92011-11-25 20:09:01 +01009426
Victor Stinner42bf7752011-11-21 22:52:58 +01009427 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009428 exc = NULL;
9429 raise_encode_exception(&exc, "decimal", unicode,
9430 startpos, startpos+1,
9431 "invalid decimal Unicode string");
9432 Py_XDECREF(exc);
9433 Py_DECREF(unicode);
9434 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009435 }
9436 /* 0-terminate the output string */
9437 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009438 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009439 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009440}
9441
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442/* --- Helpers ------------------------------------------------------------ */
9443
/* Helper macro to fix up start/end slice values against a length:
   `end` is capped at `len`; a negative `end` or `start` has `len` added to
   it and is then floored at 0.  `start` is not capped at `len`.
   Expands to plain statements that assign to `start` and `end`; every
   argument is evaluated more than once. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len) {                            \
        end = len;                              \
    }                                           \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0) {                          \
            end = 0;                            \
        }                                       \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0) {                        \
            start = 0;                          \
        }                                       \
    }
9458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009460any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009462 Py_ssize_t end,
9463 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009465 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009466 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 Py_ssize_t len1, len2, result;
9468
9469 kind1 = PyUnicode_KIND(s1);
9470 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009471 if (kind1 < kind2)
9472 return -1;
9473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 len1 = PyUnicode_GET_LENGTH(s1);
9475 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009476 ADJUST_INDICES(start, end, len1);
9477 if (end - start < len2)
9478 return -1;
9479
9480 buf1 = PyUnicode_DATA(s1);
9481 buf2 = PyUnicode_DATA(s2);
9482 if (len2 == 1) {
9483 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9484 result = findchar((const char *)buf1 + kind1*start,
9485 kind1, end - start, ch, direction);
9486 if (result == -1)
9487 return -1;
9488 else
9489 return start + result;
9490 }
9491
9492 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009493 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009494 if (!buf2)
9495 return -2;
9496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497
Victor Stinner794d5672011-10-10 03:21:36 +02009498 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009499 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009500 case PyUnicode_1BYTE_KIND:
9501 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9502 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9503 else
9504 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9505 break;
9506 case PyUnicode_2BYTE_KIND:
9507 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9508 break;
9509 case PyUnicode_4BYTE_KIND:
9510 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9511 break;
9512 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009513 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009514 }
9515 }
9516 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009517 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009518 case PyUnicode_1BYTE_KIND:
9519 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9520 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9521 else
9522 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9523 break;
9524 case PyUnicode_2BYTE_KIND:
9525 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9526 break;
9527 case PyUnicode_4BYTE_KIND:
9528 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9529 break;
9530 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009531 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009532 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 }
9534
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009535 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009536 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009537 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538
9539 return result;
9540}
9541
Victor Stinner59423e32018-11-26 13:40:01 +01009542/* _PyUnicode_InsertThousandsGrouping() helper functions */
9543#include "stringlib/localeutil.h"
9544
9545/**
9546 * InsertThousandsGrouping:
9547 * @writer: Unicode writer.
9548 * @n_buffer: Number of characters in @buffer.
9549 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9550 * @d_pos: Start of digits string.
9551 * @n_digits: The number of digits in the string, in which we want
9552 * to put the grouping chars.
9553 * @min_width: The minimum width of the digits in the output string.
9554 * Output will be zero-padded on the left to fill.
9555 * @grouping: see definition in localeconv().
9556 * @thousands_sep: see definition in localeconv().
9557 *
9558 * There are 2 modes: counting and filling. If @writer is NULL,
9559 * we are in counting mode, else filling mode.
9560 * If counting, the required buffer size is returned.
9561 * If filling, we know the buffer will be large enough, so we don't
9562 * need to pass in the buffer size.
9563 * Inserts thousand grouping characters (as defined by grouping and
9564 * thousands_sep) into @writer.
9565 *
9566 * Return value: -1 on error, number of characters otherwise.
9567 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009569_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009570 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009571 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009572 PyObject *digits,
9573 Py_ssize_t d_pos,
9574 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009575 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009576 const char *grouping,
9577 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009578 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579{
Xtreak3f7983a2019-01-07 20:39:14 +05309580 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009581 if (writer) {
9582 assert(digits != NULL);
9583 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009584 }
9585 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009586 assert(digits == NULL);
9587 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009588 }
Victor Stinner59423e32018-11-26 13:40:01 +01009589 assert(0 <= d_pos);
9590 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009591 assert(grouping != NULL);
9592
9593 if (digits != NULL) {
9594 if (PyUnicode_READY(digits) == -1) {
9595 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009596 }
Victor Stinner59423e32018-11-26 13:40:01 +01009597 }
9598 if (PyUnicode_READY(thousands_sep) == -1) {
9599 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009600 }
9601
Victor Stinner59423e32018-11-26 13:40:01 +01009602 Py_ssize_t count = 0;
9603 Py_ssize_t n_zeros;
9604 int loop_broken = 0;
9605 int use_separator = 0; /* First time through, don't append the
9606 separator. They only go between
9607 groups. */
9608 Py_ssize_t buffer_pos;
9609 Py_ssize_t digits_pos;
9610 Py_ssize_t len;
9611 Py_ssize_t n_chars;
9612 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9613 be looked at */
9614 /* A generator that returns all of the grouping widths, until it
9615 returns 0. */
9616 GroupGenerator groupgen;
9617 GroupGenerator_init(&groupgen, grouping);
9618 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9619
9620 /* if digits are not grouped, thousands separator
9621 should be an empty string */
9622 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9623
9624 digits_pos = d_pos + n_digits;
9625 if (writer) {
9626 buffer_pos = writer->pos + n_buffer;
9627 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9628 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 }
Victor Stinner59423e32018-11-26 13:40:01 +01009630 else {
9631 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009632 }
Victor Stinner59423e32018-11-26 13:40:01 +01009633
9634 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009635 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009636 }
Victor Stinner59423e32018-11-26 13:40:01 +01009637
9638 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9639 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9640 n_zeros = Py_MAX(0, len - remaining);
9641 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9642
9643 /* Use n_zero zero's and n_chars chars */
9644
9645 /* Count only, don't do anything. */
9646 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9647
9648 /* Copy into the writer. */
9649 InsertThousandsGrouping_fill(writer, &buffer_pos,
9650 digits, &digits_pos,
9651 n_chars, n_zeros,
9652 use_separator ? thousands_sep : NULL,
9653 thousands_sep_len, maxchar);
9654
9655 /* Use a separator next time. */
9656 use_separator = 1;
9657
9658 remaining -= n_chars;
9659 min_width -= len;
9660
9661 if (remaining <= 0 && min_width <= 0) {
9662 loop_broken = 1;
9663 break;
9664 }
9665 min_width -= thousands_sep_len;
9666 }
9667 if (!loop_broken) {
9668 /* We left the loop without using a break statement. */
9669
9670 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9671 n_zeros = Py_MAX(0, len - remaining);
9672 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9673
9674 /* Use n_zero zero's and n_chars chars */
9675 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9676
9677 /* Copy into the writer. */
9678 InsertThousandsGrouping_fill(writer, &buffer_pos,
9679 digits, &digits_pos,
9680 n_chars, n_zeros,
9681 use_separator ? thousands_sep : NULL,
9682 thousands_sep_len, maxchar);
9683 }
9684 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685}
9686
9687
Alexander Belopolsky40018472011-02-26 01:02:56 +00009688Py_ssize_t
9689PyUnicode_Count(PyObject *str,
9690 PyObject *substr,
9691 Py_ssize_t start,
9692 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009694 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009695 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009696 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009698
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009699 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009700 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009701
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009702 kind1 = PyUnicode_KIND(str);
9703 kind2 = PyUnicode_KIND(substr);
9704 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009705 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009706
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009707 len1 = PyUnicode_GET_LENGTH(str);
9708 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009710 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009711 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009712
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009713 buf1 = PyUnicode_DATA(str);
9714 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009715 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009716 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009717 if (!buf2)
9718 goto onError;
9719 }
9720
9721 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009723 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009724 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009725 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009726 buf2, len2, PY_SSIZE_T_MAX
9727 );
9728 else
9729 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009730 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009731 buf2, len2, PY_SSIZE_T_MAX
9732 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 break;
9734 case PyUnicode_2BYTE_KIND:
9735 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009736 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 buf2, len2, PY_SSIZE_T_MAX
9738 );
9739 break;
9740 case PyUnicode_4BYTE_KIND:
9741 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009742 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009743 buf2, len2, PY_SSIZE_T_MAX
9744 );
9745 break;
9746 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009747 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009749
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009750 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009751 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009752 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009753
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009756 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9757 if (kind2 != kind1)
9758 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009759 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760}
9761
Alexander Belopolsky40018472011-02-26 01:02:56 +00009762Py_ssize_t
9763PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009764 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009765 Py_ssize_t start,
9766 Py_ssize_t end,
9767 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009769 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009770 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009771
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009772 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773}
9774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775Py_ssize_t
9776PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9777 Py_ssize_t start, Py_ssize_t end,
9778 int direction)
9779{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009781 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 if (PyUnicode_READY(str) == -1)
9783 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009784 len = PyUnicode_GET_LENGTH(str);
9785 ADJUST_INDICES(start, end, len);
9786 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009787 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009788 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009789 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9790 kind, end-start, ch, direction);
9791 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009793 else
9794 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795}
9796
Alexander Belopolsky40018472011-02-26 01:02:56 +00009797static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009798tailmatch(PyObject *self,
9799 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009800 Py_ssize_t start,
9801 Py_ssize_t end,
9802 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 int kind_self;
9805 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009806 const void *data_self;
9807 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 Py_ssize_t offset;
9809 Py_ssize_t i;
9810 Py_ssize_t end_sub;
9811
9812 if (PyUnicode_READY(self) == -1 ||
9813 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009814 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9817 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009819 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009820
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009821 if (PyUnicode_GET_LENGTH(substring) == 0)
9822 return 1;
9823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 kind_self = PyUnicode_KIND(self);
9825 data_self = PyUnicode_DATA(self);
9826 kind_sub = PyUnicode_KIND(substring);
9827 data_sub = PyUnicode_DATA(substring);
9828 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9829
9830 if (direction > 0)
9831 offset = end;
9832 else
9833 offset = start;
9834
9835 if (PyUnicode_READ(kind_self, data_self, offset) ==
9836 PyUnicode_READ(kind_sub, data_sub, 0) &&
9837 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9838 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9839 /* If both are of the same kind, memcmp is sufficient */
9840 if (kind_self == kind_sub) {
9841 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009842 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 data_sub,
9844 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009845 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009847 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009848 else {
9849 /* We do not need to compare 0 and len(substring)-1 because
9850 the if statement above ensured already that they are equal
9851 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 for (i = 1; i < end_sub; ++i) {
9853 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9854 PyUnicode_READ(kind_sub, data_sub, i))
9855 return 0;
9856 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009857 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859 }
9860
9861 return 0;
9862}
9863
Alexander Belopolsky40018472011-02-26 01:02:56 +00009864Py_ssize_t
9865PyUnicode_Tailmatch(PyObject *str,
9866 PyObject *substr,
9867 Py_ssize_t start,
9868 Py_ssize_t end,
9869 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009870{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009871 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009872 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009873
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009874 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875}
9876
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009877static PyObject *
9878ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009880 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009881 const char *data = PyUnicode_DATA(self);
9882 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009883 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009884
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009885 res = PyUnicode_New(len, 127);
9886 if (res == NULL)
9887 return NULL;
9888 resdata = PyUnicode_DATA(res);
9889 if (lower)
9890 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009892 _Py_bytes_upper(resdata, data, len);
9893 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894}
9895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009897handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009899 Py_ssize_t j;
9900 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009901 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009902 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009903
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009904 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9905
9906 where ! is a negation and \p{xxx} is a character with property xxx.
9907 */
9908 for (j = i - 1; j >= 0; j--) {
9909 c = PyUnicode_READ(kind, data, j);
9910 if (!_PyUnicode_IsCaseIgnorable(c))
9911 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009913 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9914 if (final_sigma) {
9915 for (j = i + 1; j < length; j++) {
9916 c = PyUnicode_READ(kind, data, j);
9917 if (!_PyUnicode_IsCaseIgnorable(c))
9918 break;
9919 }
9920 final_sigma = j == length || !_PyUnicode_IsCased(c);
9921 }
9922 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923}
9924
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009925static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009926lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009927 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009929 /* Obscure special case. */
9930 if (c == 0x3A3) {
9931 mapped[0] = handle_capital_sigma(kind, data, length, i);
9932 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009934 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935}
9936
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009937static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009938do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009940 Py_ssize_t i, k = 0;
9941 int n_res, j;
9942 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009943
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009944 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009945 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009946 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009947 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009948 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009950 for (i = 1; i < length; i++) {
9951 c = PyUnicode_READ(kind, data, i);
9952 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9953 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009954 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009955 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009956 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009957 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009958 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959}
9960
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009961static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009962do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009963 Py_ssize_t i, k = 0;
9964
9965 for (i = 0; i < length; i++) {
9966 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9967 int n_res, j;
9968 if (Py_UNICODE_ISUPPER(c)) {
9969 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9970 }
9971 else if (Py_UNICODE_ISLOWER(c)) {
9972 n_res = _PyUnicode_ToUpperFull(c, mapped);
9973 }
9974 else {
9975 n_res = 1;
9976 mapped[0] = c;
9977 }
9978 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009979 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009980 res[k++] = mapped[j];
9981 }
9982 }
9983 return k;
9984}
9985
9986static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009987do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009988 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009990 Py_ssize_t i, k = 0;
9991
9992 for (i = 0; i < length; i++) {
9993 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9994 int n_res, j;
9995 if (lower)
9996 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9997 else
9998 n_res = _PyUnicode_ToUpperFull(c, mapped);
9999 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010000 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010001 res[k++] = mapped[j];
10002 }
10003 }
10004 return k;
10005}
10006
10007static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010008do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010009{
10010 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10011}
10012
10013static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010014do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010015{
10016 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10017}
10018
Benjamin Petersone51757f2012-01-12 21:10:29 -050010019static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010020do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010021{
10022 Py_ssize_t i, k = 0;
10023
10024 for (i = 0; i < length; i++) {
10025 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10026 Py_UCS4 mapped[3];
10027 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10028 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010029 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010030 res[k++] = mapped[j];
10031 }
10032 }
10033 return k;
10034}
10035
10036static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010037do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010038{
10039 Py_ssize_t i, k = 0;
10040 int previous_is_cased;
10041
10042 previous_is_cased = 0;
10043 for (i = 0; i < length; i++) {
10044 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10045 Py_UCS4 mapped[3];
10046 int n_res, j;
10047
10048 if (previous_is_cased)
10049 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10050 else
10051 n_res = _PyUnicode_ToTitleFull(c, mapped);
10052
10053 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010054 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010055 res[k++] = mapped[j];
10056 }
10057
10058 previous_is_cased = _PyUnicode_IsCased(c);
10059 }
10060 return k;
10061}
10062
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010063static PyObject *
10064case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010065 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010066{
10067 PyObject *res = NULL;
10068 Py_ssize_t length, newlength = 0;
10069 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010070 const void *data;
10071 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010072 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10073
Benjamin Petersoneea48462012-01-16 14:28:50 -050010074 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010075
10076 kind = PyUnicode_KIND(self);
10077 data = PyUnicode_DATA(self);
10078 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010079 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010080 PyErr_SetString(PyExc_OverflowError, "string is too long");
10081 return NULL;
10082 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010083 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010084 if (tmp == NULL)
10085 return PyErr_NoMemory();
10086 newlength = perform(kind, data, length, tmp, &maxchar);
10087 res = PyUnicode_New(newlength, maxchar);
10088 if (res == NULL)
10089 goto leave;
10090 tmpend = tmp + newlength;
10091 outdata = PyUnicode_DATA(res);
10092 outkind = PyUnicode_KIND(res);
10093 switch (outkind) {
10094 case PyUnicode_1BYTE_KIND:
10095 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10096 break;
10097 case PyUnicode_2BYTE_KIND:
10098 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10099 break;
10100 case PyUnicode_4BYTE_KIND:
10101 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10102 break;
10103 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010104 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010105 }
10106 leave:
10107 PyMem_FREE(tmp);
10108 return res;
10109}
10110
Tim Peters8ce9f162004-08-27 01:49:32 +000010111PyObject *
10112PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010113{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010114 PyObject *res;
10115 PyObject *fseq;
10116 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010117 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010118
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010119 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010120 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010121 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010122 }
10123
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010124 /* NOTE: the following code can't call back into Python code,
10125 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010126 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010127
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010128 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010129 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010130 res = _PyUnicode_JoinArray(separator, items, seqlen);
10131 Py_DECREF(fseq);
10132 return res;
10133}
10134
10135PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010136_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010137{
10138 PyObject *res = NULL; /* the result */
10139 PyObject *sep = NULL;
10140 Py_ssize_t seplen;
10141 PyObject *item;
10142 Py_ssize_t sz, i, res_offset;
10143 Py_UCS4 maxchar;
10144 Py_UCS4 item_maxchar;
10145 int use_memcpy;
10146 unsigned char *res_data = NULL, *sep_data = NULL;
10147 PyObject *last_obj;
10148 unsigned int kind = 0;
10149
Tim Peters05eba1f2004-08-27 21:32:02 +000010150 /* If empty sequence, return u"". */
10151 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010152 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010153 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010154
Tim Peters05eba1f2004-08-27 21:32:02 +000010155 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010156 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010157 if (seqlen == 1) {
10158 if (PyUnicode_CheckExact(items[0])) {
10159 res = items[0];
10160 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010161 return res;
10162 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010163 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010164 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010165 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010166 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010167 /* Set up sep and seplen */
10168 if (separator == NULL) {
10169 /* fall back to a blank space separator */
10170 sep = PyUnicode_FromOrdinal(' ');
10171 if (!sep)
10172 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010173 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010174 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010175 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010176 else {
10177 if (!PyUnicode_Check(separator)) {
10178 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010179 "separator: expected str instance,"
10180 " %.80s found",
10181 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010182 goto onError;
10183 }
10184 if (PyUnicode_READY(separator))
10185 goto onError;
10186 sep = separator;
10187 seplen = PyUnicode_GET_LENGTH(separator);
10188 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10189 /* inc refcount to keep this code path symmetric with the
10190 above case of a blank separator */
10191 Py_INCREF(sep);
10192 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010193 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010194 }
10195
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010196 /* There are at least two things to join, or else we have a subclass
10197 * of str in the sequence.
10198 * Do a pre-pass to figure out the total amount of space we'll
10199 * need (sz), and see whether all argument are strings.
10200 */
10201 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010202#ifdef Py_DEBUG
10203 use_memcpy = 0;
10204#else
10205 use_memcpy = 1;
10206#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010207 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010208 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010209 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010210 if (!PyUnicode_Check(item)) {
10211 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010212 "sequence item %zd: expected str instance,"
10213 " %.80s found",
10214 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010215 goto onError;
10216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 if (PyUnicode_READY(item) == -1)
10218 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010219 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010221 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010222 if (i != 0) {
10223 add_sz += seplen;
10224 }
10225 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010226 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010227 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010228 goto onError;
10229 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010230 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010231 if (use_memcpy && last_obj != NULL) {
10232 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10233 use_memcpy = 0;
10234 }
10235 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010236 }
Tim Petersced69f82003-09-16 20:30:58 +000010237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010239 if (res == NULL)
10240 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010241
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010242 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010243#ifdef Py_DEBUG
10244 use_memcpy = 0;
10245#else
10246 if (use_memcpy) {
10247 res_data = PyUnicode_1BYTE_DATA(res);
10248 kind = PyUnicode_KIND(res);
10249 if (seplen != 0)
10250 sep_data = PyUnicode_1BYTE_DATA(sep);
10251 }
10252#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010253 if (use_memcpy) {
10254 for (i = 0; i < seqlen; ++i) {
10255 Py_ssize_t itemlen;
10256 item = items[i];
10257
10258 /* Copy item, and maybe the separator. */
10259 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010260 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010261 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010262 kind * seplen);
10263 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010264 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010265
10266 itemlen = PyUnicode_GET_LENGTH(item);
10267 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010268 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010269 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010270 kind * itemlen);
10271 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010272 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010273 }
10274 assert(res_data == PyUnicode_1BYTE_DATA(res)
10275 + kind * PyUnicode_GET_LENGTH(res));
10276 }
10277 else {
10278 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10279 Py_ssize_t itemlen;
10280 item = items[i];
10281
10282 /* Copy item, and maybe the separator. */
10283 if (i && seplen != 0) {
10284 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10285 res_offset += seplen;
10286 }
10287
10288 itemlen = PyUnicode_GET_LENGTH(item);
10289 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010290 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010291 res_offset += itemlen;
10292 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010293 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010294 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010295 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010298 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300
Benjamin Peterson29060642009-01-31 22:14:21 +000010301 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010303 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304 return NULL;
10305}
10306
Victor Stinnerd3f08822012-05-29 12:57:52 +020010307void
10308_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10309 Py_UCS4 fill_char)
10310{
10311 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010312 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010313 assert(PyUnicode_IS_READY(unicode));
10314 assert(unicode_modifiable(unicode));
10315 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10316 assert(start >= 0);
10317 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010318 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010319}
10320
Victor Stinner3fe55312012-01-04 00:33:50 +010010321Py_ssize_t
10322PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10323 Py_UCS4 fill_char)
10324{
10325 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010326
10327 if (!PyUnicode_Check(unicode)) {
10328 PyErr_BadInternalCall();
10329 return -1;
10330 }
10331 if (PyUnicode_READY(unicode) == -1)
10332 return -1;
10333 if (unicode_check_modifiable(unicode))
10334 return -1;
10335
Victor Stinnerd3f08822012-05-29 12:57:52 +020010336 if (start < 0) {
10337 PyErr_SetString(PyExc_IndexError, "string index out of range");
10338 return -1;
10339 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010340 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10341 PyErr_SetString(PyExc_ValueError,
10342 "fill character is bigger than "
10343 "the string maximum character");
10344 return -1;
10345 }
10346
10347 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10348 length = Py_MIN(maxlen, length);
10349 if (length <= 0)
10350 return 0;
10351
Victor Stinnerd3f08822012-05-29 12:57:52 +020010352 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010353 return length;
10354}
10355
Victor Stinner9310abb2011-10-05 00:59:23 +020010356static PyObject *
10357pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010358 Py_ssize_t left,
10359 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 PyObject *u;
10363 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010364 int kind;
10365 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366
10367 if (left < 0)
10368 left = 0;
10369 if (right < 0)
10370 right = 0;
10371
Victor Stinnerc4b49542011-12-11 22:44:26 +010010372 if (left == 0 && right == 0)
10373 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10376 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010377 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10378 return NULL;
10379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010381 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010383 if (!u)
10384 return NULL;
10385
10386 kind = PyUnicode_KIND(u);
10387 data = PyUnicode_DATA(u);
10388 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010389 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010390 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010391 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010392 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010393 assert(_PyUnicode_CheckConsistency(u, 1));
10394 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395}
10396
Alexander Belopolsky40018472011-02-26 01:02:56 +000010397PyObject *
10398PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010402 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010403 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404
Benjamin Petersonead6b532011-12-20 17:23:42 -060010405 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010407 if (PyUnicode_IS_ASCII(string))
10408 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010409 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010410 PyUnicode_GET_LENGTH(string), keepends);
10411 else
10412 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010413 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010414 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 break;
10416 case PyUnicode_2BYTE_KIND:
10417 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010418 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 PyUnicode_GET_LENGTH(string), keepends);
10420 break;
10421 case PyUnicode_4BYTE_KIND:
10422 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010423 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 PyUnicode_GET_LENGTH(string), keepends);
10425 break;
10426 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010427 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430}
10431
Alexander Belopolsky40018472011-02-26 01:02:56 +000010432static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010433split(PyObject *self,
10434 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010435 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010437 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010438 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 Py_ssize_t len1, len2;
10440 PyObject* out;
10441
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010443 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 if (PyUnicode_READY(self) == -1)
10446 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010449 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010451 if (PyUnicode_IS_ASCII(self))
10452 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010453 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010454 PyUnicode_GET_LENGTH(self), maxcount
10455 );
10456 else
10457 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010458 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010459 PyUnicode_GET_LENGTH(self), maxcount
10460 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 case PyUnicode_2BYTE_KIND:
10462 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010463 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 PyUnicode_GET_LENGTH(self), maxcount
10465 );
10466 case PyUnicode_4BYTE_KIND:
10467 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010468 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 PyUnicode_GET_LENGTH(self), maxcount
10470 );
10471 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010472 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 }
10474
10475 if (PyUnicode_READY(substring) == -1)
10476 return NULL;
10477
10478 kind1 = PyUnicode_KIND(self);
10479 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 len1 = PyUnicode_GET_LENGTH(self);
10481 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010482 if (kind1 < kind2 || len1 < len2) {
10483 out = PyList_New(1);
10484 if (out == NULL)
10485 return NULL;
10486 Py_INCREF(self);
10487 PyList_SET_ITEM(out, 0, self);
10488 return out;
10489 }
10490 buf1 = PyUnicode_DATA(self);
10491 buf2 = PyUnicode_DATA(substring);
10492 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010493 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010494 if (!buf2)
10495 return NULL;
10496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010498 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010500 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10501 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010502 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010503 else
10504 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010505 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 break;
10507 case PyUnicode_2BYTE_KIND:
10508 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010509 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 break;
10511 case PyUnicode_4BYTE_KIND:
10512 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010513 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 break;
10515 default:
10516 out = NULL;
10517 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010518 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010519 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010520 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522}
10523
Alexander Belopolsky40018472011-02-26 01:02:56 +000010524static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010525rsplit(PyObject *self,
10526 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010527 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010528{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010529 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010530 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 Py_ssize_t len1, len2;
10532 PyObject* out;
10533
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010534 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010535 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 if (PyUnicode_READY(self) == -1)
10538 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010541 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010543 if (PyUnicode_IS_ASCII(self))
10544 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010545 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010546 PyUnicode_GET_LENGTH(self), maxcount
10547 );
10548 else
10549 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010550 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010551 PyUnicode_GET_LENGTH(self), maxcount
10552 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 case PyUnicode_2BYTE_KIND:
10554 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010555 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 PyUnicode_GET_LENGTH(self), maxcount
10557 );
10558 case PyUnicode_4BYTE_KIND:
10559 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010560 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 PyUnicode_GET_LENGTH(self), maxcount
10562 );
10563 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010564 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 }
10566
10567 if (PyUnicode_READY(substring) == -1)
10568 return NULL;
10569
10570 kind1 = PyUnicode_KIND(self);
10571 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 len1 = PyUnicode_GET_LENGTH(self);
10573 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010574 if (kind1 < kind2 || len1 < len2) {
10575 out = PyList_New(1);
10576 if (out == NULL)
10577 return NULL;
10578 Py_INCREF(self);
10579 PyList_SET_ITEM(out, 0, self);
10580 return out;
10581 }
10582 buf1 = PyUnicode_DATA(self);
10583 buf2 = PyUnicode_DATA(substring);
10584 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010585 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010586 if (!buf2)
10587 return NULL;
10588 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010590 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010592 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10593 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010594 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010595 else
10596 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010597 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 break;
10599 case PyUnicode_2BYTE_KIND:
10600 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010601 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 break;
10603 case PyUnicode_4BYTE_KIND:
10604 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010605 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 break;
10607 default:
10608 out = NULL;
10609 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010610 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010611 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010612 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 return out;
10614}
10615
10616static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010617anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10618 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010620 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010622 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10623 return asciilib_find(buf1, len1, buf2, len2, offset);
10624 else
10625 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 case PyUnicode_2BYTE_KIND:
10627 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10628 case PyUnicode_4BYTE_KIND:
10629 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10630 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010631 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632}
10633
10634static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010635anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10636 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010638 switch (kind) {
10639 case PyUnicode_1BYTE_KIND:
10640 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10641 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10642 else
10643 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10644 case PyUnicode_2BYTE_KIND:
10645 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10646 case PyUnicode_4BYTE_KIND:
10647 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10648 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010649 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010650}
10651
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010652static void
10653replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10654 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10655{
10656 int kind = PyUnicode_KIND(u);
10657 void *data = PyUnicode_DATA(u);
10658 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10659 if (kind == PyUnicode_1BYTE_KIND) {
10660 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10661 (Py_UCS1 *)data + len,
10662 u1, u2, maxcount);
10663 }
10664 else if (kind == PyUnicode_2BYTE_KIND) {
10665 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10666 (Py_UCS2 *)data + len,
10667 u1, u2, maxcount);
10668 }
10669 else {
10670 assert(kind == PyUnicode_4BYTE_KIND);
10671 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10672 (Py_UCS4 *)data + len,
10673 u1, u2, maxcount);
10674 }
10675}
10676
Alexander Belopolsky40018472011-02-26 01:02:56 +000010677static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678replace(PyObject *self, PyObject *str1,
10679 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010680{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010682 const char *sbuf = PyUnicode_DATA(self);
10683 const void *buf1 = PyUnicode_DATA(str1);
10684 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 int srelease = 0, release1 = 0, release2 = 0;
10686 int skind = PyUnicode_KIND(self);
10687 int kind1 = PyUnicode_KIND(str1);
10688 int kind2 = PyUnicode_KIND(str2);
10689 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10690 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10691 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010692 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010693 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010695 if (slen < len1)
10696 goto nothing;
10697
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010699 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010700 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010701 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702
Victor Stinner59de0ee2011-10-07 10:01:28 +020010703 if (str1 == str2)
10704 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705
Victor Stinner49a0a212011-10-12 23:46:10 +020010706 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010707 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10708 if (maxchar < maxchar_str1)
10709 /* substring too wide to be present */
10710 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010711 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10712 /* Replacing str1 with str2 may cause a maxchar reduction in the
10713 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010714 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010715 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010718 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010720 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010722 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010723 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010724 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010725
Victor Stinner69ed0f42013-04-09 21:48:24 +020010726 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010727 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010728 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010729 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010730 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010732 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010734
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010735 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10736 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010737 }
10738 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 int rkind = skind;
10740 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010741 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 if (kind1 < rkind) {
10744 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010745 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746 if (!buf1) goto error;
10747 release1 = 1;
10748 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010749 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010750 if (i < 0)
10751 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 if (rkind > kind2) {
10753 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010754 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 if (!buf2) goto error;
10756 release2 = 1;
10757 }
10758 else if (rkind < kind2) {
10759 /* widen self and buf1 */
10760 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010761 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010762 assert(buf1 != PyUnicode_DATA(str1));
10763 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010764 buf1 = PyUnicode_DATA(str1);
10765 release1 = 0;
10766 }
10767 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 if (!sbuf) goto error;
10769 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010770 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 if (!buf1) goto error;
10772 release1 = 1;
10773 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010774 u = PyUnicode_New(slen, maxchar);
10775 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010777 assert(PyUnicode_KIND(u) == rkind);
10778 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010779
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010780 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010781 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010782 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010784 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010786
10787 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010788 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010789 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010790 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010791 if (i == -1)
10792 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010793 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010795 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010799 }
10800 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010802 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 int rkind = skind;
10804 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010807 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010808 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 if (!buf1) goto error;
10810 release1 = 1;
10811 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010812 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010813 if (n == 0)
10814 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010816 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010817 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818 if (!buf2) goto error;
10819 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010822 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010824 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 if (!sbuf) goto error;
10826 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010827 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010828 assert(buf1 != PyUnicode_DATA(str1));
10829 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010830 buf1 = PyUnicode_DATA(str1);
10831 release1 = 0;
10832 }
10833 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 if (!buf1) goto error;
10835 release1 = 1;
10836 }
10837 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10838 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010839 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 PyErr_SetString(PyExc_OverflowError,
10841 "replace string is too long");
10842 goto error;
10843 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010844 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010845 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020010846 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020010847 goto done;
10848 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010849 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 PyErr_SetString(PyExc_OverflowError,
10851 "replace string is too long");
10852 goto error;
10853 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010854 u = PyUnicode_New(new_size, maxchar);
10855 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010857 assert(PyUnicode_KIND(u) == rkind);
10858 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 ires = i = 0;
10860 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010861 while (n-- > 0) {
10862 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010863 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010864 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010865 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010866 if (j == -1)
10867 break;
10868 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010869 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010870 memcpy(res + rkind * ires,
10871 sbuf + rkind * i,
10872 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010874 }
10875 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010876 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010877 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010878 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010879 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010881 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010882 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010883 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010884 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010885 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010886 memcpy(res + rkind * ires,
10887 sbuf + rkind * i,
10888 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010889 }
10890 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010891 /* interleave */
10892 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010893 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010895 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010897 if (--n <= 0)
10898 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010899 memcpy(res + rkind * ires,
10900 sbuf + rkind * i,
10901 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902 ires++;
10903 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010904 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010905 memcpy(res + rkind * ires,
10906 sbuf + rkind * i,
10907 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010908 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010909 }
10910
10911 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010912 unicode_adjust_maxchar(&u);
10913 if (u == NULL)
10914 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010916
10917 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010918 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10919 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10920 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010922 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010923 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010924 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010926 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010927 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010929
Benjamin Peterson29060642009-01-31 22:14:21 +000010930 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010931 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010932 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10933 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10934 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010936 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010938 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010940 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010941 return unicode_result_unchanged(self);
10942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010944 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10945 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10946 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10947 if (srelease)
10948 PyMem_FREE((void *)sbuf);
10949 if (release1)
10950 PyMem_FREE((void *)buf1);
10951 if (release2)
10952 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954}
10955
10956/* --- Unicode Object Methods --------------------------------------------- */
10957
INADA Naoki3ae20562017-01-16 20:41:20 +090010958/*[clinic input]
10959str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960
INADA Naoki3ae20562017-01-16 20:41:20 +090010961Return a version of the string where each word is titlecased.
10962
10963More specifically, words start with uppercased characters and all remaining
10964cased characters have lower case.
10965[clinic start generated code]*/
10966
10967static PyObject *
10968unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010969/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010971 if (PyUnicode_READY(self) == -1)
10972 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010973 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974}
10975
INADA Naoki3ae20562017-01-16 20:41:20 +090010976/*[clinic input]
10977str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978
INADA Naoki3ae20562017-01-16 20:41:20 +090010979Return a capitalized version of the string.
10980
10981More specifically, make the first character have upper case and the rest lower
10982case.
10983[clinic start generated code]*/
10984
10985static PyObject *
10986unicode_capitalize_impl(PyObject *self)
10987/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010989 if (PyUnicode_READY(self) == -1)
10990 return NULL;
10991 if (PyUnicode_GET_LENGTH(self) == 0)
10992 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010993 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994}
10995
INADA Naoki3ae20562017-01-16 20:41:20 +090010996/*[clinic input]
10997str.casefold as unicode_casefold
10998
10999Return a version of the string suitable for caseless comparisons.
11000[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011001
11002static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011003unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011004/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011005{
11006 if (PyUnicode_READY(self) == -1)
11007 return NULL;
11008 if (PyUnicode_IS_ASCII(self))
11009 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011010 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011011}
11012
11013
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011014/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011015
11016static int
11017convert_uc(PyObject *obj, void *addr)
11018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011020
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011021 if (!PyUnicode_Check(obj)) {
11022 PyErr_Format(PyExc_TypeError,
11023 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011024 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011025 return 0;
11026 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011027 if (PyUnicode_READY(obj) < 0)
11028 return 0;
11029 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011030 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011031 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011032 return 0;
11033 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011034 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011035 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011036}
11037
INADA Naoki3ae20562017-01-16 20:41:20 +090011038/*[clinic input]
11039str.center as unicode_center
11040
11041 width: Py_ssize_t
11042 fillchar: Py_UCS4 = ' '
11043 /
11044
11045Return a centered string of length width.
11046
11047Padding is done using the specified fill character (default is a space).
11048[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049
11050static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011051unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11052/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011054 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055
Benjamin Petersonbac79492012-01-14 13:34:47 -050011056 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 return NULL;
11058
Victor Stinnerc4b49542011-12-11 22:44:26 +010011059 if (PyUnicode_GET_LENGTH(self) >= width)
11060 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061
Victor Stinnerc4b49542011-12-11 22:44:26 +010011062 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063 left = marg / 2 + (marg & width & 1);
11064
Victor Stinner9310abb2011-10-05 00:59:23 +020011065 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066}
11067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068/* This function assumes that str1 and str2 are readied by the caller. */
11069
Marc-André Lemburge5034372000-08-08 08:04:29 +000011070static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011071unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011072{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011073#define COMPARE(TYPE1, TYPE2) \
11074 do { \
11075 TYPE1* p1 = (TYPE1 *)data1; \
11076 TYPE2* p2 = (TYPE2 *)data2; \
11077 TYPE1* end = p1 + len; \
11078 Py_UCS4 c1, c2; \
11079 for (; p1 != end; p1++, p2++) { \
11080 c1 = *p1; \
11081 c2 = *p2; \
11082 if (c1 != c2) \
11083 return (c1 < c2) ? -1 : 1; \
11084 } \
11085 } \
11086 while (0)
11087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011089 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011090 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 kind1 = PyUnicode_KIND(str1);
11093 kind2 = PyUnicode_KIND(str2);
11094 data1 = PyUnicode_DATA(str1);
11095 data2 = PyUnicode_DATA(str2);
11096 len1 = PyUnicode_GET_LENGTH(str1);
11097 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011098 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011099
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011100 switch(kind1) {
11101 case PyUnicode_1BYTE_KIND:
11102 {
11103 switch(kind2) {
11104 case PyUnicode_1BYTE_KIND:
11105 {
11106 int cmp = memcmp(data1, data2, len);
11107 /* normalize result of memcmp() into the range [-1; 1] */
11108 if (cmp < 0)
11109 return -1;
11110 if (cmp > 0)
11111 return 1;
11112 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011113 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011114 case PyUnicode_2BYTE_KIND:
11115 COMPARE(Py_UCS1, Py_UCS2);
11116 break;
11117 case PyUnicode_4BYTE_KIND:
11118 COMPARE(Py_UCS1, Py_UCS4);
11119 break;
11120 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011121 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011122 }
11123 break;
11124 }
11125 case PyUnicode_2BYTE_KIND:
11126 {
11127 switch(kind2) {
11128 case PyUnicode_1BYTE_KIND:
11129 COMPARE(Py_UCS2, Py_UCS1);
11130 break;
11131 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011132 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011133 COMPARE(Py_UCS2, Py_UCS2);
11134 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011135 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011136 case PyUnicode_4BYTE_KIND:
11137 COMPARE(Py_UCS2, Py_UCS4);
11138 break;
11139 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011140 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011141 }
11142 break;
11143 }
11144 case PyUnicode_4BYTE_KIND:
11145 {
11146 switch(kind2) {
11147 case PyUnicode_1BYTE_KIND:
11148 COMPARE(Py_UCS4, Py_UCS1);
11149 break;
11150 case PyUnicode_2BYTE_KIND:
11151 COMPARE(Py_UCS4, Py_UCS2);
11152 break;
11153 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011154 {
11155#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11156 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11157 /* normalize result of wmemcmp() into the range [-1; 1] */
11158 if (cmp < 0)
11159 return -1;
11160 if (cmp > 0)
11161 return 1;
11162#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011163 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011164#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011165 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011166 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011167 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011168 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011169 }
11170 break;
11171 }
11172 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011173 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011174 }
11175
Victor Stinner770e19e2012-10-04 22:59:45 +020011176 if (len1 == len2)
11177 return 0;
11178 if (len1 < len2)
11179 return -1;
11180 else
11181 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011182
11183#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011184}
11185
Benjamin Peterson621b4302016-09-09 13:54:34 -070011186static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011187unicode_compare_eq(PyObject *str1, PyObject *str2)
11188{
11189 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011190 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011191 Py_ssize_t len;
11192 int cmp;
11193
Victor Stinnere5567ad2012-10-23 02:48:49 +020011194 len = PyUnicode_GET_LENGTH(str1);
11195 if (PyUnicode_GET_LENGTH(str2) != len)
11196 return 0;
11197 kind = PyUnicode_KIND(str1);
11198 if (PyUnicode_KIND(str2) != kind)
11199 return 0;
11200 data1 = PyUnicode_DATA(str1);
11201 data2 = PyUnicode_DATA(str2);
11202
11203 cmp = memcmp(data1, data2, len * kind);
11204 return (cmp == 0);
11205}
11206
11207
Alexander Belopolsky40018472011-02-26 01:02:56 +000011208int
11209PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11212 if (PyUnicode_READY(left) == -1 ||
11213 PyUnicode_READY(right) == -1)
11214 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011215
11216 /* a string is equal to itself */
11217 if (left == right)
11218 return 0;
11219
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011220 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011222 PyErr_Format(PyExc_TypeError,
11223 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011224 Py_TYPE(left)->tp_name,
11225 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226 return -1;
11227}
11228
Martin v. Löwis5b222132007-06-10 09:51:05 +000011229int
11230PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11231{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 Py_ssize_t i;
11233 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011235 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236
Victor Stinner910337b2011-10-03 03:20:16 +020011237 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011238 if (!PyUnicode_IS_READY(uni)) {
11239 const wchar_t *ws = _PyUnicode_WSTR(uni);
11240 /* Compare Unicode string and source character set string */
11241 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11242 if (chr != ustr[i])
11243 return (chr < ustr[i]) ? -1 : 1;
11244 }
11245 /* This check keeps Python strings that end in '\0' from comparing equal
11246 to C strings identical up to that point. */
11247 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11248 return 1; /* uni is longer */
11249 if (ustr[i])
11250 return -1; /* str is longer */
11251 return 0;
11252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011254 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011255 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011256 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011257 size_t len, len2 = strlen(str);
11258 int cmp;
11259
11260 len = Py_MIN(len1, len2);
11261 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011262 if (cmp != 0) {
11263 if (cmp < 0)
11264 return -1;
11265 else
11266 return 1;
11267 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011268 if (len1 > len2)
11269 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011270 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011271 return -1; /* str is longer */
11272 return 0;
11273 }
11274 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011275 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011276 /* Compare Unicode string and source character set string */
11277 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011278 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011279 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11280 /* This check keeps Python strings that end in '\0' from comparing equal
11281 to C strings identical up to that point. */
11282 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11283 return 1; /* uni is longer */
11284 if (str[i])
11285 return -1; /* str is longer */
11286 return 0;
11287 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011288}
11289
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011290static int
11291non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11292{
11293 size_t i, len;
11294 const wchar_t *p;
11295 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11296 if (strlen(str) != len)
11297 return 0;
11298 p = _PyUnicode_WSTR(unicode);
11299 assert(p);
11300 for (i = 0; i < len; i++) {
11301 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011302 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011303 return 0;
11304 }
11305 return 1;
11306}
11307
11308int
11309_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11310{
11311 size_t len;
11312 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011313 assert(str);
11314#ifndef NDEBUG
11315 for (const char *p = str; *p; p++) {
11316 assert((unsigned char)*p < 128);
11317 }
11318#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011319 if (PyUnicode_READY(unicode) == -1) {
11320 /* Memory error or bad data */
11321 PyErr_Clear();
11322 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11323 }
11324 if (!PyUnicode_IS_ASCII(unicode))
11325 return 0;
11326 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11327 return strlen(str) == len &&
11328 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11329}
11330
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011331int
11332_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11333{
11334 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011335
11336 assert(_PyUnicode_CHECK(left));
11337 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011338#ifndef NDEBUG
11339 for (const char *p = right->string; *p; p++) {
11340 assert((unsigned char)*p < 128);
11341 }
11342#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011343
11344 if (PyUnicode_READY(left) == -1) {
11345 /* memory error or bad data */
11346 PyErr_Clear();
11347 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11348 }
11349
11350 if (!PyUnicode_IS_ASCII(left))
11351 return 0;
11352
11353 right_uni = _PyUnicode_FromId(right); /* borrowed */
11354 if (right_uni == NULL) {
11355 /* memory error or bad data */
11356 PyErr_Clear();
11357 return _PyUnicode_EqualToASCIIString(left, right->string);
11358 }
11359
11360 if (left == right_uni)
11361 return 1;
11362
11363 if (PyUnicode_CHECK_INTERNED(left))
11364 return 0;
11365
Victor Stinner607b1022020-05-05 18:50:30 +020011366#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011367 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011368 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011369 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11370 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011371#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011372
11373 return unicode_compare_eq(left, right_uni);
11374}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011375
Alexander Belopolsky40018472011-02-26 01:02:56 +000011376PyObject *
11377PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011378{
11379 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011380
Victor Stinnere5567ad2012-10-23 02:48:49 +020011381 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11382 Py_RETURN_NOTIMPLEMENTED;
11383
11384 if (PyUnicode_READY(left) == -1 ||
11385 PyUnicode_READY(right) == -1)
11386 return NULL;
11387
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011388 if (left == right) {
11389 switch (op) {
11390 case Py_EQ:
11391 case Py_LE:
11392 case Py_GE:
11393 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011394 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011395 case Py_NE:
11396 case Py_LT:
11397 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011398 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011399 default:
11400 PyErr_BadArgument();
11401 return NULL;
11402 }
11403 }
11404 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011405 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011406 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011407 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011408 }
11409 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011410 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011411 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011412 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011413}
11414
Alexander Belopolsky40018472011-02-26 01:02:56 +000011415int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011416_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11417{
11418 return unicode_eq(aa, bb);
11419}
11420
11421int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011422PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011423{
Victor Stinner77282cb2013-04-14 19:22:47 +020011424 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011425 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011427 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011428
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011429 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011430 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011431 "'in <string>' requires string as left operand, not %.100s",
11432 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011433 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011434 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011435 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011436 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011437 if (ensure_unicode(str) < 0)
11438 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011439
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011441 kind2 = PyUnicode_KIND(substr);
11442 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011443 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011445 len2 = PyUnicode_GET_LENGTH(substr);
11446 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011447 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011448 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011449 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011450 if (len2 == 1) {
11451 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11452 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011453 return result;
11454 }
11455 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011456 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011457 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011458 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460
Victor Stinner77282cb2013-04-14 19:22:47 +020011461 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 case PyUnicode_1BYTE_KIND:
11463 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11464 break;
11465 case PyUnicode_2BYTE_KIND:
11466 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11467 break;
11468 case PyUnicode_4BYTE_KIND:
11469 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11470 break;
11471 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011472 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011474
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011475 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011476 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011477 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478
Guido van Rossum403d68b2000-03-13 15:55:09 +000011479 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011480}
11481
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482/* Concat to string or Unicode object giving a new Unicode object. */
11483
Alexander Belopolsky40018472011-02-26 01:02:56 +000011484PyObject *
11485PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011487 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011488 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011489 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011491 if (ensure_unicode(left) < 0)
11492 return NULL;
11493
11494 if (!PyUnicode_Check(right)) {
11495 PyErr_Format(PyExc_TypeError,
11496 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011497 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011498 return NULL;
11499 }
11500 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502
11503 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011504 PyObject *empty = unicode_get_empty(); // Borrowed reference
11505 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011506 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011507 }
11508 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011509 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011510 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011512 left_len = PyUnicode_GET_LENGTH(left);
11513 right_len = PyUnicode_GET_LENGTH(right);
11514 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011515 PyErr_SetString(PyExc_OverflowError,
11516 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011517 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011518 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011519 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011520
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011521 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11522 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011523 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011526 result = PyUnicode_New(new_len, maxchar);
11527 if (result == NULL)
11528 return NULL;
11529 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11530 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11531 assert(_PyUnicode_CheckConsistency(result, 1));
11532 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533}
11534
Walter Dörwald1ab83302007-05-18 17:15:44 +000011535void
Victor Stinner23e56682011-10-03 03:54:37 +020011536PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011537{
Victor Stinner23e56682011-10-03 03:54:37 +020011538 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011539 Py_UCS4 maxchar, maxchar2;
11540 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011541
11542 if (p_left == NULL) {
11543 if (!PyErr_Occurred())
11544 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011545 return;
11546 }
Victor Stinner23e56682011-10-03 03:54:37 +020011547 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011548 if (right == NULL || left == NULL
11549 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011550 if (!PyErr_Occurred())
11551 PyErr_BadInternalCall();
11552 goto error;
11553 }
11554
Benjamin Petersonbac79492012-01-14 13:34:47 -050011555 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011556 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011557 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011558 goto error;
11559
Victor Stinner488fa492011-12-12 00:01:39 +010011560 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011561 PyObject *empty = unicode_get_empty(); // Borrowed reference
11562 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011563 Py_DECREF(left);
11564 Py_INCREF(right);
11565 *p_left = right;
11566 return;
11567 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011568 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011569 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011570 }
Victor Stinner488fa492011-12-12 00:01:39 +010011571
11572 left_len = PyUnicode_GET_LENGTH(left);
11573 right_len = PyUnicode_GET_LENGTH(right);
11574 if (left_len > PY_SSIZE_T_MAX - right_len) {
11575 PyErr_SetString(PyExc_OverflowError,
11576 "strings are too large to concat");
11577 goto error;
11578 }
11579 new_len = left_len + right_len;
11580
11581 if (unicode_modifiable(left)
11582 && PyUnicode_CheckExact(right)
11583 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011584 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11585 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011586 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011587 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011588 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11589 {
11590 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011591 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011592 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011593
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011594 /* copy 'right' into the newly allocated area of 'left' */
11595 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011596 }
Victor Stinner488fa492011-12-12 00:01:39 +010011597 else {
11598 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11599 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011600 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011601
Victor Stinner488fa492011-12-12 00:01:39 +010011602 /* Concat the two Unicode strings */
11603 res = PyUnicode_New(new_len, maxchar);
11604 if (res == NULL)
11605 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011606 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11607 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011608 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011609 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011610 }
11611 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011612 return;
11613
11614error:
Victor Stinner488fa492011-12-12 00:01:39 +010011615 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011616}
11617
11618void
11619PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11620{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011621 PyUnicode_Append(pleft, right);
11622 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011623}
11624
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011625/*
11626Wraps stringlib_parse_args_finds() and additionally ensures that the
11627first argument is a unicode object.
11628*/
11629
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011630static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011631parse_args_finds_unicode(const char * function_name, PyObject *args,
11632 PyObject **substring,
11633 Py_ssize_t *start, Py_ssize_t *end)
11634{
11635 if(stringlib_parse_args_finds(function_name, args, substring,
11636 start, end)) {
11637 if (ensure_unicode(*substring) < 0)
11638 return 0;
11639 return 1;
11640 }
11641 return 0;
11642}
11643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011644PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011645 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011647Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011648string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011649interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650
11651static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011652unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011654 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011655 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011656 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011658 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011659 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011660 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011662 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011663 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 kind1 = PyUnicode_KIND(self);
11666 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011667 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011668 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011669
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 len1 = PyUnicode_GET_LENGTH(self);
11671 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011673 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011674 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011675
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011676 buf1 = PyUnicode_DATA(self);
11677 buf2 = PyUnicode_DATA(substring);
11678 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011679 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011680 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011681 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011682 }
11683 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 case PyUnicode_1BYTE_KIND:
11685 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011686 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 buf2, len2, PY_SSIZE_T_MAX
11688 );
11689 break;
11690 case PyUnicode_2BYTE_KIND:
11691 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011692 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 buf2, len2, PY_SSIZE_T_MAX
11694 );
11695 break;
11696 case PyUnicode_4BYTE_KIND:
11697 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011698 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 buf2, len2, PY_SSIZE_T_MAX
11700 );
11701 break;
11702 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011703 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 }
11705
11706 result = PyLong_FromSsize_t(iresult);
11707
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011708 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011709 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011710 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712 return result;
11713}
11714
INADA Naoki3ae20562017-01-16 20:41:20 +090011715/*[clinic input]
11716str.encode as unicode_encode
11717
11718 encoding: str(c_default="NULL") = 'utf-8'
11719 The encoding in which to encode the string.
11720 errors: str(c_default="NULL") = 'strict'
11721 The error handling scheme to use for encoding errors.
11722 The default is 'strict' meaning that encoding errors raise a
11723 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11724 'xmlcharrefreplace' as well as any other name registered with
11725 codecs.register_error that can handle UnicodeEncodeErrors.
11726
11727Encode the string using the codec registered for encoding.
11728[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729
11730static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011731unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011732/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011734 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011735}
11736
INADA Naoki3ae20562017-01-16 20:41:20 +090011737/*[clinic input]
11738str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739
INADA Naoki3ae20562017-01-16 20:41:20 +090011740 tabsize: int = 8
11741
11742Return a copy where all tab characters are expanded using spaces.
11743
11744If tabsize is not given, a tab size of 8 characters is assumed.
11745[clinic start generated code]*/
11746
11747static PyObject *
11748unicode_expandtabs_impl(PyObject *self, int tabsize)
11749/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011751 Py_ssize_t i, j, line_pos, src_len, incr;
11752 Py_UCS4 ch;
11753 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011754 const void *src_data;
11755 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011756 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011757 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758
Antoine Pitrou22425222011-10-04 19:10:51 +020011759 if (PyUnicode_READY(self) == -1)
11760 return NULL;
11761
Thomas Wouters7e474022000-07-16 12:04:32 +000011762 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011763 src_len = PyUnicode_GET_LENGTH(self);
11764 i = j = line_pos = 0;
11765 kind = PyUnicode_KIND(self);
11766 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011767 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011768 for (; i < src_len; i++) {
11769 ch = PyUnicode_READ(kind, src_data, i);
11770 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011771 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011773 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011775 goto overflow;
11776 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011777 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011778 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011782 goto overflow;
11783 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011785 if (ch == '\n' || ch == '\r')
11786 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011788 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011789 if (!found)
11790 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011791
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011793 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794 if (!u)
11795 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011796 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797
Antoine Pitroue71d5742011-10-04 15:55:09 +020011798 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799
Antoine Pitroue71d5742011-10-04 15:55:09 +020011800 for (; i < src_len; i++) {
11801 ch = PyUnicode_READ(kind, src_data, i);
11802 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011803 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011804 incr = tabsize - (line_pos % tabsize);
11805 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011806 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011807 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011808 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011809 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011811 line_pos++;
11812 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011813 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011814 if (ch == '\n' || ch == '\r')
11815 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011817 }
11818 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011819 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011820
Antoine Pitroue71d5742011-10-04 15:55:09 +020011821 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011822 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824}
11825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011826PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011827 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828\n\
11829Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011830such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831arguments start and end are interpreted as in slice notation.\n\
11832\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011833Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834
11835static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011838 /* initialize variables to prevent gcc warning */
11839 PyObject *substring = NULL;
11840 Py_ssize_t start = 0;
11841 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011842 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011844 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011847 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011848 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011850 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 if (result == -2)
11853 return NULL;
11854
Christian Heimes217cfd12007-12-02 14:31:20 +000011855 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856}
11857
11858static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011859unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011861 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011862 enum PyUnicode_Kind kind;
11863 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011864
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011865 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011866 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011868 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011869 if (PyUnicode_READY(self) == -1) {
11870 return NULL;
11871 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011872 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11873 PyErr_SetString(PyExc_IndexError, "string index out of range");
11874 return NULL;
11875 }
11876 kind = PyUnicode_KIND(self);
11877 data = PyUnicode_DATA(self);
11878 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011879 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880}
11881
Guido van Rossumc2504932007-09-18 19:42:40 +000011882/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011883 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011884static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011885unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011887 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011888
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011889#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011890 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011891#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 if (_PyUnicode_HASH(self) != -1)
11893 return _PyUnicode_HASH(self);
11894 if (PyUnicode_READY(self) == -1)
11895 return -1;
animalizea1d14252019-01-02 20:16:06 +080011896
Christian Heimes985ecdc2013-11-20 11:46:18 +010011897 x = _Py_HashBytes(PyUnicode_DATA(self),
11898 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011900 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901}
11902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011903PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905\n\
oldkaa0735f2018-02-02 16:52:55 +080011906Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011907such that sub is contained within S[start:end]. Optional\n\
11908arguments start and end are interpreted as in slice notation.\n\
11909\n\
11910Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911
11912static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011915 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011916 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011917 PyObject *substring = NULL;
11918 Py_ssize_t start = 0;
11919 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011921 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011924 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011927 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 if (result == -2)
11930 return NULL;
11931
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932 if (result < 0) {
11933 PyErr_SetString(PyExc_ValueError, "substring not found");
11934 return NULL;
11935 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011936
Christian Heimes217cfd12007-12-02 14:31:20 +000011937 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938}
11939
INADA Naoki3ae20562017-01-16 20:41:20 +090011940/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011941str.isascii as unicode_isascii
11942
11943Return True if all characters in the string are ASCII, False otherwise.
11944
11945ASCII characters have code points in the range U+0000-U+007F.
11946Empty string is ASCII too.
11947[clinic start generated code]*/
11948
11949static PyObject *
11950unicode_isascii_impl(PyObject *self)
11951/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11952{
11953 if (PyUnicode_READY(self) == -1) {
11954 return NULL;
11955 }
11956 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11957}
11958
11959/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011960str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961
INADA Naoki3ae20562017-01-16 20:41:20 +090011962Return True if the string is a lowercase string, False otherwise.
11963
11964A string is lowercase if all cased characters in the string are lowercase and
11965there is at least one cased character in the string.
11966[clinic start generated code]*/
11967
11968static PyObject *
11969unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011970/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 Py_ssize_t i, length;
11973 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011974 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975 int cased;
11976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 if (PyUnicode_READY(self) == -1)
11978 return NULL;
11979 length = PyUnicode_GET_LENGTH(self);
11980 kind = PyUnicode_KIND(self);
11981 data = PyUnicode_DATA(self);
11982
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 if (length == 1)
11985 return PyBool_FromLong(
11986 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011988 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011990 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011991
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 for (i = 0; i < length; i++) {
11994 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011995
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011997 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011998 else if (!cased && Py_UNICODE_ISLOWER(ch))
11999 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012001 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002}
12003
INADA Naoki3ae20562017-01-16 20:41:20 +090012004/*[clinic input]
12005str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006
INADA Naoki3ae20562017-01-16 20:41:20 +090012007Return True if the string is an uppercase string, False otherwise.
12008
12009A string is uppercase if all cased characters in the string are uppercase and
12010there is at least one cased character in the string.
12011[clinic start generated code]*/
12012
12013static PyObject *
12014unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012015/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 Py_ssize_t i, length;
12018 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012019 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020 int cased;
12021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 if (PyUnicode_READY(self) == -1)
12023 return NULL;
12024 length = PyUnicode_GET_LENGTH(self);
12025 kind = PyUnicode_KIND(self);
12026 data = PyUnicode_DATA(self);
12027
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029 if (length == 1)
12030 return PyBool_FromLong(
12031 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012033 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012035 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012036
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 for (i = 0; i < length; i++) {
12039 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012040
Benjamin Peterson29060642009-01-31 22:14:21 +000012041 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012042 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012043 else if (!cased && Py_UNICODE_ISUPPER(ch))
12044 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012046 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047}
12048
INADA Naoki3ae20562017-01-16 20:41:20 +090012049/*[clinic input]
12050str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051
INADA Naoki3ae20562017-01-16 20:41:20 +090012052Return True if the string is a title-cased string, False otherwise.
12053
12054In a title-cased string, upper- and title-case characters may only
12055follow uncased characters and lowercase characters only cased ones.
12056[clinic start generated code]*/
12057
12058static PyObject *
12059unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012060/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062 Py_ssize_t i, length;
12063 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012064 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065 int cased, previous_is_cased;
12066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 if (PyUnicode_READY(self) == -1)
12068 return NULL;
12069 length = PyUnicode_GET_LENGTH(self);
12070 kind = PyUnicode_KIND(self);
12071 data = PyUnicode_DATA(self);
12072
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 if (length == 1) {
12075 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12076 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12077 (Py_UNICODE_ISUPPER(ch) != 0));
12078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012080 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012082 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012083
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084 cased = 0;
12085 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 for (i = 0; i < length; i++) {
12087 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012088
Benjamin Peterson29060642009-01-31 22:14:21 +000012089 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12090 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012091 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012092 previous_is_cased = 1;
12093 cased = 1;
12094 }
12095 else if (Py_UNICODE_ISLOWER(ch)) {
12096 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012097 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012098 previous_is_cased = 1;
12099 cased = 1;
12100 }
12101 else
12102 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012104 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105}
12106
INADA Naoki3ae20562017-01-16 20:41:20 +090012107/*[clinic input]
12108str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109
INADA Naoki3ae20562017-01-16 20:41:20 +090012110Return True if the string is a whitespace string, False otherwise.
12111
12112A string is whitespace if all characters in the string are whitespace and there
12113is at least one character in the string.
12114[clinic start generated code]*/
12115
12116static PyObject *
12117unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012118/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 Py_ssize_t i, length;
12121 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012122 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123
12124 if (PyUnicode_READY(self) == -1)
12125 return NULL;
12126 length = PyUnicode_GET_LENGTH(self);
12127 kind = PyUnicode_KIND(self);
12128 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 if (length == 1)
12132 return PyBool_FromLong(
12133 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012135 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012137 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 for (i = 0; i < length; i++) {
12140 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012141 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012142 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012144 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145}
12146
INADA Naoki3ae20562017-01-16 20:41:20 +090012147/*[clinic input]
12148str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012149
INADA Naoki3ae20562017-01-16 20:41:20 +090012150Return True if the string is an alphabetic string, False otherwise.
12151
12152A string is alphabetic if all characters in the string are alphabetic and there
12153is at least one character in the string.
12154[clinic start generated code]*/
12155
12156static PyObject *
12157unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012158/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012159{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 Py_ssize_t i, length;
12161 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012162 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163
12164 if (PyUnicode_READY(self) == -1)
12165 return NULL;
12166 length = PyUnicode_GET_LENGTH(self);
12167 kind = PyUnicode_KIND(self);
12168 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012169
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012170 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171 if (length == 1)
12172 return PyBool_FromLong(
12173 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012174
12175 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012177 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 for (i = 0; i < length; i++) {
12180 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012181 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012182 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012183 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012184}
12185
INADA Naoki3ae20562017-01-16 20:41:20 +090012186/*[clinic input]
12187str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012188
INADA Naoki3ae20562017-01-16 20:41:20 +090012189Return True if the string is an alpha-numeric string, False otherwise.
12190
12191A string is alpha-numeric if all characters in the string are alpha-numeric and
12192there is at least one character in the string.
12193[clinic start generated code]*/
12194
12195static PyObject *
12196unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012197/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012198{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012200 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 Py_ssize_t len, i;
12202
12203 if (PyUnicode_READY(self) == -1)
12204 return NULL;
12205
12206 kind = PyUnicode_KIND(self);
12207 data = PyUnicode_DATA(self);
12208 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012209
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012210 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 if (len == 1) {
12212 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12213 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12214 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012215
12216 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012218 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 for (i = 0; i < len; i++) {
12221 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012222 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012223 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012224 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012225 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012226}
12227
INADA Naoki3ae20562017-01-16 20:41:20 +090012228/*[clinic input]
12229str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230
INADA Naoki3ae20562017-01-16 20:41:20 +090012231Return True if the string is a decimal string, False otherwise.
12232
12233A string is a decimal string if all characters in the string are decimal and
12234there is at least one character in the string.
12235[clinic start generated code]*/
12236
12237static PyObject *
12238unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012239/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012241 Py_ssize_t i, length;
12242 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012243 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244
12245 if (PyUnicode_READY(self) == -1)
12246 return NULL;
12247 length = PyUnicode_GET_LENGTH(self);
12248 kind = PyUnicode_KIND(self);
12249 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 if (length == 1)
12253 return PyBool_FromLong(
12254 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012256 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012258 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012260 for (i = 0; i < length; i++) {
12261 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012262 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012264 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265}
12266
INADA Naoki3ae20562017-01-16 20:41:20 +090012267/*[clinic input]
12268str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269
INADA Naoki3ae20562017-01-16 20:41:20 +090012270Return True if the string is a digit string, False otherwise.
12271
12272A string is a digit string if all characters in the string are digits and there
12273is at least one character in the string.
12274[clinic start generated code]*/
12275
12276static PyObject *
12277unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012278/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 Py_ssize_t i, length;
12281 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012282 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283
12284 if (PyUnicode_READY(self) == -1)
12285 return NULL;
12286 length = PyUnicode_GET_LENGTH(self);
12287 kind = PyUnicode_KIND(self);
12288 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 if (length == 1) {
12292 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12293 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12294 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012296 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012298 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 for (i = 0; i < length; i++) {
12301 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012302 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012304 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305}
12306
INADA Naoki3ae20562017-01-16 20:41:20 +090012307/*[clinic input]
12308str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309
INADA Naoki3ae20562017-01-16 20:41:20 +090012310Return True if the string is a numeric string, False otherwise.
12311
12312A string is numeric if all characters in the string are numeric and there is at
12313least one character in the string.
12314[clinic start generated code]*/
12315
12316static PyObject *
12317unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012318/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 Py_ssize_t i, length;
12321 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012322 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323
12324 if (PyUnicode_READY(self) == -1)
12325 return NULL;
12326 length = PyUnicode_GET_LENGTH(self);
12327 kind = PyUnicode_KIND(self);
12328 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329
Guido van Rossumd57fd912000-03-10 22:53:23 +000012330 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 if (length == 1)
12332 return PyBool_FromLong(
12333 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012335 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012337 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 for (i = 0; i < length; i++) {
12340 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012341 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012343 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012344}
12345
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012346Py_ssize_t
12347_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012348{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012350 if (PyUnicode_READY(self) == -1)
12351 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012352
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012353 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012354 if (len == 0) {
12355 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012356 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 }
12358
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012359 int kind = PyUnicode_KIND(self);
12360 const void *data = PyUnicode_DATA(self);
12361 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012362 /* PEP 3131 says that the first character must be in
12363 XID_Start and subsequent characters in XID_Continue,
12364 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012365 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012366 letters, digits, underscore). However, given the current
12367 definition of XID_Start and XID_Continue, it is sufficient
12368 to check just for these, except that _ must be allowed
12369 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012370 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012371 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012372 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012373
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012374 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012375 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012376 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012377 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012378 }
12379 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012380 return i;
12381}
12382
12383int
12384PyUnicode_IsIdentifier(PyObject *self)
12385{
12386 if (PyUnicode_IS_READY(self)) {
12387 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12388 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12389 /* an empty string is not a valid identifier */
12390 return len && i == len;
12391 }
12392 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012393_Py_COMP_DIAG_PUSH
12394_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012395 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012396 if (len == 0) {
12397 /* an empty string is not a valid identifier */
12398 return 0;
12399 }
12400
12401 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012402 Py_UCS4 ch = wstr[i++];
12403#if SIZEOF_WCHAR_T == 2
12404 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12405 && i < len
12406 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12407 {
12408 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12409 i++;
12410 }
12411#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012412 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12413 return 0;
12414 }
12415
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012416 while (i < len) {
12417 ch = wstr[i++];
12418#if SIZEOF_WCHAR_T == 2
12419 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12420 && i < len
12421 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12422 {
12423 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12424 i++;
12425 }
12426#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012427 if (!_PyUnicode_IsXidContinue(ch)) {
12428 return 0;
12429 }
12430 }
12431 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012432_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012433 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012434}
12435
INADA Naoki3ae20562017-01-16 20:41:20 +090012436/*[clinic input]
12437str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012438
INADA Naoki3ae20562017-01-16 20:41:20 +090012439Return True if the string is a valid Python identifier, False otherwise.
12440
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012441Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012442such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012443[clinic start generated code]*/
12444
12445static PyObject *
12446unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012447/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012448{
12449 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12450}
12451
INADA Naoki3ae20562017-01-16 20:41:20 +090012452/*[clinic input]
12453str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012454
INADA Naoki3ae20562017-01-16 20:41:20 +090012455Return True if the string is printable, False otherwise.
12456
12457A string is printable if all of its characters are considered printable in
12458repr() or if it is empty.
12459[clinic start generated code]*/
12460
12461static PyObject *
12462unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012463/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012464{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 Py_ssize_t i, length;
12466 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012467 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468
12469 if (PyUnicode_READY(self) == -1)
12470 return NULL;
12471 length = PyUnicode_GET_LENGTH(self);
12472 kind = PyUnicode_KIND(self);
12473 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012474
12475 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 if (length == 1)
12477 return PyBool_FromLong(
12478 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 for (i = 0; i < length; i++) {
12481 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012482 Py_RETURN_FALSE;
12483 }
12484 }
12485 Py_RETURN_TRUE;
12486}
12487
INADA Naoki3ae20562017-01-16 20:41:20 +090012488/*[clinic input]
12489str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
INADA Naoki3ae20562017-01-16 20:41:20 +090012491 iterable: object
12492 /
12493
12494Concatenate any number of strings.
12495
Martin Panter91a88662017-01-24 00:30:06 +000012496The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012497The result is returned as a new string.
12498
12499Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12500[clinic start generated code]*/
12501
12502static PyObject *
12503unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012504/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505{
INADA Naoki3ae20562017-01-16 20:41:20 +090012506 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507}
12508
Martin v. Löwis18e16552006-02-15 17:27:45 +000012509static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012510unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 if (PyUnicode_READY(self) == -1)
12513 return -1;
12514 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515}
12516
INADA Naoki3ae20562017-01-16 20:41:20 +090012517/*[clinic input]
12518str.ljust as unicode_ljust
12519
12520 width: Py_ssize_t
12521 fillchar: Py_UCS4 = ' '
12522 /
12523
12524Return a left-justified string of length width.
12525
12526Padding is done using the specified fill character (default is a space).
12527[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528
12529static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012530unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12531/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012533 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012534 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535
Victor Stinnerc4b49542011-12-11 22:44:26 +010012536 if (PyUnicode_GET_LENGTH(self) >= width)
12537 return unicode_result_unchanged(self);
12538
12539 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540}
12541
INADA Naoki3ae20562017-01-16 20:41:20 +090012542/*[clinic input]
12543str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544
INADA Naoki3ae20562017-01-16 20:41:20 +090012545Return a copy of the string converted to lowercase.
12546[clinic start generated code]*/
12547
12548static PyObject *
12549unicode_lower_impl(PyObject *self)
12550/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012552 if (PyUnicode_READY(self) == -1)
12553 return NULL;
12554 if (PyUnicode_IS_ASCII(self))
12555 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012556 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557}
12558
/* Strip modes understood by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip mode, indexed by the constants above. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

/* Look up the method name for strip mode `i`. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012567
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012568/* externally visible for str.strip(unicode) */
12569PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012570_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012571{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012572 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 int kind;
12574 Py_ssize_t i, j, len;
12575 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012576 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12579 return NULL;
12580
12581 kind = PyUnicode_KIND(self);
12582 data = PyUnicode_DATA(self);
12583 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012584 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12586 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012587 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012588
Benjamin Peterson14339b62009-01-31 16:36:08 +000012589 i = 0;
12590 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012591 while (i < len) {
12592 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12593 if (!BLOOM(sepmask, ch))
12594 break;
12595 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12596 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012597 i++;
12598 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012599 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012600
Benjamin Peterson14339b62009-01-31 16:36:08 +000012601 j = len;
12602 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012603 j--;
12604 while (j >= i) {
12605 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12606 if (!BLOOM(sepmask, ch))
12607 break;
12608 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12609 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012611 }
12612
Benjamin Peterson29060642009-01-31 22:14:21 +000012613 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012614 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012615
Victor Stinner7931d9a2011-11-04 00:22:48 +010012616 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617}
12618
12619PyObject*
12620PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12621{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012622 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012624 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625
Victor Stinnerde636f32011-10-01 03:55:54 +020012626 if (PyUnicode_READY(self) == -1)
12627 return NULL;
12628
Victor Stinner684d5fd2012-05-03 02:32:34 +020012629 length = PyUnicode_GET_LENGTH(self);
12630 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012631
Victor Stinner684d5fd2012-05-03 02:32:34 +020012632 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012633 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634
Victor Stinnerde636f32011-10-01 03:55:54 +020012635 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012636 PyErr_SetString(PyExc_IndexError, "string index out of range");
12637 return NULL;
12638 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012639 if (start >= length || end < start)
12640 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012641
Victor Stinner684d5fd2012-05-03 02:32:34 +020012642 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012643 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012644 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012645 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012646 }
12647 else {
12648 kind = PyUnicode_KIND(self);
12649 data = PyUnicode_1BYTE_DATA(self);
12650 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012651 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012652 length);
12653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655
12656static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012657do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 Py_ssize_t len, i, j;
12660
12661 if (PyUnicode_READY(self) == -1)
12662 return NULL;
12663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012665
Victor Stinnercc7af722013-04-09 22:39:24 +020012666 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012667 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012668
12669 i = 0;
12670 if (striptype != RIGHTSTRIP) {
12671 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012672 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012673 if (!_Py_ascii_whitespace[ch])
12674 break;
12675 i++;
12676 }
12677 }
12678
12679 j = len;
12680 if (striptype != LEFTSTRIP) {
12681 j--;
12682 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012683 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012684 if (!_Py_ascii_whitespace[ch])
12685 break;
12686 j--;
12687 }
12688 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012689 }
12690 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012691 else {
12692 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012693 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012694
Victor Stinnercc7af722013-04-09 22:39:24 +020012695 i = 0;
12696 if (striptype != RIGHTSTRIP) {
12697 while (i < len) {
12698 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12699 if (!Py_UNICODE_ISSPACE(ch))
12700 break;
12701 i++;
12702 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012703 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012704
12705 j = len;
12706 if (striptype != LEFTSTRIP) {
12707 j--;
12708 while (j >= i) {
12709 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12710 if (!Py_UNICODE_ISSPACE(ch))
12711 break;
12712 j--;
12713 }
12714 j++;
12715 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012716 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012717
Victor Stinner7931d9a2011-11-04 00:22:48 +010012718 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719}
12720
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012721
12722static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012723do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012724{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012725 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012726 if (PyUnicode_Check(sep))
12727 return _PyUnicode_XStrip(self, striptype, sep);
12728 else {
12729 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012730 "%s arg must be None or str",
12731 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012732 return NULL;
12733 }
12734 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012735
Benjamin Peterson14339b62009-01-31 16:36:08 +000012736 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012737}
12738
12739
INADA Naoki3ae20562017-01-16 20:41:20 +090012740/*[clinic input]
12741str.strip as unicode_strip
12742
12743 chars: object = None
12744 /
12745
Zachary Ware09895c22019-10-09 16:09:00 -050012746Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012747
12748If chars is given and not None, remove characters in chars instead.
12749[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012750
12751static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012752unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012753/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012754{
INADA Naoki3ae20562017-01-16 20:41:20 +090012755 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012756}
12757
12758
INADA Naoki3ae20562017-01-16 20:41:20 +090012759/*[clinic input]
12760str.lstrip as unicode_lstrip
12761
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012762 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012763 /
12764
12765Return a copy of the string with leading whitespace removed.
12766
12767If chars is given and not None, remove characters in chars instead.
12768[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012769
12770static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012771unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012772/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012773{
INADA Naoki3ae20562017-01-16 20:41:20 +090012774 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012775}
12776
12777
INADA Naoki3ae20562017-01-16 20:41:20 +090012778/*[clinic input]
12779str.rstrip as unicode_rstrip
12780
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012781 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012782 /
12783
12784Return a copy of the string with trailing whitespace removed.
12785
12786If chars is given and not None, remove characters in chars instead.
12787[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012788
12789static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012790unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012791/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012792{
INADA Naoki3ae20562017-01-16 20:41:20 +090012793 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012794}
12795
12796
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012798unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012800 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012801 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802
Serhiy Storchaka05997252013-01-26 12:14:02 +020012803 if (len < 1)
12804 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805
Victor Stinnerc4b49542011-12-11 22:44:26 +010012806 /* no repeat, return original string */
12807 if (len == 1)
12808 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012809
Benjamin Petersonbac79492012-01-14 13:34:47 -050012810 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 return NULL;
12812
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012813 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012814 PyErr_SetString(PyExc_OverflowError,
12815 "repeated string is too long");
12816 return NULL;
12817 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012819
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012820 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821 if (!u)
12822 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012823 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012826 int kind = PyUnicode_KIND(str);
12827 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012828 if (kind == PyUnicode_1BYTE_KIND) {
12829 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012830 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012831 }
12832 else if (kind == PyUnicode_2BYTE_KIND) {
12833 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012834 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012835 ucs2[n] = fill_char;
12836 } else {
12837 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12838 assert(kind == PyUnicode_4BYTE_KIND);
12839 for (n = 0; n < len; ++n)
12840 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012841 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012842 }
12843 else {
12844 /* number of characters copied this far */
12845 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012846 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012848 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012849 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012850 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012852 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012853 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012854 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855 }
12856
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012857 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012858 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859}
12860
Alexander Belopolsky40018472011-02-26 01:02:56 +000012861PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012862PyUnicode_Replace(PyObject *str,
12863 PyObject *substr,
12864 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012865 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012867 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12868 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012869 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012870 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012871}
12872
INADA Naoki3ae20562017-01-16 20:41:20 +090012873/*[clinic input]
12874str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875
INADA Naoki3ae20562017-01-16 20:41:20 +090012876 old: unicode
12877 new: unicode
12878 count: Py_ssize_t = -1
12879 Maximum number of occurrences to replace.
12880 -1 (the default value) means replace all occurrences.
12881 /
12882
12883Return a copy with all occurrences of substring old replaced by new.
12884
12885If the optional argument count is given, only the first count occurrences are
12886replaced.
12887[clinic start generated code]*/
12888
12889static PyObject *
12890unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12891 Py_ssize_t count)
12892/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012894 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012895 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012896 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897}
12898
sweeneydea81849b2020-04-22 17:05:48 -040012899/*[clinic input]
12900str.removeprefix as unicode_removeprefix
12901
12902 prefix: unicode
12903 /
12904
12905Return a str with the given prefix string removed if present.
12906
12907If the string starts with the prefix string, return string[len(prefix):].
12908Otherwise, return a copy of the original string.
12909[clinic start generated code]*/
12910
12911static PyObject *
12912unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12913/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12914{
12915 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12916 if (match == -1) {
12917 return NULL;
12918 }
12919 if (match) {
12920 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12921 PyUnicode_GET_LENGTH(self));
12922 }
12923 return unicode_result_unchanged(self);
12924}
12925
12926/*[clinic input]
12927str.removesuffix as unicode_removesuffix
12928
12929 suffix: unicode
12930 /
12931
12932Return a str with the given suffix string removed if present.
12933
12934If the string ends with the suffix string and that suffix is not empty,
12935return string[:-len(suffix)]. Otherwise, return a copy of the original
12936string.
12937[clinic start generated code]*/
12938
12939static PyObject *
12940unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12941/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12942{
12943 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12944 if (match == -1) {
12945 return NULL;
12946 }
12947 if (match) {
12948 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12949 - PyUnicode_GET_LENGTH(suffix));
12950 }
12951 return unicode_result_unchanged(self);
12952}
12953
Alexander Belopolsky40018472011-02-26 01:02:56 +000012954static PyObject *
12955unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012956{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012957 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012958 Py_ssize_t isize;
12959 Py_ssize_t osize, squote, dquote, i, o;
12960 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012961 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012962 const void *idata;
12963 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012966 return NULL;
12967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 isize = PyUnicode_GET_LENGTH(unicode);
12969 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012971 /* Compute length of output, quote characters, and
12972 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012973 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012974 max = 127;
12975 squote = dquote = 0;
12976 ikind = PyUnicode_KIND(unicode);
12977 for (i = 0; i < isize; i++) {
12978 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012979 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012981 case '\'': squote++; break;
12982 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012984 incr = 2;
12985 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 default:
12987 /* Fast-path ASCII */
12988 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012989 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012991 ;
12992 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012995 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012997 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012999 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013001 if (osize > PY_SSIZE_T_MAX - incr) {
13002 PyErr_SetString(PyExc_OverflowError,
13003 "string is too long to generate repr");
13004 return NULL;
13005 }
13006 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 }
13008
13009 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013010 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013012 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 if (dquote)
13014 /* Both squote and dquote present. Use squote,
13015 and escape them */
13016 osize += squote;
13017 else
13018 quote = '"';
13019 }
Victor Stinner55c08782013-04-14 18:45:39 +020013020 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021
13022 repr = PyUnicode_New(osize, max);
13023 if (repr == NULL)
13024 return NULL;
13025 okind = PyUnicode_KIND(repr);
13026 odata = PyUnicode_DATA(repr);
13027
13028 PyUnicode_WRITE(okind, odata, 0, quote);
13029 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013030 if (unchanged) {
13031 _PyUnicode_FastCopyCharacters(repr, 1,
13032 unicode, 0,
13033 isize);
13034 }
13035 else {
13036 for (i = 0, o = 1; i < isize; i++) {
13037 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038
Victor Stinner55c08782013-04-14 18:45:39 +020013039 /* Escape quotes and backslashes */
13040 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013041 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013042 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013043 continue;
13044 }
13045
13046 /* Map special whitespace to '\t', \n', '\r' */
13047 if (ch == '\t') {
13048 PyUnicode_WRITE(okind, odata, o++, '\\');
13049 PyUnicode_WRITE(okind, odata, o++, 't');
13050 }
13051 else if (ch == '\n') {
13052 PyUnicode_WRITE(okind, odata, o++, '\\');
13053 PyUnicode_WRITE(okind, odata, o++, 'n');
13054 }
13055 else if (ch == '\r') {
13056 PyUnicode_WRITE(okind, odata, o++, '\\');
13057 PyUnicode_WRITE(okind, odata, o++, 'r');
13058 }
13059
13060 /* Map non-printable US ASCII to '\xhh' */
13061 else if (ch < ' ' || ch == 0x7F) {
13062 PyUnicode_WRITE(okind, odata, o++, '\\');
13063 PyUnicode_WRITE(okind, odata, o++, 'x');
13064 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13065 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13066 }
13067
13068 /* Copy ASCII characters as-is */
13069 else if (ch < 0x7F) {
13070 PyUnicode_WRITE(okind, odata, o++, ch);
13071 }
13072
13073 /* Non-ASCII characters */
13074 else {
13075 /* Map Unicode whitespace and control characters
13076 (categories Z* and C* except ASCII space)
13077 */
13078 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13079 PyUnicode_WRITE(okind, odata, o++, '\\');
13080 /* Map 8-bit characters to '\xhh' */
13081 if (ch <= 0xff) {
13082 PyUnicode_WRITE(okind, odata, o++, 'x');
13083 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13084 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13085 }
13086 /* Map 16-bit characters to '\uxxxx' */
13087 else if (ch <= 0xffff) {
13088 PyUnicode_WRITE(okind, odata, o++, 'u');
13089 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13090 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13091 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13092 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13093 }
13094 /* Map 21-bit characters to '\U00xxxxxx' */
13095 else {
13096 PyUnicode_WRITE(okind, odata, o++, 'U');
13097 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13098 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13099 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13100 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13101 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13102 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13103 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13104 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13105 }
13106 }
13107 /* Copy characters as-is */
13108 else {
13109 PyUnicode_WRITE(okind, odata, o++, ch);
13110 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013111 }
13112 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013114 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013115 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013116 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117}
13118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013119PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013120 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121\n\
13122Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013123such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124arguments start and end are interpreted as in slice notation.\n\
13125\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013126Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127
13128static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013131 /* initialize variables to prevent gcc warning */
13132 PyObject *substring = NULL;
13133 Py_ssize_t start = 0;
13134 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013135 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013137 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013138 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013140 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013141 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013143 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145 if (result == -2)
13146 return NULL;
13147
Christian Heimes217cfd12007-12-02 14:31:20 +000013148 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149}
13150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013151PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013152 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013154Return the highest index in S where substring sub is found,\n\
13155such that sub is contained within S[start:end]. Optional\n\
13156arguments start and end are interpreted as in slice notation.\n\
13157\n\
13158Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159
13160static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013161unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013162{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013163 /* initialize variables to prevent gcc warning */
13164 PyObject *substring = NULL;
13165 Py_ssize_t start = 0;
13166 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013167 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013168
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013169 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013171
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013172 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013173 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013174
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013175 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013177 if (result == -2)
13178 return NULL;
13179
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180 if (result < 0) {
13181 PyErr_SetString(PyExc_ValueError, "substring not found");
13182 return NULL;
13183 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013184
Christian Heimes217cfd12007-12-02 14:31:20 +000013185 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013186}
13187
INADA Naoki3ae20562017-01-16 20:41:20 +090013188/*[clinic input]
13189str.rjust as unicode_rjust
13190
13191 width: Py_ssize_t
13192 fillchar: Py_UCS4 = ' '
13193 /
13194
13195Return a right-justified string of length width.
13196
13197Padding is done using the specified fill character (default is a space).
13198[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199
13200static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013201unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13202/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013204 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013205 return NULL;
13206
Victor Stinnerc4b49542011-12-11 22:44:26 +010013207 if (PyUnicode_GET_LENGTH(self) >= width)
13208 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209
Victor Stinnerc4b49542011-12-11 22:44:26 +010013210 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211}
13212
Alexander Belopolsky40018472011-02-26 01:02:56 +000013213PyObject *
13214PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013216 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013217 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013218
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013219 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013220}
13221
INADA Naoki3ae20562017-01-16 20:41:20 +090013222/*[clinic input]
13223str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013224
INADA Naoki3ae20562017-01-16 20:41:20 +090013225 sep: object = None
13226 The delimiter according which to split the string.
13227 None (the default value) means split according to any whitespace,
13228 and discard empty strings from the result.
13229 maxsplit: Py_ssize_t = -1
13230 Maximum number of splits to do.
13231 -1 (the default value) means no limit.
13232
13233Return a list of the words in the string, using sep as the delimiter string.
13234[clinic start generated code]*/
13235
13236static PyObject *
13237unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13238/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239{
INADA Naoki3ae20562017-01-16 20:41:20 +090013240 if (sep == Py_None)
13241 return split(self, NULL, maxsplit);
13242 if (PyUnicode_Check(sep))
13243 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013244
Victor Stinner998b8062018-09-12 00:23:25 +020013245 PyErr_Format(PyExc_TypeError,
13246 "must be str or None, not %.100s",
13247 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249}
13250
Thomas Wouters477c8d52006-05-27 19:21:47 +000013251PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013252PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013253{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013254 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013255 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013256 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013258
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013259 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013260 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013261
Victor Stinner14f8f022011-10-05 20:58:25 +020013262 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013263 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264 len1 = PyUnicode_GET_LENGTH(str_obj);
13265 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013266 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013267 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013268 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013269 }
13270 buf1 = PyUnicode_DATA(str_obj);
13271 buf2 = PyUnicode_DATA(sep_obj);
13272 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013273 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013274 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013275 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013278 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013279 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013280 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13281 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13282 else
13283 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013284 break;
13285 case PyUnicode_2BYTE_KIND:
13286 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13287 break;
13288 case PyUnicode_4BYTE_KIND:
13289 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13290 break;
13291 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013292 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013294
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013295 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013296 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013297 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013298
13299 return out;
13300}
13301
13302
13303PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013304PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013305{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013306 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013307 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013308 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013309 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013310
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013311 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013312 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013313
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013314 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013315 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316 len1 = PyUnicode_GET_LENGTH(str_obj);
13317 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013318 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013319 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013320 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013321 }
13322 buf1 = PyUnicode_DATA(str_obj);
13323 buf2 = PyUnicode_DATA(sep_obj);
13324 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013325 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013326 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013327 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013329
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013330 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013332 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13333 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13334 else
13335 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013336 break;
13337 case PyUnicode_2BYTE_KIND:
13338 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13339 break;
13340 case PyUnicode_4BYTE_KIND:
13341 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13342 break;
13343 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013344 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013345 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013346
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013347 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013348 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013349 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013350
13351 return out;
13352}
13353
INADA Naoki3ae20562017-01-16 20:41:20 +090013354/*[clinic input]
13355str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013356
INADA Naoki3ae20562017-01-16 20:41:20 +090013357 sep: object
13358 /
13359
13360Partition the string into three parts using the given separator.
13361
13362This will search for the separator in the string. If the separator is found,
13363returns a 3-tuple containing the part before the separator, the separator
13364itself, and the part after it.
13365
13366If the separator is not found, returns a 3-tuple containing the original string
13367and two empty strings.
13368[clinic start generated code]*/
13369
13370static PyObject *
13371unicode_partition(PyObject *self, PyObject *sep)
13372/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013373{
INADA Naoki3ae20562017-01-16 20:41:20 +090013374 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013375}
13376
INADA Naoki3ae20562017-01-16 20:41:20 +090013377/*[clinic input]
13378str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013379
INADA Naoki3ae20562017-01-16 20:41:20 +090013380Partition the string into three parts using the given separator.
13381
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013382This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013383the separator is found, returns a 3-tuple containing the part before the
13384separator, the separator itself, and the part after it.
13385
13386If the separator is not found, returns a 3-tuple containing two empty strings
13387and the original string.
13388[clinic start generated code]*/
13389
13390static PyObject *
13391unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013392/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013393{
INADA Naoki3ae20562017-01-16 20:41:20 +090013394 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013395}
13396
Alexander Belopolsky40018472011-02-26 01:02:56 +000013397PyObject *
13398PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013399{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013400 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013401 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013402
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013403 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013404}
13405
INADA Naoki3ae20562017-01-16 20:41:20 +090013406/*[clinic input]
13407str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013408
INADA Naoki3ae20562017-01-16 20:41:20 +090013409Return a list of the words in the string, using sep as the delimiter string.
13410
13411Splits are done starting at the end of the string and working to the front.
13412[clinic start generated code]*/
13413
13414static PyObject *
13415unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13416/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013417{
INADA Naoki3ae20562017-01-16 20:41:20 +090013418 if (sep == Py_None)
13419 return rsplit(self, NULL, maxsplit);
13420 if (PyUnicode_Check(sep))
13421 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013422
Victor Stinner998b8062018-09-12 00:23:25 +020013423 PyErr_Format(PyExc_TypeError,
13424 "must be str or None, not %.100s",
13425 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013426 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013427}
13428
INADA Naoki3ae20562017-01-16 20:41:20 +090013429/*[clinic input]
13430str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013432 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013433
13434Return a list of the lines in the string, breaking at line boundaries.
13435
13436Line breaks are not included in the resulting list unless keepends is given and
13437true.
13438[clinic start generated code]*/
13439
13440static PyObject *
13441unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013442/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013443{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013444 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445}
13446
13447static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013448PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013450 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013451}
13452
INADA Naoki3ae20562017-01-16 20:41:20 +090013453/*[clinic input]
13454str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013455
INADA Naoki3ae20562017-01-16 20:41:20 +090013456Convert uppercase characters to lowercase and lowercase characters to uppercase.
13457[clinic start generated code]*/
13458
13459static PyObject *
13460unicode_swapcase_impl(PyObject *self)
13461/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013462{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013463 if (PyUnicode_READY(self) == -1)
13464 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013465 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013466}
13467
Larry Hastings61272b72014-01-07 12:41:53 -080013468/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013469
Larry Hastings31826802013-10-19 00:09:25 -070013470@staticmethod
13471str.maketrans as unicode_maketrans
13472
13473 x: object
13474
13475 y: unicode=NULL
13476
13477 z: unicode=NULL
13478
13479 /
13480
13481Return a translation table usable for str.translate().
13482
13483If there is only one argument, it must be a dictionary mapping Unicode
13484ordinals (integers) or characters to Unicode ordinals, strings or None.
13485Character keys will be then converted to ordinals.
13486If there are two arguments, they must be strings of equal length, and
13487in the resulting dictionary, each character in x will be mapped to the
13488character at the same position in y. If there is a third argument, it
13489must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013490[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013491
Larry Hastings31826802013-10-19 00:09:25 -070013492static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013493unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013494/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013495{
Georg Brandlceee0772007-11-27 23:48:05 +000013496 PyObject *new = NULL, *key, *value;
13497 Py_ssize_t i = 0;
13498 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013499
Georg Brandlceee0772007-11-27 23:48:05 +000013500 new = PyDict_New();
13501 if (!new)
13502 return NULL;
13503 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013504 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013505 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013506
Georg Brandlceee0772007-11-27 23:48:05 +000013507 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013508 if (!PyUnicode_Check(x)) {
13509 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13510 "be a string if there is a second argument");
13511 goto err;
13512 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013513 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013514 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13515 "arguments must have equal length");
13516 goto err;
13517 }
13518 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013519 x_kind = PyUnicode_KIND(x);
13520 y_kind = PyUnicode_KIND(y);
13521 x_data = PyUnicode_DATA(x);
13522 y_data = PyUnicode_DATA(y);
13523 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13524 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013525 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013526 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013527 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013528 if (!value) {
13529 Py_DECREF(key);
13530 goto err;
13531 }
Georg Brandlceee0772007-11-27 23:48:05 +000013532 res = PyDict_SetItem(new, key, value);
13533 Py_DECREF(key);
13534 Py_DECREF(value);
13535 if (res < 0)
13536 goto err;
13537 }
13538 /* create entries for deleting chars in z */
13539 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013540 z_kind = PyUnicode_KIND(z);
13541 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013542 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013543 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013544 if (!key)
13545 goto err;
13546 res = PyDict_SetItem(new, key, Py_None);
13547 Py_DECREF(key);
13548 if (res < 0)
13549 goto err;
13550 }
13551 }
13552 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013553 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013554 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013555
Georg Brandlceee0772007-11-27 23:48:05 +000013556 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013557 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013558 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13559 "to maketrans it must be a dict");
13560 goto err;
13561 }
13562 /* copy entries into the new dict, converting string keys to int keys */
13563 while (PyDict_Next(x, &i, &key, &value)) {
13564 if (PyUnicode_Check(key)) {
13565 /* convert string keys to integer keys */
13566 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013567 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013568 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13569 "table must be of length 1");
13570 goto err;
13571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013572 kind = PyUnicode_KIND(key);
13573 data = PyUnicode_DATA(key);
13574 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013575 if (!newkey)
13576 goto err;
13577 res = PyDict_SetItem(new, newkey, value);
13578 Py_DECREF(newkey);
13579 if (res < 0)
13580 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013581 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013582 /* just keep integer keys */
13583 if (PyDict_SetItem(new, key, value) < 0)
13584 goto err;
13585 } else {
13586 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13587 "be strings or integers");
13588 goto err;
13589 }
13590 }
13591 }
13592 return new;
13593 err:
13594 Py_DECREF(new);
13595 return NULL;
13596}
13597
INADA Naoki3ae20562017-01-16 20:41:20 +090013598/*[clinic input]
13599str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013600
INADA Naoki3ae20562017-01-16 20:41:20 +090013601 table: object
13602 Translation table, which must be a mapping of Unicode ordinals to
13603 Unicode ordinals, strings, or None.
13604 /
13605
13606Replace each character in the string using the given translation table.
13607
13608The table must implement lookup/indexing via __getitem__, for instance a
13609dictionary or list. If this operation raises LookupError, the character is
13610left untouched. Characters mapped to None are deleted.
13611[clinic start generated code]*/
13612
13613static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013614unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013615/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013616{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013618}
13619
INADA Naoki3ae20562017-01-16 20:41:20 +090013620/*[clinic input]
13621str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013622
INADA Naoki3ae20562017-01-16 20:41:20 +090013623Return a copy of the string converted to uppercase.
13624[clinic start generated code]*/
13625
13626static PyObject *
13627unicode_upper_impl(PyObject *self)
13628/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013629{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013630 if (PyUnicode_READY(self) == -1)
13631 return NULL;
13632 if (PyUnicode_IS_ASCII(self))
13633 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013634 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013635}
13636
INADA Naoki3ae20562017-01-16 20:41:20 +090013637/*[clinic input]
13638str.zfill as unicode_zfill
13639
13640 width: Py_ssize_t
13641 /
13642
13643Pad a numeric string with zeros on the left, to fill a field of the given width.
13644
13645The string is never truncated.
13646[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013647
13648static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013649unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013650/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013651{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013652 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013653 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013654 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013655 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013656 Py_UCS4 chr;
13657
Benjamin Petersonbac79492012-01-14 13:34:47 -050013658 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013659 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013660
Victor Stinnerc4b49542011-12-11 22:44:26 +010013661 if (PyUnicode_GET_LENGTH(self) >= width)
13662 return unicode_result_unchanged(self);
13663
13664 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013665
13666 u = pad(self, fill, 0, '0');
13667
Walter Dörwald068325e2002-04-15 13:36:47 +000013668 if (u == NULL)
13669 return NULL;
13670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013671 kind = PyUnicode_KIND(u);
13672 data = PyUnicode_DATA(u);
13673 chr = PyUnicode_READ(kind, data, fill);
13674
13675 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013676 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013677 PyUnicode_WRITE(kind, data, 0, chr);
13678 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013679 }
13680
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013681 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013682 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013683}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013684
#if 0
/* Compiled out.  Wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013693PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013694 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013695\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013696Return True if S starts with the specified prefix, False otherwise.\n\
13697With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013698With optional end, stop comparing S at that position.\n\
13699prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013700
13701static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013702unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013703 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013704{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013705 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013706 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013707 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013708 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013709 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013710
Jesus Ceaac451502011-04-20 17:09:23 +020013711 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013712 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013713 if (PyTuple_Check(subobj)) {
13714 Py_ssize_t i;
13715 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013716 substring = PyTuple_GET_ITEM(subobj, i);
13717 if (!PyUnicode_Check(substring)) {
13718 PyErr_Format(PyExc_TypeError,
13719 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013720 "not %.100s",
13721 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013722 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013723 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013724 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013725 if (result == -1)
13726 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013727 if (result) {
13728 Py_RETURN_TRUE;
13729 }
13730 }
13731 /* nothing matched */
13732 Py_RETURN_FALSE;
13733 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013734 if (!PyUnicode_Check(subobj)) {
13735 PyErr_Format(PyExc_TypeError,
13736 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013737 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013738 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013739 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013740 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013741 if (result == -1)
13742 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013743 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013744}
13745
13746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013747PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013748 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013749\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013750Return True if S ends with the specified suffix, False otherwise.\n\
13751With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013752With optional end, stop comparing S at that position.\n\
13753suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013754
13755static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013756unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013757 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013758{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013759 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013760 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013761 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013762 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013763 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013764
Jesus Ceaac451502011-04-20 17:09:23 +020013765 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013766 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013767 if (PyTuple_Check(subobj)) {
13768 Py_ssize_t i;
13769 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013770 substring = PyTuple_GET_ITEM(subobj, i);
13771 if (!PyUnicode_Check(substring)) {
13772 PyErr_Format(PyExc_TypeError,
13773 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013774 "not %.100s",
13775 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013776 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013777 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013778 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013779 if (result == -1)
13780 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013781 if (result) {
13782 Py_RETURN_TRUE;
13783 }
13784 }
13785 Py_RETURN_FALSE;
13786 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013787 if (!PyUnicode_Check(subobj)) {
13788 PyErr_Format(PyExc_TypeError,
13789 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013790 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013791 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013792 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013793 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013794 if (result == -1)
13795 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013796 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013797}
13798
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013799static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013800_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013801{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013802 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13803 writer->data = PyUnicode_DATA(writer->buffer);
13804
13805 if (!writer->readonly) {
13806 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013807 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013808 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013809 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013810 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13811 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13812 writer->kind = PyUnicode_WCHAR_KIND;
13813 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13814
Victor Stinner8f674cc2013-04-17 23:02:17 +020013815 /* Copy-on-write mode: set buffer size to 0 so
13816 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13817 * next write. */
13818 writer->size = 0;
13819 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013820}
13821
Victor Stinnerd3f08822012-05-29 12:57:52 +020013822void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013823_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013824{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013825 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013826
13827 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013828 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013829
13830 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13831 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13832 writer->kind = PyUnicode_WCHAR_KIND;
13833 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013834}
13835
Inada Naoki770847a2019-06-24 12:30:24 +090013836// Initialize _PyUnicodeWriter with initial buffer
13837static inline void
13838_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13839{
13840 memset(writer, 0, sizeof(*writer));
13841 writer->buffer = buffer;
13842 _PyUnicodeWriter_Update(writer);
13843 writer->min_length = writer->size;
13844}
13845
Victor Stinnerd3f08822012-05-29 12:57:52 +020013846int
13847_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13848 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013849{
13850 Py_ssize_t newlen;
13851 PyObject *newbuffer;
13852
Victor Stinner2740e462016-09-06 16:58:36 -070013853 assert(maxchar <= MAX_UNICODE);
13854
Victor Stinnerca9381e2015-09-22 00:58:32 +020013855 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013856 assert((maxchar > writer->maxchar && length >= 0)
13857 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013858
Victor Stinner202fdca2012-05-07 12:47:02 +020013859 if (length > PY_SSIZE_T_MAX - writer->pos) {
13860 PyErr_NoMemory();
13861 return -1;
13862 }
13863 newlen = writer->pos + length;
13864
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013865 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013866
Victor Stinnerd3f08822012-05-29 12:57:52 +020013867 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013868 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013869 if (writer->overallocate
13870 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13871 /* overallocate to limit the number of realloc() */
13872 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013873 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013874 if (newlen < writer->min_length)
13875 newlen = writer->min_length;
13876
Victor Stinnerd3f08822012-05-29 12:57:52 +020013877 writer->buffer = PyUnicode_New(newlen, maxchar);
13878 if (writer->buffer == NULL)
13879 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013880 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013881 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013882 if (writer->overallocate
13883 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13884 /* overallocate to limit the number of realloc() */
13885 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013886 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013887 if (newlen < writer->min_length)
13888 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013889
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013890 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013891 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013892 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013893 newbuffer = PyUnicode_New(newlen, maxchar);
13894 if (newbuffer == NULL)
13895 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013896 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13897 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013898 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013899 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013900 }
13901 else {
13902 newbuffer = resize_compact(writer->buffer, newlen);
13903 if (newbuffer == NULL)
13904 return -1;
13905 }
13906 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013907 }
13908 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013909 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013910 newbuffer = PyUnicode_New(writer->size, maxchar);
13911 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013912 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013913 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13914 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013915 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013916 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013917 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013918 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013919
13920#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013921}
13922
Victor Stinnerca9381e2015-09-22 00:58:32 +020013923int
13924_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13925 enum PyUnicode_Kind kind)
13926{
13927 Py_UCS4 maxchar;
13928
13929 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13930 assert(writer->kind < kind);
13931
13932 switch (kind)
13933 {
13934 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13935 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13936 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13937 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013938 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013939 }
13940
13941 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13942}
13943
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013944static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013945_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013946{
Victor Stinner2740e462016-09-06 16:58:36 -070013947 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013948 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13949 return -1;
13950 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13951 writer->pos++;
13952 return 0;
13953}
13954
13955int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013956_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13957{
13958 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13959}
13960
13961int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013962_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13963{
13964 Py_UCS4 maxchar;
13965 Py_ssize_t len;
13966
13967 if (PyUnicode_READY(str) == -1)
13968 return -1;
13969 len = PyUnicode_GET_LENGTH(str);
13970 if (len == 0)
13971 return 0;
13972 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13973 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013974 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013975 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013976 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013977 Py_INCREF(str);
13978 writer->buffer = str;
13979 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013980 writer->pos += len;
13981 return 0;
13982 }
13983 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13984 return -1;
13985 }
13986 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13987 str, 0, len);
13988 writer->pos += len;
13989 return 0;
13990}
13991
Victor Stinnere215d962012-10-06 23:03:36 +020013992int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013993_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13994 Py_ssize_t start, Py_ssize_t end)
13995{
13996 Py_UCS4 maxchar;
13997 Py_ssize_t len;
13998
13999 if (PyUnicode_READY(str) == -1)
14000 return -1;
14001
14002 assert(0 <= start);
14003 assert(end <= PyUnicode_GET_LENGTH(str));
14004 assert(start <= end);
14005
14006 if (end == 0)
14007 return 0;
14008
14009 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14010 return _PyUnicodeWriter_WriteStr(writer, str);
14011
14012 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14013 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14014 else
14015 maxchar = writer->maxchar;
14016 len = end - start;
14017
14018 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14019 return -1;
14020
14021 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14022 str, start, len);
14023 writer->pos += len;
14024 return 0;
14025}
14026
14027int
Victor Stinner4a587072013-11-19 12:54:53 +010014028_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14029 const char *ascii, Py_ssize_t len)
14030{
14031 if (len == -1)
14032 len = strlen(ascii);
14033
Andy Lestere6be9b52020-02-11 20:28:35 -060014034 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014035
14036 if (writer->buffer == NULL && !writer->overallocate) {
14037 PyObject *str;
14038
14039 str = _PyUnicode_FromASCII(ascii, len);
14040 if (str == NULL)
14041 return -1;
14042
14043 writer->readonly = 1;
14044 writer->buffer = str;
14045 _PyUnicodeWriter_Update(writer);
14046 writer->pos += len;
14047 return 0;
14048 }
14049
14050 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14051 return -1;
14052
14053 switch (writer->kind)
14054 {
14055 case PyUnicode_1BYTE_KIND:
14056 {
14057 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14058 Py_UCS1 *data = writer->data;
14059
Christian Heimesf051e432016-09-13 20:22:02 +020014060 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014061 break;
14062 }
14063 case PyUnicode_2BYTE_KIND:
14064 {
14065 _PyUnicode_CONVERT_BYTES(
14066 Py_UCS1, Py_UCS2,
14067 ascii, ascii + len,
14068 (Py_UCS2 *)writer->data + writer->pos);
14069 break;
14070 }
14071 case PyUnicode_4BYTE_KIND:
14072 {
14073 _PyUnicode_CONVERT_BYTES(
14074 Py_UCS1, Py_UCS4,
14075 ascii, ascii + len,
14076 (Py_UCS4 *)writer->data + writer->pos);
14077 break;
14078 }
14079 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014080 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014081 }
14082
14083 writer->pos += len;
14084 return 0;
14085}
14086
14087int
14088_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14089 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014090{
14091 Py_UCS4 maxchar;
14092
Andy Lestere6be9b52020-02-11 20:28:35 -060014093 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014094 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14095 return -1;
14096 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14097 writer->pos += len;
14098 return 0;
14099}
14100
Victor Stinnerd3f08822012-05-29 12:57:52 +020014101PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014102_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014103{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014104 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014105
Victor Stinnerd3f08822012-05-29 12:57:52 +020014106 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014107 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014108 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014109 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014110
14111 str = writer->buffer;
14112 writer->buffer = NULL;
14113
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014114 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014115 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14116 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014117 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014118
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014119 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14120 PyObject *str2;
14121 str2 = resize_compact(str, writer->pos);
14122 if (str2 == NULL) {
14123 Py_DECREF(str);
14124 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014125 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014126 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014127 }
14128
Victor Stinner15a0bd32013-07-08 22:29:55 +020014129 assert(_PyUnicode_CheckConsistency(str, 1));
14130 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014131}
14132
/* Release the writer's buffer reference (if any) and reset the pointer
   to NULL.  No other writer field is touched. */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}
14138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014139#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014140
14141PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014142 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014143\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014144Return a formatted version of S, using substitutions from args and kwargs.\n\
14145The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014146
Eric Smith27bbca62010-11-04 17:06:58 +000014147PyDoc_STRVAR(format_map__doc__,
14148 "S.format_map(mapping) -> str\n\
14149\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014150Return a formatted version of S, using substitutions from mapping.\n\
14151The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014152
INADA Naoki3ae20562017-01-16 20:41:20 +090014153/*[clinic input]
14154str.__format__ as unicode___format__
14155
14156 format_spec: unicode
14157 /
14158
14159Return a formatted version of the string as described by format_spec.
14160[clinic start generated code]*/
14161
Eric Smith4a7d76d2008-05-30 18:10:19 +000014162static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014163unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014164/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014165{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014166 _PyUnicodeWriter writer;
14167 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014168
Victor Stinnerd3f08822012-05-29 12:57:52 +020014169 if (PyUnicode_READY(self) == -1)
14170 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014171 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014172 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14173 self, format_spec, 0,
14174 PyUnicode_GET_LENGTH(format_spec));
14175 if (ret == -1) {
14176 _PyUnicodeWriter_Dealloc(&writer);
14177 return NULL;
14178 }
14179 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014180}
14181
INADA Naoki3ae20562017-01-16 20:41:20 +090014182/*[clinic input]
14183str.__sizeof__ as unicode_sizeof
14184
14185Return the size of the string in memory, in bytes.
14186[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014187
14188static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014189unicode_sizeof_impl(PyObject *self)
14190/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014191{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014192 Py_ssize_t size;
14193
14194 /* If it's a compact object, account for base structure +
14195 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014196 if (PyUnicode_IS_COMPACT_ASCII(self))
14197 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14198 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014199 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014200 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014201 else {
14202 /* If it is a two-block object, account for base object, and
14203 for character block if present. */
14204 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014205 if (_PyUnicode_DATA_ANY(self))
14206 size += (PyUnicode_GET_LENGTH(self) + 1) *
14207 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014208 }
14209 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014210 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014211 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14212 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14213 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14214 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014215
14216 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014217}
14218
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014219static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014220unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014221{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014222 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014223 if (!copy)
14224 return NULL;
14225 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014226}
14227
/* Method table for the str type.

   Entries spelled UNICODE_*_METHODDEF are macros defined elsewhere
   (presumably generated by Argument Clinic -- confirm); the remaining
   entries are written out as explicit PyMethodDef initializers.  The table
   ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format() takes keyword arguments, so its function pointer is cast
       through void(*)(void) before being stored as a PyCFunction. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
14286
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014287static PyObject *
14288unicode_mod(PyObject *v, PyObject *w)
14289{
Brian Curtindfc80e32011-08-10 20:28:54 -050014290 if (!PyUnicode_Check(v))
14291 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014292 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014293}
14294
14295static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014296 0, /*nb_add*/
14297 0, /*nb_subtract*/
14298 0, /*nb_multiply*/
14299 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014300};
14301
Guido van Rossumd57fd912000-03-10 22:53:23 +000014302static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014303 (lenfunc) unicode_length, /* sq_length */
14304 PyUnicode_Concat, /* sq_concat */
14305 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14306 (ssizeargfunc) unicode_getitem, /* sq_item */
14307 0, /* sq_slice */
14308 0, /* sq_ass_item */
14309 0, /* sq_ass_slice */
14310 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014311};
14312
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014313static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014314unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014315{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014316 if (PyUnicode_READY(self) == -1)
14317 return NULL;
14318
Victor Stinnera15e2602020-04-08 02:01:56 +020014319 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014320 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014321 if (i == -1 && PyErr_Occurred())
14322 return NULL;
14323 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014324 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014325 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014326 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014327 Py_ssize_t start, stop, step, slicelength, i;
14328 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014329 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014330 const void *src_data;
14331 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014332 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014333 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014334
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014335 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014336 return NULL;
14337 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014338 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14339 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014340
14341 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014342 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014343 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014344 slicelength == PyUnicode_GET_LENGTH(self)) {
14345 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014346 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014347 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014348 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014349 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014350 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014351 src_kind = PyUnicode_KIND(self);
14352 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014353 if (!PyUnicode_IS_ASCII(self)) {
14354 kind_limit = kind_maxchar_limit(src_kind);
14355 max_char = 0;
14356 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14357 ch = PyUnicode_READ(src_kind, src_data, cur);
14358 if (ch > max_char) {
14359 max_char = ch;
14360 if (max_char >= kind_limit)
14361 break;
14362 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014363 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014364 }
Victor Stinner55c99112011-10-13 01:17:06 +020014365 else
14366 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014367 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014368 if (result == NULL)
14369 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014370 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014371 dest_data = PyUnicode_DATA(result);
14372
14373 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014374 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14375 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014376 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014377 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014378 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014379 } else {
14380 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14381 return NULL;
14382 }
14383}
14384
14385static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014386 (lenfunc)unicode_length, /* mp_length */
14387 (binaryfunc)unicode_subscript, /* mp_subscript */
14388 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014389};
14390
Guido van Rossumd57fd912000-03-10 22:53:23 +000014391
Guido van Rossumd57fd912000-03-10 22:53:23 +000014392/* Helpers for PyUnicode_Format() */
14393
/* State shared by the PyUnicode_Format() helpers while one format string
   is being processed. */
struct unicode_formatter_t {
    /* Argument source: a tuple read with PyTuple_GetItem(), or a single
       object returned as-is when arglen is negative. */
    PyObject *args;
    /* Nonzero when this struct holds a reference on `args` that must be
       released before `args` is replaced. */
    int args_owned;
    /* arglen: number of arguments; a negative value means `args` is one
       object rather than a tuple.  argidx: index of the next argument. */
    Py_ssize_t arglen, argidx;
    /* Mapping used for "%(name)s" lookups; NULL when no mapping is
       available (such lookups then fail with TypeError). */
    PyObject *dict;

    /* Kind of the format string data, as passed to PyUnicode_READ(). */
    enum PyUnicode_Kind fmtkind;
    /* fmtcnt: count decremented as format characters are consumed.
       fmtpos: index of the next character to read from fmtdata. */
    Py_ssize_t fmtcnt, fmtpos;
    /* Raw character data of the format string. */
    const void *fmtdata;
    /* The format string object itself (used to slice out mapping keys). */
    PyObject *fmtstr;

    /* Writer that receives formatted output. */
    _PyUnicodeWriter writer;
};
14407
/* Parsed description of one conversion specifier in a format string. */
struct unicode_format_arg_t {
    /* Character most recently read from the format string; once parsing
       is done this is the conversion code (e.g. passed on as the float
       format code or the integer conversion type). */
    Py_UCS4 ch;
    /* Bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT, ... flags. */
    int flags;
    /* Field width; -1 is tested for by the integer fast path
       (presumably meaning "not specified" -- confirm). */
    Py_ssize_t width;
    /* Precision; a negative value makes formatfloat() fall back to 6. */
    int prec;
    /* NOTE(review): not used by the helpers visible here; appears to be
       set by the caller for numeric conversions -- confirm. */
    int sign;
};
14415
Guido van Rossumd57fd912000-03-10 22:53:23 +000014416static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014417unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014418{
Victor Stinnera47082312012-10-04 02:19:54 +020014419 Py_ssize_t argidx = ctx->argidx;
14420
14421 if (argidx < ctx->arglen) {
14422 ctx->argidx++;
14423 if (ctx->arglen < 0)
14424 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014425 else
Victor Stinnera47082312012-10-04 02:19:54 +020014426 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014427 }
14428 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014429 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014430 return NULL;
14431}
14432
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014433/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014434
Victor Stinnera47082312012-10-04 02:19:54 +020014435/* Format a float into the writer if the writer is not NULL, or into *p_output
14436 otherwise.
14437
14438 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014439static int
Victor Stinnera47082312012-10-04 02:19:54 +020014440formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14441 PyObject **p_output,
14442 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014443{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014444 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014445 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014446 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014447 int prec;
14448 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014449
Guido van Rossumd57fd912000-03-10 22:53:23 +000014450 x = PyFloat_AsDouble(v);
14451 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014452 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014453
Victor Stinnera47082312012-10-04 02:19:54 +020014454 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014455 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014456 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014457
Victor Stinnera47082312012-10-04 02:19:54 +020014458 if (arg->flags & F_ALT)
14459 dtoa_flags = Py_DTSF_ALT;
14460 else
14461 dtoa_flags = 0;
14462 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014463 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014464 return -1;
14465 len = strlen(p);
14466 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014467 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014468 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014469 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014470 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014471 }
14472 else
14473 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014474 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014475 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014476}
14477
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate-form flag, for Python ints.
 * Return value: a new reference to a str object, or NULL with an
 * exception set on error.
 * The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 * The base marker is present only for o, x and X conversions, and only
 * when alt is nonzero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 *      val     object to be converted; must be an int (asserted)
 *      alt     nonzero to keep the base marker (alternate form)
 *      prec    minimum number of digits; 0-fill on left if needed
 *      type    a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign for negative
 * Python ints.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Avoid exceeding SSIZE_T_MAX */
    /* Up to 3 non-digit characters (sign + 2-char base marker) are added
       to prec below, so keep the sum within int range. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* The base-8 and base-16 strings carry a two-character marker. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip the marker by advancing buf; a leading '-' is rewritten at
           the new start.  buf no longer equals the string's data pointer
           after this, which the final conversion step checks for. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* Build the padded text in a temporary bytes object: non-digit
           prefix, then the zero padding, then the digits. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text now lives in a bytes object, or no longer starts at the
       str object's data pointer, build a fresh str from it; otherwise
       resize the str in place when its length changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14623
Ethan Furmandf3ed242014-01-05 06:50:30 -080014624/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014625 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014626 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014627 * -1 and raise an exception on error */
14628static int
Victor Stinnera47082312012-10-04 02:19:54 +020014629mainformatlong(PyObject *v,
14630 struct unicode_format_arg_t *arg,
14631 PyObject **p_output,
14632 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014633{
14634 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014635 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014636
14637 if (!PyNumber_Check(v))
14638 goto wrongtype;
14639
Ethan Furman9ab74802014-03-21 06:38:46 -070014640 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014641 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014642 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014643 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014644 }
14645 else {
14646 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014647 }
14648 if (iobj == NULL ) {
14649 if (PyErr_ExceptionMatches(PyExc_TypeError))
14650 goto wrongtype;
14651 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014652 }
14653 assert(PyLong_Check(iobj));
14654 }
14655 else {
14656 iobj = v;
14657 Py_INCREF(iobj);
14658 }
14659
14660 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014661 && arg->width == -1 && arg->prec == -1
14662 && !(arg->flags & (F_SIGN | F_BLANK))
14663 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014664 {
14665 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014666 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014667 int base;
14668
Victor Stinnera47082312012-10-04 02:19:54 +020014669 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014670 {
14671 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014672 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014673 case 'd':
14674 case 'i':
14675 case 'u':
14676 base = 10;
14677 break;
14678 case 'o':
14679 base = 8;
14680 break;
14681 case 'x':
14682 case 'X':
14683 base = 16;
14684 break;
14685 }
14686
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014687 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14688 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014689 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014690 }
14691 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014692 return 1;
14693 }
14694
Ethan Furmanb95b5612015-01-23 20:05:18 -080014695 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014696 Py_DECREF(iobj);
14697 if (res == NULL)
14698 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014699 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014700 return 0;
14701
14702wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014703 switch(type)
14704 {
14705 case 'o':
14706 case 'x':
14707 case 'X':
14708 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014709 "%%%c format: an integer is required, "
14710 "not %.200s",
14711 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014712 break;
14713 default:
14714 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014715 "%%%c format: a number is required, "
14716 "not %.200s",
14717 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014718 break;
14719 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014720 return -1;
14721}
14722
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014723static Py_UCS4
14724formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014725{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014726 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014727 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014728 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014729 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014730 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014731 goto onError;
14732 }
14733 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014734 int overflow;
14735 long x = PyLong_AsLongAndOverflow(v, &overflow);
14736 if (x == -1 && PyErr_Occurred()) {
14737 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014738 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014739 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014740 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014741 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014742
Victor Stinner8faf8212011-12-08 22:14:11 +010014743 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014744 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014745 PyErr_SetString(PyExc_OverflowError,
14746 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014747 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014748 }
14749
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014750 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014751 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014752
Benjamin Peterson29060642009-01-31 22:14:21 +000014753 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014754 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014755 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014756 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014757}
14758
Victor Stinnera47082312012-10-04 02:19:54 +020014759/* Parse options of an argument: flags, width, precision.
14760 Handle also "%(name)" syntax.
14761
14762 Return 0 if the argument has been formatted into arg->str.
14763 Return 1 if the argument has been written into ctx->writer,
14764 Raise an exception and return -1 on error. */
14765static int
14766unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14767 struct unicode_format_arg_t *arg)
14768{
14769#define FORMAT_READ(ctx) \
14770 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14771
14772 PyObject *v;
14773
Victor Stinnera47082312012-10-04 02:19:54 +020014774 if (arg->ch == '(') {
14775 /* Get argument value from a dictionary. Example: "%(name)s". */
14776 Py_ssize_t keystart;
14777 Py_ssize_t keylen;
14778 PyObject *key;
14779 int pcount = 1;
14780
14781 if (ctx->dict == NULL) {
14782 PyErr_SetString(PyExc_TypeError,
14783 "format requires a mapping");
14784 return -1;
14785 }
14786 ++ctx->fmtpos;
14787 --ctx->fmtcnt;
14788 keystart = ctx->fmtpos;
14789 /* Skip over balanced parentheses */
14790 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14791 arg->ch = FORMAT_READ(ctx);
14792 if (arg->ch == ')')
14793 --pcount;
14794 else if (arg->ch == '(')
14795 ++pcount;
14796 ctx->fmtpos++;
14797 }
14798 keylen = ctx->fmtpos - keystart - 1;
14799 if (ctx->fmtcnt < 0 || pcount > 0) {
14800 PyErr_SetString(PyExc_ValueError,
14801 "incomplete format key");
14802 return -1;
14803 }
14804 key = PyUnicode_Substring(ctx->fmtstr,
14805 keystart, keystart + keylen);
14806 if (key == NULL)
14807 return -1;
14808 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014809 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014810 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014811 }
14812 ctx->args = PyObject_GetItem(ctx->dict, key);
14813 Py_DECREF(key);
14814 if (ctx->args == NULL)
14815 return -1;
14816 ctx->args_owned = 1;
14817 ctx->arglen = -1;
14818 ctx->argidx = -2;
14819 }
14820
14821 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014822 while (--ctx->fmtcnt >= 0) {
14823 arg->ch = FORMAT_READ(ctx);
14824 ctx->fmtpos++;
14825 switch (arg->ch) {
14826 case '-': arg->flags |= F_LJUST; continue;
14827 case '+': arg->flags |= F_SIGN; continue;
14828 case ' ': arg->flags |= F_BLANK; continue;
14829 case '#': arg->flags |= F_ALT; continue;
14830 case '0': arg->flags |= F_ZERO; continue;
14831 }
14832 break;
14833 }
14834
14835 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014836 if (arg->ch == '*') {
14837 v = unicode_format_getnextarg(ctx);
14838 if (v == NULL)
14839 return -1;
14840 if (!PyLong_Check(v)) {
14841 PyErr_SetString(PyExc_TypeError,
14842 "* wants int");
14843 return -1;
14844 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014845 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014846 if (arg->width == -1 && PyErr_Occurred())
14847 return -1;
14848 if (arg->width < 0) {
14849 arg->flags |= F_LJUST;
14850 arg->width = -arg->width;
14851 }
14852 if (--ctx->fmtcnt >= 0) {
14853 arg->ch = FORMAT_READ(ctx);
14854 ctx->fmtpos++;
14855 }
14856 }
14857 else if (arg->ch >= '0' && arg->ch <= '9') {
14858 arg->width = arg->ch - '0';
14859 while (--ctx->fmtcnt >= 0) {
14860 arg->ch = FORMAT_READ(ctx);
14861 ctx->fmtpos++;
14862 if (arg->ch < '0' || arg->ch > '9')
14863 break;
14864 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14865 mixing signed and unsigned comparison. Since arg->ch is between
14866 '0' and '9', casting to int is safe. */
14867 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14868 PyErr_SetString(PyExc_ValueError,
14869 "width too big");
14870 return -1;
14871 }
14872 arg->width = arg->width*10 + (arg->ch - '0');
14873 }
14874 }
14875
14876 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014877 if (arg->ch == '.') {
14878 arg->prec = 0;
14879 if (--ctx->fmtcnt >= 0) {
14880 arg->ch = FORMAT_READ(ctx);
14881 ctx->fmtpos++;
14882 }
14883 if (arg->ch == '*') {
14884 v = unicode_format_getnextarg(ctx);
14885 if (v == NULL)
14886 return -1;
14887 if (!PyLong_Check(v)) {
14888 PyErr_SetString(PyExc_TypeError,
14889 "* wants int");
14890 return -1;
14891 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014892 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014893 if (arg->prec == -1 && PyErr_Occurred())
14894 return -1;
14895 if (arg->prec < 0)
14896 arg->prec = 0;
14897 if (--ctx->fmtcnt >= 0) {
14898 arg->ch = FORMAT_READ(ctx);
14899 ctx->fmtpos++;
14900 }
14901 }
14902 else if (arg->ch >= '0' && arg->ch <= '9') {
14903 arg->prec = arg->ch - '0';
14904 while (--ctx->fmtcnt >= 0) {
14905 arg->ch = FORMAT_READ(ctx);
14906 ctx->fmtpos++;
14907 if (arg->ch < '0' || arg->ch > '9')
14908 break;
14909 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14910 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014911 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014912 return -1;
14913 }
14914 arg->prec = arg->prec*10 + (arg->ch - '0');
14915 }
14916 }
14917 }
14918
14919 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14920 if (ctx->fmtcnt >= 0) {
14921 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14922 if (--ctx->fmtcnt >= 0) {
14923 arg->ch = FORMAT_READ(ctx);
14924 ctx->fmtpos++;
14925 }
14926 }
14927 }
14928 if (ctx->fmtcnt < 0) {
14929 PyErr_SetString(PyExc_ValueError,
14930 "incomplete format");
14931 return -1;
14932 }
14933 return 0;
14934
14935#undef FORMAT_READ
14936}
14937
14938/* Format one argument. Supported conversion specifiers:
14939
14940 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014941 - "i", "d", "u": int or float
14942 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014943 - "e", "E", "f", "F", "g", "G": float
14944 - "c": int or str (1 character)
14945
Victor Stinner8dbd4212012-12-04 09:30:24 +010014946 When possible, the output is written directly into the Unicode writer
14947 (ctx->writer). A string is created when padding is required.
14948
Victor Stinnera47082312012-10-04 02:19:54 +020014949 Return 0 if the argument has been formatted into *p_str,
14950 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014951 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014952static int
14953unicode_format_arg_format(struct unicode_formatter_t *ctx,
14954 struct unicode_format_arg_t *arg,
14955 PyObject **p_str)
14956{
14957 PyObject *v;
14958 _PyUnicodeWriter *writer = &ctx->writer;
14959
14960 if (ctx->fmtcnt == 0)
14961 ctx->writer.overallocate = 0;
14962
Victor Stinnera47082312012-10-04 02:19:54 +020014963 v = unicode_format_getnextarg(ctx);
14964 if (v == NULL)
14965 return -1;
14966
Victor Stinnera47082312012-10-04 02:19:54 +020014967
14968 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014969 case 's':
14970 case 'r':
14971 case 'a':
14972 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14973 /* Fast path */
14974 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14975 return -1;
14976 return 1;
14977 }
14978
14979 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14980 *p_str = v;
14981 Py_INCREF(*p_str);
14982 }
14983 else {
14984 if (arg->ch == 's')
14985 *p_str = PyObject_Str(v);
14986 else if (arg->ch == 'r')
14987 *p_str = PyObject_Repr(v);
14988 else
14989 *p_str = PyObject_ASCII(v);
14990 }
14991 break;
14992
14993 case 'i':
14994 case 'd':
14995 case 'u':
14996 case 'o':
14997 case 'x':
14998 case 'X':
14999 {
15000 int ret = mainformatlong(v, arg, p_str, writer);
15001 if (ret != 0)
15002 return ret;
15003 arg->sign = 1;
15004 break;
15005 }
15006
15007 case 'e':
15008 case 'E':
15009 case 'f':
15010 case 'F':
15011 case 'g':
15012 case 'G':
15013 if (arg->width == -1 && arg->prec == -1
15014 && !(arg->flags & (F_SIGN | F_BLANK)))
15015 {
15016 /* Fast path */
15017 if (formatfloat(v, arg, NULL, writer) == -1)
15018 return -1;
15019 return 1;
15020 }
15021
15022 arg->sign = 1;
15023 if (formatfloat(v, arg, p_str, NULL) == -1)
15024 return -1;
15025 break;
15026
15027 case 'c':
15028 {
15029 Py_UCS4 ch = formatchar(v);
15030 if (ch == (Py_UCS4) -1)
15031 return -1;
15032 if (arg->width == -1 && arg->prec == -1) {
15033 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015034 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015035 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015036 return 1;
15037 }
15038 *p_str = PyUnicode_FromOrdinal(ch);
15039 break;
15040 }
15041
15042 default:
15043 PyErr_Format(PyExc_ValueError,
15044 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015045 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015046 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15047 (int)arg->ch,
15048 ctx->fmtpos - 1);
15049 return -1;
15050 }
15051 if (*p_str == NULL)
15052 return -1;
15053 assert (PyUnicode_Check(*p_str));
15054 return 0;
15055}
15056
15057static int
15058unicode_format_arg_output(struct unicode_formatter_t *ctx,
15059 struct unicode_format_arg_t *arg,
15060 PyObject *str)
15061{
15062 Py_ssize_t len;
15063 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015064 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015065 Py_ssize_t pindex;
15066 Py_UCS4 signchar;
15067 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015068 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015069 Py_ssize_t sublen;
15070 _PyUnicodeWriter *writer = &ctx->writer;
15071 Py_UCS4 fill;
15072
15073 fill = ' ';
15074 if (arg->sign && arg->flags & F_ZERO)
15075 fill = '0';
15076
15077 if (PyUnicode_READY(str) == -1)
15078 return -1;
15079
15080 len = PyUnicode_GET_LENGTH(str);
15081 if ((arg->width == -1 || arg->width <= len)
15082 && (arg->prec == -1 || arg->prec >= len)
15083 && !(arg->flags & (F_SIGN | F_BLANK)))
15084 {
15085 /* Fast path */
15086 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15087 return -1;
15088 return 0;
15089 }
15090
15091 /* Truncate the string for "s", "r" and "a" formats
15092 if the precision is set */
15093 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15094 if (arg->prec >= 0 && len > arg->prec)
15095 len = arg->prec;
15096 }
15097
15098 /* Adjust sign and width */
15099 kind = PyUnicode_KIND(str);
15100 pbuf = PyUnicode_DATA(str);
15101 pindex = 0;
15102 signchar = '\0';
15103 if (arg->sign) {
15104 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15105 if (ch == '-' || ch == '+') {
15106 signchar = ch;
15107 len--;
15108 pindex++;
15109 }
15110 else if (arg->flags & F_SIGN)
15111 signchar = '+';
15112 else if (arg->flags & F_BLANK)
15113 signchar = ' ';
15114 else
15115 arg->sign = 0;
15116 }
15117 if (arg->width < len)
15118 arg->width = len;
15119
15120 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015121 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015122 if (!(arg->flags & F_LJUST)) {
15123 if (arg->sign) {
15124 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015125 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015126 }
15127 else {
15128 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015129 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015130 }
15131 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015132 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15133 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015134 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015135 }
15136
Victor Stinnera47082312012-10-04 02:19:54 +020015137 buflen = arg->width;
15138 if (arg->sign && len == arg->width)
15139 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015140 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015141 return -1;
15142
15143 /* Write the sign if needed */
15144 if (arg->sign) {
15145 if (fill != ' ') {
15146 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15147 writer->pos += 1;
15148 }
15149 if (arg->width > len)
15150 arg->width--;
15151 }
15152
15153 /* Write the numeric prefix for "x", "X" and "o" formats
15154 if the alternate form is used.
15155 For example, write "0x" for the "%#x" format. */
15156 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15157 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15158 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15159 if (fill != ' ') {
15160 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15161 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15162 writer->pos += 2;
15163 pindex += 2;
15164 }
15165 arg->width -= 2;
15166 if (arg->width < 0)
15167 arg->width = 0;
15168 len -= 2;
15169 }
15170
15171 /* Pad left with the fill character if needed */
15172 if (arg->width > len && !(arg->flags & F_LJUST)) {
15173 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015174 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015175 writer->pos += sublen;
15176 arg->width = len;
15177 }
15178
15179 /* If padding with spaces: write sign if needed and/or numeric prefix if
15180 the alternate form is used */
15181 if (fill == ' ') {
15182 if (arg->sign) {
15183 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15184 writer->pos += 1;
15185 }
15186 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15187 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15188 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15189 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15190 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15191 writer->pos += 2;
15192 pindex += 2;
15193 }
15194 }
15195
15196 /* Write characters */
15197 if (len) {
15198 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15199 str, pindex, len);
15200 writer->pos += len;
15201 }
15202
15203 /* Pad right with the fill character if needed */
15204 if (arg->width > len) {
15205 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015206 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015207 writer->pos += sublen;
15208 }
15209 return 0;
15210}
15211
15212/* Helper of PyUnicode_Format(): format one arg.
15213 Return 0 on success, raise an exception and return -1 on error. */
15214static int
15215unicode_format_arg(struct unicode_formatter_t *ctx)
15216{
15217 struct unicode_format_arg_t arg;
15218 PyObject *str;
15219 int ret;
15220
Victor Stinner8dbd4212012-12-04 09:30:24 +010015221 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015222 if (arg.ch == '%') {
15223 ctx->fmtpos++;
15224 ctx->fmtcnt--;
15225 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15226 return -1;
15227 return 0;
15228 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015229 arg.flags = 0;
15230 arg.width = -1;
15231 arg.prec = -1;
15232 arg.sign = 0;
15233 str = NULL;
15234
Victor Stinnera47082312012-10-04 02:19:54 +020015235 ret = unicode_format_arg_parse(ctx, &arg);
15236 if (ret == -1)
15237 return -1;
15238
15239 ret = unicode_format_arg_format(ctx, &arg, &str);
15240 if (ret == -1)
15241 return -1;
15242
15243 if (ret != 1) {
15244 ret = unicode_format_arg_output(ctx, &arg, str);
15245 Py_DECREF(str);
15246 if (ret == -1)
15247 return -1;
15248 }
15249
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015250 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015251 PyErr_SetString(PyExc_TypeError,
15252 "not all arguments converted during string formatting");
15253 return -1;
15254 }
15255 return 0;
15256}
15257
Alexander Belopolsky40018472011-02-26 01:02:56 +000015258PyObject *
15259PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015260{
Victor Stinnera47082312012-10-04 02:19:54 +020015261 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015262
Guido van Rossumd57fd912000-03-10 22:53:23 +000015263 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015264 PyErr_BadInternalCall();
15265 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015266 }
Victor Stinnera47082312012-10-04 02:19:54 +020015267
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015268 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015269 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015270
15271 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015272 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15273 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15274 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15275 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015276
Victor Stinner8f674cc2013-04-17 23:02:17 +020015277 _PyUnicodeWriter_Init(&ctx.writer);
15278 ctx.writer.min_length = ctx.fmtcnt + 100;
15279 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015280
Guido van Rossumd57fd912000-03-10 22:53:23 +000015281 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015282 ctx.arglen = PyTuple_Size(args);
15283 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015284 }
15285 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015286 ctx.arglen = -1;
15287 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015288 }
Victor Stinnera47082312012-10-04 02:19:54 +020015289 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015290 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015291 ctx.dict = args;
15292 else
15293 ctx.dict = NULL;
15294 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015295
Victor Stinnera47082312012-10-04 02:19:54 +020015296 while (--ctx.fmtcnt >= 0) {
15297 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015298 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015299
15300 nonfmtpos = ctx.fmtpos++;
15301 while (ctx.fmtcnt >= 0 &&
15302 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15303 ctx.fmtpos++;
15304 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015305 }
Victor Stinnera47082312012-10-04 02:19:54 +020015306 if (ctx.fmtcnt < 0) {
15307 ctx.fmtpos--;
15308 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015309 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015310
Victor Stinnercfc4c132013-04-03 01:48:39 +020015311 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15312 nonfmtpos, ctx.fmtpos) < 0)
15313 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015314 }
15315 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015316 ctx.fmtpos++;
15317 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015318 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015319 }
15320 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015321
Victor Stinnera47082312012-10-04 02:19:54 +020015322 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015323 PyErr_SetString(PyExc_TypeError,
15324 "not all arguments converted during string formatting");
15325 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015326 }
15327
Victor Stinnera47082312012-10-04 02:19:54 +020015328 if (ctx.args_owned) {
15329 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015330 }
Victor Stinnera47082312012-10-04 02:19:54 +020015331 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015332
Benjamin Peterson29060642009-01-31 22:14:21 +000015333 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015334 _PyUnicodeWriter_Dealloc(&ctx.writer);
15335 if (ctx.args_owned) {
15336 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015337 }
15338 return NULL;
15339}
15340
Jeremy Hylton938ace62002-07-17 16:30:39 +000015341static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015342unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15343
Tim Peters6d6c1a32001-08-02 04:15:00 +000015344static PyObject *
15345unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15346{
Benjamin Peterson29060642009-01-31 22:14:21 +000015347 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015348 static char *kwlist[] = {"object", "encoding", "errors", 0};
15349 char *encoding = NULL;
15350 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015351
Benjamin Peterson14339b62009-01-31 16:36:08 +000015352 if (type != &PyUnicode_Type)
15353 return unicode_subtype_new(type, args, kwds);
15354 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015355 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015356 return NULL;
15357 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015358 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015359 if (encoding == NULL && errors == NULL)
15360 return PyObject_Str(x);
15361 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015362 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015363}
15364
Guido van Rossume023fe02001-08-30 03:12:59 +000015365static PyObject *
15366unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15367{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015368 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015369 Py_ssize_t length, char_size;
15370 int share_wstr, share_utf8;
15371 unsigned int kind;
15372 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015373
Benjamin Peterson14339b62009-01-31 16:36:08 +000015374 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015375
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015376 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015377 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015378 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015379 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015380 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015381 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015382 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015383 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015384
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015385 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015386 if (self == NULL) {
15387 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015388 return NULL;
15389 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015390 kind = PyUnicode_KIND(unicode);
15391 length = PyUnicode_GET_LENGTH(unicode);
15392
15393 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015394#ifdef Py_DEBUG
15395 _PyUnicode_HASH(self) = -1;
15396#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015397 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015398#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015399 _PyUnicode_STATE(self).interned = 0;
15400 _PyUnicode_STATE(self).kind = kind;
15401 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015402 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015403 _PyUnicode_STATE(self).ready = 1;
15404 _PyUnicode_WSTR(self) = NULL;
15405 _PyUnicode_UTF8_LENGTH(self) = 0;
15406 _PyUnicode_UTF8(self) = NULL;
15407 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015408 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015409
15410 share_utf8 = 0;
15411 share_wstr = 0;
15412 if (kind == PyUnicode_1BYTE_KIND) {
15413 char_size = 1;
15414 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15415 share_utf8 = 1;
15416 }
15417 else if (kind == PyUnicode_2BYTE_KIND) {
15418 char_size = 2;
15419 if (sizeof(wchar_t) == 2)
15420 share_wstr = 1;
15421 }
15422 else {
15423 assert(kind == PyUnicode_4BYTE_KIND);
15424 char_size = 4;
15425 if (sizeof(wchar_t) == 4)
15426 share_wstr = 1;
15427 }
15428
15429 /* Ensure we won't overflow the length. */
15430 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15431 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015432 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015433 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015434 data = PyObject_MALLOC((length + 1) * char_size);
15435 if (data == NULL) {
15436 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015437 goto onError;
15438 }
15439
Victor Stinnerc3c74152011-10-02 20:39:55 +020015440 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015441 if (share_utf8) {
15442 _PyUnicode_UTF8_LENGTH(self) = length;
15443 _PyUnicode_UTF8(self) = data;
15444 }
15445 if (share_wstr) {
15446 _PyUnicode_WSTR_LENGTH(self) = length;
15447 _PyUnicode_WSTR(self) = (wchar_t *)data;
15448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015449
Christian Heimesf051e432016-09-13 20:22:02 +020015450 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015451 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015452 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015453#ifdef Py_DEBUG
15454 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15455#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015456 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015457 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015458
15459onError:
15460 Py_DECREF(unicode);
15461 Py_DECREF(self);
15462 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015463}
15464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015465PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015466"str(object='') -> str\n\
15467str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015468\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015469Create a new string object from the given object. If encoding or\n\
15470errors is specified, then the object must expose a data buffer\n\
15471that will be decoded using the given encoding and error handler.\n\
15472Otherwise, returns the result of object.__str__() (if defined)\n\
15473or repr(object).\n\
15474encoding defaults to sys.getdefaultencoding().\n\
15475errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015476
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015477static PyObject *unicode_iter(PyObject *seq);
15478
Guido van Rossumd57fd912000-03-10 22:53:23 +000015479PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015480 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015481 "str", /* tp_name */
15482 sizeof(PyUnicodeObject), /* tp_basicsize */
15483 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015484 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015485 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015486 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015487 0, /* tp_getattr */
15488 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015489 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015490 unicode_repr, /* tp_repr */
15491 &unicode_as_number, /* tp_as_number */
15492 &unicode_as_sequence, /* tp_as_sequence */
15493 &unicode_as_mapping, /* tp_as_mapping */
15494 (hashfunc) unicode_hash, /* tp_hash*/
15495 0, /* tp_call*/
15496 (reprfunc) unicode_str, /* tp_str */
15497 PyObject_GenericGetAttr, /* tp_getattro */
15498 0, /* tp_setattro */
15499 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015500 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015501 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15502 unicode_doc, /* tp_doc */
15503 0, /* tp_traverse */
15504 0, /* tp_clear */
15505 PyUnicode_RichCompare, /* tp_richcompare */
15506 0, /* tp_weaklistoffset */
15507 unicode_iter, /* tp_iter */
15508 0, /* tp_iternext */
15509 unicode_methods, /* tp_methods */
15510 0, /* tp_members */
15511 0, /* tp_getset */
15512 &PyBaseObject_Type, /* tp_base */
15513 0, /* tp_dict */
15514 0, /* tp_descr_get */
15515 0, /* tp_descr_set */
15516 0, /* tp_dictoffset */
15517 0, /* tp_init */
15518 0, /* tp_alloc */
15519 unicode_new, /* tp_new */
15520 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015521};
15522
15523/* Initialize the Unicode implementation */
15524
Victor Stinner331a6a52019-05-27 16:39:22 +020015525PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015526_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015527{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015528 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015529 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015530 0x000A, /* LINE FEED */
15531 0x000D, /* CARRIAGE RETURN */
15532 0x001C, /* FILE SEPARATOR */
15533 0x001D, /* GROUP SEPARATOR */
15534 0x001E, /* RECORD SEPARATOR */
15535 0x0085, /* NEXT LINE */
15536 0x2028, /* LINE SEPARATOR */
15537 0x2029, /* PARAGRAPH SEPARATOR */
15538 };
15539
Victor Stinner91698d82020-06-25 14:07:40 +020015540 struct _Py_unicode_state *state = &tstate->interp->unicode;
15541 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015542 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015543 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015544
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015545 if (_Py_IsMainInterpreter(tstate)) {
15546 /* initialize the linebreak bloom filter */
15547 bloom_linebreak = make_bloom_mask(
15548 PyUnicode_2BYTE_KIND, linebreak,
15549 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015550
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015551 if (PyType_Ready(&PyUnicode_Type) < 0) {
15552 return _PyStatus_ERR("Can't initialize unicode type");
15553 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015554
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015555 if (PyType_Ready(&EncodingMapType) < 0) {
15556 return _PyStatus_ERR("Can't initialize encoding map type");
15557 }
15558 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15559 return _PyStatus_ERR("Can't initialize field name iterator type");
15560 }
15561 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15562 return _PyStatus_ERR("Can't initialize formatter iter type");
15563 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015564 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015565 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015566}
15567
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015568
Walter Dörwald16807132007-05-25 13:52:07 +000015569void
15570PyUnicode_InternInPlace(PyObject **p)
15571{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015572 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015573#ifdef Py_DEBUG
15574 assert(s != NULL);
15575 assert(_PyUnicode_CHECK(s));
15576#else
Victor Stinner607b1022020-05-05 18:50:30 +020015577 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015578 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015579 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015580#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015581
Benjamin Peterson14339b62009-01-31 16:36:08 +000015582 /* If it's a subclass, we don't really know what putting
15583 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015584 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015585 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015586 }
15587
15588 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015589 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015590 }
15591
15592#ifdef INTERNED_STRINGS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015593 if (interned == NULL) {
15594 interned = PyDict_New();
15595 if (interned == NULL) {
15596 PyErr_Clear(); /* Don't leave an exception */
15597 return;
15598 }
15599 }
Victor Stinner607b1022020-05-05 18:50:30 +020015600
15601 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015602 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015603 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015604 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015605
Berker Peksagced8d4c2016-07-25 04:40:39 +030015606 if (t == NULL) {
15607 PyErr_Clear();
15608 return;
15609 }
Victor Stinner607b1022020-05-05 18:50:30 +020015610
Berker Peksagced8d4c2016-07-25 04:40:39 +030015611 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015612 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015613 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015614 return;
15615 }
Victor Stinner607b1022020-05-05 18:50:30 +020015616
Benjamin Peterson14339b62009-01-31 16:36:08 +000015617 /* The two references in interned are not counted by refcnt.
15618 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015619 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015620 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015621#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015622}
15623
15624void
15625PyUnicode_InternImmortal(PyObject **p)
15626{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015627 PyUnicode_InternInPlace(p);
15628 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015629 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015630 Py_INCREF(*p);
15631 }
Walter Dörwald16807132007-05-25 13:52:07 +000015632}
15633
15634PyObject *
15635PyUnicode_InternFromString(const char *cp)
15636{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015637 PyObject *s = PyUnicode_FromString(cp);
15638 if (s == NULL)
15639 return NULL;
15640 PyUnicode_InternInPlace(&s);
15641 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015642}
15643
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015644
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo string interning to help a leak detector.

   Interned strings are not forcibly deallocated.  Each one gets back the
   references that interning removed from its refcount and is flagged as
   not interned; afterwards the `interned` dict is cleared and released. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }

    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    const Py_ssize_t count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %zd interned strings\n", count);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif

    for (Py_ssize_t pos = 0; pos < count; pos++) {
        PyObject *str = PyList_GET_ITEM(keys, pos);
        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }

        const int how = PyUnicode_CHECK_INTERNED(str);
        if (how == SSTATE_INTERNED_IMMORTAL) {
            Py_SET_REFCNT(str, Py_REFCNT(str) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else if (how == SSTATE_INTERNED_MORTAL) {
            Py_SET_REFCNT(str, Py_REFCNT(str) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else {
            /* SSTATE_NOT_INTERNED or an unknown state: a key of the
               interned dict must carry one of the two states above. */
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }

#ifdef INTERNED_STATS
    fprintf(stderr,
            "total size of all interned strings: %zd/%zd mortal/immortal\n",
            mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015704
15705
15706/********************* Unicode Iterator **************************/
15707
15708typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015709 PyObject_HEAD
15710 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015711 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015712} unicodeiterobject;
15713
15714static void
15715unicodeiter_dealloc(unicodeiterobject *it)
15716{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015717 _PyObject_GC_UNTRACK(it);
15718 Py_XDECREF(it->it_seq);
15719 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015720}
15721
15722static int
15723unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15724{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015725 Py_VISIT(it->it_seq);
15726 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015727}
15728
15729static PyObject *
15730unicodeiter_next(unicodeiterobject *it)
15731{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015732 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015733
Benjamin Peterson14339b62009-01-31 16:36:08 +000015734 assert(it != NULL);
15735 seq = it->it_seq;
15736 if (seq == NULL)
15737 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015738 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015740 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15741 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015742 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015743 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15744 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015745 if (item != NULL)
15746 ++it->it_index;
15747 return item;
15748 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015749
Benjamin Peterson14339b62009-01-31 16:36:08 +000015750 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015751 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015752 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015753}
15754
15755static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015756unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015757{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015758 Py_ssize_t len = 0;
15759 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015760 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015761 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015762}
15763
15764PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15765
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015766static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015767unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015768{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015769 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015770 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015771 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015772 it->it_seq, it->it_index);
15773 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015774 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015775 if (u == NULL)
15776 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015777 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015778 }
15779}
15780
15781PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15782
15783static PyObject *
15784unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15785{
15786 Py_ssize_t index = PyLong_AsSsize_t(state);
15787 if (index == -1 && PyErr_Occurred())
15788 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015789 if (it->it_seq != NULL) {
15790 if (index < 0)
15791 index = 0;
15792 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15793 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15794 it->it_index = index;
15795 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015796 Py_RETURN_NONE;
15797}
15798
15799PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15800
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015801static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015802 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015803 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015804 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15805 reduce_doc},
15806 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15807 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015808 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015809};
15810
15811PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015812 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15813 "str_iterator", /* tp_name */
15814 sizeof(unicodeiterobject), /* tp_basicsize */
15815 0, /* tp_itemsize */
15816 /* methods */
15817 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015818 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015819 0, /* tp_getattr */
15820 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015821 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015822 0, /* tp_repr */
15823 0, /* tp_as_number */
15824 0, /* tp_as_sequence */
15825 0, /* tp_as_mapping */
15826 0, /* tp_hash */
15827 0, /* tp_call */
15828 0, /* tp_str */
15829 PyObject_GenericGetAttr, /* tp_getattro */
15830 0, /* tp_setattro */
15831 0, /* tp_as_buffer */
15832 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15833 0, /* tp_doc */
15834 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15835 0, /* tp_clear */
15836 0, /* tp_richcompare */
15837 0, /* tp_weaklistoffset */
15838 PyObject_SelfIter, /* tp_iter */
15839 (iternextfunc)unicodeiter_next, /* tp_iternext */
15840 unicodeiter_methods, /* tp_methods */
15841 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015842};
15843
15844static PyObject *
15845unicode_iter(PyObject *seq)
15846{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015847 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015848
Benjamin Peterson14339b62009-01-31 16:36:08 +000015849 if (!PyUnicode_Check(seq)) {
15850 PyErr_BadInternalCall();
15851 return NULL;
15852 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015853 if (PyUnicode_READY(seq) == -1)
15854 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015855 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15856 if (it == NULL)
15857 return NULL;
15858 it->it_index = 0;
15859 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015860 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015861 _PyObject_GC_TRACK(it);
15862 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015863}
15864
Victor Stinner71133ff2010-09-01 23:43:53 +000015865Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015866PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015867{
Victor Stinner577db2c2011-10-11 22:12:48 +020015868 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015869 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015871 if (!PyUnicode_Check(unicode)) {
15872 PyErr_BadArgument();
15873 return NULL;
15874 }
Inada Naoki2c4928d2020-06-17 20:09:44 +090015875_Py_COMP_DIAG_PUSH
15876_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015877 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Inada Naoki2c4928d2020-06-17 20:09:44 +090015878_Py_COMP_DIAG_POP
Victor Stinner577db2c2011-10-11 22:12:48 +020015879 if (u == NULL)
15880 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015881 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015882 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015883 PyErr_NoMemory();
15884 return NULL;
15885 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015886 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015887 size *= sizeof(Py_UNICODE);
15888 copy = PyMem_Malloc(size);
15889 if (copy == NULL) {
15890 PyErr_NoMemory();
15891 return NULL;
15892 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015893 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015894 return copy;
15895}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015896
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015897
Victor Stinner709d23d2019-05-02 14:56:30 -040015898static int
15899encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015900{
Victor Stinner709d23d2019-05-02 14:56:30 -040015901 int res;
15902 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15903 if (res == -2) {
15904 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15905 return -1;
15906 }
15907 if (res < 0) {
15908 PyErr_NoMemory();
15909 return -1;
15910 }
15911 return 0;
15912}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015913
Victor Stinner709d23d2019-05-02 14:56:30 -040015914
15915static int
15916config_get_codec_name(wchar_t **config_encoding)
15917{
15918 char *encoding;
15919 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15920 return -1;
15921 }
15922
15923 PyObject *name_obj = NULL;
15924 PyObject *codec = _PyCodec_Lookup(encoding);
15925 PyMem_RawFree(encoding);
15926
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015927 if (!codec)
15928 goto error;
15929
15930 name_obj = PyObject_GetAttrString(codec, "name");
15931 Py_CLEAR(codec);
15932 if (!name_obj) {
15933 goto error;
15934 }
15935
Victor Stinner709d23d2019-05-02 14:56:30 -040015936 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15937 Py_DECREF(name_obj);
15938 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015939 goto error;
15940 }
15941
Victor Stinner709d23d2019-05-02 14:56:30 -040015942 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15943 if (raw_wname == NULL) {
15944 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015945 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015946 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015947 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015948
15949 PyMem_RawFree(*config_encoding);
15950 *config_encoding = raw_wname;
15951
15952 PyMem_Free(wname);
15953 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015954
15955error:
15956 Py_XDECREF(codec);
15957 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015958 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015959}
15960
15961
Victor Stinner331a6a52019-05-27 16:39:22 +020015962static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015963init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015964{
Victor Stinner709d23d2019-05-02 14:56:30 -040015965 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020015966 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040015967 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015968 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015969 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015970 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015971 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015972}
15973
15974
Victor Stinner709d23d2019-05-02 14:56:30 -040015975static int
15976init_fs_codec(PyInterpreterState *interp)
15977{
Victor Stinnerda7933e2020-04-13 03:04:28 +020015978 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040015979
15980 _Py_error_handler error_handler;
15981 error_handler = get_error_handler_wide(config->filesystem_errors);
15982 if (error_handler == _Py_ERROR_UNKNOWN) {
15983 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15984 return -1;
15985 }
15986
15987 char *encoding, *errors;
15988 if (encode_wstr_utf8(config->filesystem_encoding,
15989 &encoding,
15990 "filesystem_encoding") < 0) {
15991 return -1;
15992 }
15993
15994 if (encode_wstr_utf8(config->filesystem_errors,
15995 &errors,
15996 "filesystem_errors") < 0) {
15997 PyMem_RawFree(encoding);
15998 return -1;
15999 }
16000
Victor Stinner3d17c042020-05-14 01:48:38 +020016001 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16002 PyMem_RawFree(fs_codec->encoding);
16003 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016004 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016005 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16006 PyMem_RawFree(fs_codec->errors);
16007 fs_codec->errors = errors;
16008 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016009
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016010#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016011 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016012#endif
16013
Victor Stinner709d23d2019-05-02 14:56:30 -040016014 /* At this point, PyUnicode_EncodeFSDefault() and
16015 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16016 the C implementation of the filesystem encoding. */
16017
16018 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16019 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016020 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16021 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016022 PyErr_NoMemory();
16023 return -1;
16024 }
16025 return 0;
16026}
16027
16028
Victor Stinner331a6a52019-05-27 16:39:22 +020016029static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016030init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016031{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016032 PyInterpreterState *interp = tstate->interp;
16033
Victor Stinner709d23d2019-05-02 14:56:30 -040016034 /* Update the filesystem encoding to the normalized Python codec name.
16035 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16036 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016037 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016038 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016039 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016040 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016041 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016042 }
16043
Victor Stinner709d23d2019-05-02 14:56:30 -040016044 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016045 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016046 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016047 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016048}
16049
16050
Victor Stinner331a6a52019-05-27 16:39:22 +020016051PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016052_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016053{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016054 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016055 if (_PyStatus_EXCEPTION(status)) {
16056 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016057 }
16058
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016059 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016060}
16061
16062
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016063static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016064_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016065{
Victor Stinner3d17c042020-05-14 01:48:38 +020016066 PyMem_RawFree(fs_codec->encoding);
16067 fs_codec->encoding = NULL;
16068 fs_codec->utf8 = 0;
16069 PyMem_RawFree(fs_codec->errors);
16070 fs_codec->errors = NULL;
16071 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016072}
16073
16074
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Return the result of init_fs_codec(), or -1 with MemoryError set if the
   new configuration strings cannot be allocated. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both strings before touching the configuration so that a
       failure leaves it unchanged. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16100
16101
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016102void
Victor Stinner3d483342019-11-22 12:27:50 +010016103_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016104{
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016105 struct _Py_unicode_state *state = &tstate->interp->unicode;
16106
16107 int is_main_interp = _Py_IsMainInterpreter(tstate);
16108 if (is_main_interp) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016109#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016110 /* Insure++ is a memory analysis tool that aids in discovering
16111 * memory leaks and other memory problems. On Python exit, the
16112 * interned string dictionaries are flagged as being in use at exit
16113 * (which it is). Under normal circumstances, this is fine because
16114 * the memory will be automatically reclaimed by the system. Under
16115 * memory debugging, it's a huge source of useless noise, so we
16116 * trade off slower shutdown for less distraction in the memory
16117 * reports. -baw
16118 */
16119 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016120#endif /* __INSURE__ */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016121 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016122
Victor Stinner91698d82020-06-25 14:07:40 +020016123 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016124
Victor Stinner2f9ada92020-06-24 02:22:21 +020016125 for (Py_ssize_t i = 0; i < 256; i++) {
16126 Py_CLEAR(state->latin1[i]);
16127 }
16128
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016129 if (is_main_interp) {
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016130 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016131 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016132
Victor Stinner3d17c042020-05-14 01:48:38 +020016133 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016134}
16135
16136
Georg Brandl66c221e2010-10-14 07:04:07 +000016137/* A _string module, to export formatter_parser and formatter_field_name_split
16138 to the string.Formatter class implemented in Python. */
16139
16140static PyMethodDef _string_methods[] = {
16141 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16142 METH_O, PyDoc_STR("split the argument as a field name")},
16143 {"formatter_parser", (PyCFunction) formatter_parser,
16144 METH_O, PyDoc_STR("parse the argument as a format string")},
16145 {NULL, NULL}
16146};
16147
16148static struct PyModuleDef _string_module = {
16149 PyModuleDef_HEAD_INIT,
16150 "_string",
16151 PyDoc_STR("string helper module"),
16152 0,
16153 _string_methods,
16154 NULL,
16155 NULL,
16156 NULL,
16157 NULL
16158};
16159
16160PyMODINIT_FUNC
16161PyInit__string(void)
16162{
16163 return PyModule_Create(&_string_module);
16164}
16165
16166
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016167#ifdef __cplusplus
16168}
16169#endif