blob: 70688c8c013816f6d779243762f37bd3db2b7045 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner47e1afd2020-10-26 16:43:47 +010043#include "pycore_abstract.h" // _PyIndex_Check()
44#include "pycore_bytes_methods.h" // _Py_bytes_lower()
45#include "pycore_initconfig.h" // _PyStatus_OK()
46#include "pycore_interp.h" // PyInterpreterState.fs_codec
47#include "pycore_object.h" // _PyObject_GC_TRACK()
48#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
49#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
50#include "pycore_pystate.h" // _PyInterpreterState_GET()
51#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
52#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000054#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000055#include <windows.h>
56#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000057
Victor Stinner666ecfb2020-07-02 01:19:57 +020058/* Uncomment to display statistics on interned strings at exit
59 in _PyUnicode_ClearInterned(). */
Victor Stinnerfecc4f22019-03-19 14:20:29 +010060/* #define INTERNED_STATS 1 */
61
62
Larry Hastings61272b72014-01-07 12:41:53 -080063/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090064class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080065[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090066/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
67
68/*[python input]
69class Py_UCS4_converter(CConverter):
70 type = 'Py_UCS4'
71 converter = 'convert_uc'
72
73 def converter_init(self):
74 if self.default is not unspecified:
75 self.c_default = ascii(self.default)
76 if len(self.c_default) > 4 or self.c_default[0] != "'":
77 self.c_default = hex(ord(self.default))
78
79[python start generated code]*/
80/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080081
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000082/* --- Globals ------------------------------------------------------------
83
Serhiy Storchaka05997252013-01-26 12:14:02 +020084NOTE: In the interpreter's initialization phase, some globals are currently
85 initialized dynamically as needed. In the process Unicode objects may
86 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* Maximum code point of Unicode 6.0: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10ffff

/* Object check used by the accessor macros below.  Debug builds run
   _PyUnicode_CheckConsistency() with check_content=0; other builds only
   run PyUnicode_Check(). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200103
/* Unchecked access to the utf8 field of the compact object layout. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* Checked access to the UTF-8 bytes.  Asserts that op passes
   _PyUnicode_CHECK() and is ready.  A compact ASCII object yields the
   memory that directly follows its PyASCIIObject header; any other object
   yields its utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((char *)((PyASCIIObject *)(op) + 1)) \
         : _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field of the compact object layout. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Checked access to the UTF-8 length.  Same assertions as PyUnicode_UTF8().
   A compact ASCII object yields its length field; any other object yields
   its utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((PyASCIIObject *)(op))->length \
         : _PyUnicode_UTF8_LENGTH(op))

/* Unchecked access to the wstr field of the PyASCIIObject header. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900122
/* Don't use the deprecated macro of unicodeobject.h: replace it with a local
   definition.

   Evaluates to the length field for a compact ASCII object and to the
   wstr_length field otherwise.  The argument is parenthesized inside each
   cast so that an expression argument (for example "p + 1") is cast as a
   whole instead of having only its first operand cast. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ? \
     ((PyASCIIObject *)(op))->length : \
     ((PyCompactUnicodeObject *)(op))->wstr_length)
/* Raw field accessors.  Each one casts op to the layout that declares the
   field and yields the field itself (usable as an lvalue); no checks are
   performed. */
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)      (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)       (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)        (((PyASCIIObject *)(op))->hash)

/* Same field access as above, preceded by an assertion that op passes
   _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), ((PyASCIIObject *)(op))->length)

/* Raw access to the data.any pointer of the PyUnicodeObject layout. */
#define _PyUnicode_DATA_ANY(op)    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200145
/* File-local replacement for PyUnicode_READY().  Asserts that op passes
   _PyUnicode_CHECK(), then evaluates to 0 if op is already ready and to the
   result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200152
/* True if the utf8 pointer of op is the same pointer as its character data.
   Asserts that op passes _PyUnicode_CHECK() and is not a compact ASCII
   object. */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the same pointer as its character data.
   Asserts that op passes _PyUnicode_CHECK().  Only the macro argument is
   used: the expansion must not depend on a variable named "unicode"
   existing at the point of use. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
160
/* True if the Unicode object has an allocated UTF-8 memory block that is
   not shared with other data: op is not compact ASCII, its utf8 pointer is
   set, and that pointer differs from PyUnicode_DATA(op). */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (!PyUnicode_IS_COMPACT_ASCII(op) \
     && _PyUnicode_UTF8(op) \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object has an allocated wstr memory block that is
   not shared with other data: its wstr pointer is set, and either op is
   not ready or that pointer differs from PyUnicode_DATA(op). */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (_PyUnicode_WSTR(op) \
     && (!PyUnicode_IS_READY(op) \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
174
/* Generic helper macro to convert characters between two integer types.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *".  to should be
   of type "to_type *" and points to the buffer that receives the converted
   characters.

   The bulk of the input is copied four characters per iteration; the
   remaining zero to three characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t _count = _end - _iter;                               \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                     \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {           \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < _end; _iter++, _to++) {                          \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200198
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
206
Victor Stinner607b1022020-05-05 18:50:30 +0200207/* bpo-40521: Interned strings are shared by all interpreters. */
208#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
209# define INTERNED_STRINGS
210#endif
211
Walter Dörwald16807132007-05-25 13:52:07 +0000212/* This dictionary holds all interned unicode strings. Note that references
213 to strings in this dictionary are *not* counted in the string's ob_refcnt.
214 When the interned string reaches a refcnt of 0 the string deallocation
215 function will delete the reference from this dictionary.
216
217 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000218 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000219*/
Victor Stinner607b1022020-05-05 18:50:30 +0200220#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200221static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200222#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000223
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200224static struct _Py_unicode_state*
225get_unicode_state(void)
226{
227 PyInterpreterState *interp = _PyInterpreterState_GET();
228 return &interp->unicode;
229}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200230
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200232// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200233static inline PyObject* unicode_get_empty(void)
234{
235 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200236 // unicode_get_empty() must not be called before _PyUnicode_Init()
237 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200238 assert(state->empty_string != NULL);
239 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200240}
241
Victor Stinner91698d82020-06-25 14:07:40 +0200242
243// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200244static inline PyObject* unicode_new_empty(void)
245{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200246 PyObject *empty = unicode_get_empty();
247 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200248 return empty;
249}
250
251#define _Py_RETURN_UNICODE_EMPTY() \
252 do { \
253 return unicode_new_empty(); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200254 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000255
Victor Stinner59423e32018-11-26 13:40:01 +0100256static inline void
257unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
258 Py_ssize_t start, Py_ssize_t length)
259{
260 assert(0 <= start);
261 assert(kind != PyUnicode_WCHAR_KIND);
262 switch (kind) {
263 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100264 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100265 Py_UCS1 ch = (unsigned char)value;
266 Py_UCS1 *to = (Py_UCS1 *)data + start;
267 memset(to, ch, length);
268 break;
269 }
270 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100271 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100272 Py_UCS2 ch = (Py_UCS2)value;
273 Py_UCS2 *to = (Py_UCS2 *)data + start;
274 const Py_UCS2 *end = to + length;
275 for (; to < end; ++to) *to = ch;
276 break;
277 }
278 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100279 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100280 Py_UCS4 ch = value;
281 Py_UCS4 * to = (Py_UCS4 *)data + start;
282 const Py_UCS4 *end = to + length;
283 for (; to < end; ++to) *to = ch;
284 break;
285 }
286 default: Py_UNREACHABLE();
287 }
288}
289
290
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200291/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700292static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200293_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900294static inline void
295_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400296static PyObject *
297unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
298 const char *errors);
299static PyObject *
300unicode_decode_utf8(const char *s, Py_ssize_t size,
301 _Py_error_handler error_handler, const char *errors,
302 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200303
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200304/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200305static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200306
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by a code point in the range 0..127: the entry is 1
   for the characters listed below and 0 for every other index. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
337
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200338/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200339static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200340static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100341static int unicode_modifiable(PyObject *unicode);
342
Victor Stinnerfe226c02011-10-03 03:52:20 +0200343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100345_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200346static PyObject *
347_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
348static PyObject *
349_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
350
351static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000353 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100354 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000355 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
356
Alexander Belopolsky40018472011-02-26 01:02:56 +0000357static void
358raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300359 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100360 PyObject *unicode,
361 Py_ssize_t startpos, Py_ssize_t endpos,
362 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000363
/* Same for linebreaks: lookup table indexed by a code point in the range
   0..127.  The entry is 1 for the characters listed below and 0 for every
   other index. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
391
INADA Naoki3ae20562017-01-16 20:41:20 +0900392static int convert_uc(PyObject *obj, void *addr);
393
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300394#include "clinic/unicodeobject.c.h"
395
Victor Stinner3d4226a2018-08-29 22:21:32 +0200396_Py_error_handler
397_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200398{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200400 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200401 }
402 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200403 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200404 }
405 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200406 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200407 }
408 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200409 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200410 }
411 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200412 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200413 }
414 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200415 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200416 }
417 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200418 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200419 }
Victor Stinner50149202015-09-22 00:26:54 +0200420 return _Py_ERROR_OTHER;
421}
422
Victor Stinner709d23d2019-05-02 14:56:30 -0400423
424static _Py_error_handler
425get_error_handler_wide(const wchar_t *errors)
426{
427 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428 return _Py_ERROR_STRICT;
429 }
430 if (wcscmp(errors, L"surrogateescape") == 0) {
431 return _Py_ERROR_SURROGATEESCAPE;
432 }
433 if (wcscmp(errors, L"replace") == 0) {
434 return _Py_ERROR_REPLACE;
435 }
436 if (wcscmp(errors, L"ignore") == 0) {
437 return _Py_ERROR_IGNORE;
438 }
439 if (wcscmp(errors, L"backslashreplace") == 0) {
440 return _Py_ERROR_BACKSLASHREPLACE;
441 }
442 if (wcscmp(errors, L"surrogatepass") == 0) {
443 return _Py_ERROR_SURROGATEPASS;
444 }
445 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446 return _Py_ERROR_XMLCHARREFREPLACE;
447 }
448 return _Py_ERROR_OTHER;
449}
450
451
Victor Stinner22eb6892019-06-26 00:51:05 +0200452static inline int
453unicode_check_encoding_errors(const char *encoding, const char *errors)
454{
455 if (encoding == NULL && errors == NULL) {
456 return 0;
457 }
458
Victor Stinner81a7be32020-04-14 15:14:01 +0200459 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200460#ifndef Py_DEBUG
461 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200462 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200463 return 0;
464 }
465#else
466 /* Always check in debug mode */
467#endif
468
469 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200471 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200472 return 0;
473 }
474
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200475 /* Disable checks during Python finalization. For example, it allows to
476 call _PyObject_Dump() during finalization for debugging purpose. */
477 if (interp->finalizing) {
478 return 0;
479 }
480
Victor Stinner22eb6892019-06-26 00:51:05 +0200481 if (encoding != NULL) {
482 PyObject *handler = _PyCodec_Lookup(encoding);
483 if (handler == NULL) {
484 return -1;
485 }
486 Py_DECREF(handler);
487 }
488
489 if (errors != NULL) {
490 PyObject *handler = PyCodec_LookupError(errors);
491 if (handler == NULL) {
492 return -1;
493 }
494 Py_DECREF(handler);
495 }
496 return 0;
497}
498
499
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200500int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100501_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200502{
Victor Stinner68762572019-10-07 18:42:01 +0200503#define CHECK(expr) \
504 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
505
Victor Stinner910337b2011-10-03 03:20:16 +0200506 PyASCIIObject *ascii;
507 unsigned int kind;
508
Victor Stinner68762572019-10-07 18:42:01 +0200509 assert(op != NULL);
510 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200511
512 ascii = (PyASCIIObject *)op;
513 kind = ascii->state.kind;
514
Victor Stinnera3b334d2011-10-03 13:53:37 +0200515 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200516 CHECK(kind == PyUnicode_1BYTE_KIND);
517 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200518 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200519 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200520 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200521 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200522
Victor Stinnera41463c2011-10-04 01:05:08 +0200523 if (ascii->state.compact == 1) {
524 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200526 || kind == PyUnicode_2BYTE_KIND
527 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200528 CHECK(ascii->state.ascii == 0);
529 CHECK(ascii->state.ready == 1);
530 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100531 }
532 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
534
535 data = unicode->data.any;
536 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200537 CHECK(ascii->length == 0);
538 CHECK(ascii->hash == -1);
539 CHECK(ascii->state.compact == 0);
540 CHECK(ascii->state.ascii == 0);
541 CHECK(ascii->state.ready == 0);
542 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
543 CHECK(ascii->wstr != NULL);
544 CHECK(data == NULL);
545 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200546 }
547 else {
Victor Stinner68762572019-10-07 18:42:01 +0200548 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200549 || kind == PyUnicode_2BYTE_KIND
550 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200551 CHECK(ascii->state.compact == 0);
552 CHECK(ascii->state.ready == 1);
553 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200554 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200555 CHECK(compact->utf8 == data);
556 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 }
558 else
Victor Stinner68762572019-10-07 18:42:01 +0200559 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 }
561 }
562 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200563 if (
564#if SIZEOF_WCHAR_T == 2
565 kind == PyUnicode_2BYTE_KIND
566#else
567 kind == PyUnicode_4BYTE_KIND
568#endif
569 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200570 {
Victor Stinner68762572019-10-07 18:42:01 +0200571 CHECK(ascii->wstr == data);
572 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200573 } else
Victor Stinner68762572019-10-07 18:42:01 +0200574 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200575 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200576
577 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200578 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200579 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200580 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200581 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200582
583 /* check that the best kind is used: O(n) operation */
584 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200585 Py_ssize_t i;
586 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300587 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200588 Py_UCS4 ch;
589
590 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 for (i=0; i < ascii->length; i++)
592 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200593 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200594 if (ch > maxchar)
595 maxchar = ch;
596 }
597 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100598 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200599 CHECK(maxchar >= 128);
600 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100601 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200602 else
Victor Stinner68762572019-10-07 18:42:01 +0200603 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200604 }
Victor Stinner77faf692011-11-20 18:56:05 +0100605 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200606 CHECK(maxchar >= 0x100);
607 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100608 }
609 else {
Victor Stinner68762572019-10-07 18:42:01 +0200610 CHECK(maxchar >= 0x10000);
611 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100612 }
Victor Stinner68762572019-10-07 18:42:01 +0200613 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200614 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400615 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200616
617#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400618}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200619
Victor Stinner910337b2011-10-03 03:20:16 +0200620
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100621static PyObject*
622unicode_result_wchar(PyObject *unicode)
623{
624#ifndef Py_DEBUG
625 Py_ssize_t len;
626
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100627 len = _PyUnicode_WSTR_LENGTH(unicode);
628 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100629 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200630 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 }
632
633 if (len == 1) {
634 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100635 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200637 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 }
639 }
640
641 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200642 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100643 return NULL;
644 }
645#else
Victor Stinneraa771272012-10-04 02:32:58 +0200646 assert(Py_REFCNT(unicode) == 1);
647
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100648 /* don't make the result ready in debug mode to ensure that the caller
649 makes the string ready before using it */
650 assert(_PyUnicode_CheckConsistency(unicode, 1));
651#endif
652 return unicode;
653}
654
655static PyObject*
656unicode_result_ready(PyObject *unicode)
657{
658 Py_ssize_t length;
659
660 length = PyUnicode_GET_LENGTH(unicode);
661 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200662 PyObject *empty = unicode_get_empty();
663 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100664 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200665 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100666 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200667 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100668 }
669
670 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200671 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200672 if (kind == PyUnicode_1BYTE_KIND) {
673 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
674 Py_UCS1 ch = data[0];
675 struct _Py_unicode_state *state = get_unicode_state();
676 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100677 if (latin1_char != NULL) {
678 if (unicode != latin1_char) {
679 Py_INCREF(latin1_char);
680 Py_DECREF(unicode);
681 }
682 return latin1_char;
683 }
684 else {
685 assert(_PyUnicode_CheckConsistency(unicode, 1));
686 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200687 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100688 return unicode;
689 }
690 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200691 else {
692 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
693 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100694 }
695
696 assert(_PyUnicode_CheckConsistency(unicode, 1));
697 return unicode;
698}
699
700static PyObject*
701unicode_result(PyObject *unicode)
702{
703 assert(_PyUnicode_CHECK(unicode));
704 if (PyUnicode_IS_READY(unicode))
705 return unicode_result_ready(unicode);
706 else
707 return unicode_result_wchar(unicode);
708}
709
Victor Stinnerc4b49542011-12-11 22:44:26 +0100710static PyObject*
711unicode_result_unchanged(PyObject *unicode)
712{
713 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500714 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100715 return NULL;
716 Py_INCREF(unicode);
717 return unicode;
718 }
719 else
720 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100721 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100722}
723
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200724/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
725 ASCII, Latin1, UTF-8, etc. */
726static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200727backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200728 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
729{
Victor Stinnerad771582015-10-09 12:38:53 +0200730 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200731 Py_UCS4 ch;
732 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300733 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200734
735 assert(PyUnicode_IS_READY(unicode));
736 kind = PyUnicode_KIND(unicode);
737 data = PyUnicode_DATA(unicode);
738
739 size = 0;
740 /* determine replacement size */
741 for (i = collstart; i < collend; ++i) {
742 Py_ssize_t incr;
743
744 ch = PyUnicode_READ(kind, data, i);
745 if (ch < 0x100)
746 incr = 2+2;
747 else if (ch < 0x10000)
748 incr = 2+4;
749 else {
750 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200751 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200752 }
753 if (size > PY_SSIZE_T_MAX - incr) {
754 PyErr_SetString(PyExc_OverflowError,
755 "encoded result is too long for a Python string");
756 return NULL;
757 }
758 size += incr;
759 }
760
Victor Stinnerad771582015-10-09 12:38:53 +0200761 str = _PyBytesWriter_Prepare(writer, str, size);
762 if (str == NULL)
763 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200764
765 /* generate replacement */
766 for (i = collstart; i < collend; ++i) {
767 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200768 *str++ = '\\';
769 if (ch >= 0x00010000) {
770 *str++ = 'U';
771 *str++ = Py_hexdigits[(ch>>28)&0xf];
772 *str++ = Py_hexdigits[(ch>>24)&0xf];
773 *str++ = Py_hexdigits[(ch>>20)&0xf];
774 *str++ = Py_hexdigits[(ch>>16)&0xf];
775 *str++ = Py_hexdigits[(ch>>12)&0xf];
776 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777 }
Victor Stinner797485e2015-10-09 03:17:30 +0200778 else if (ch >= 0x100) {
779 *str++ = 'u';
780 *str++ = Py_hexdigits[(ch>>12)&0xf];
781 *str++ = Py_hexdigits[(ch>>8)&0xf];
782 }
783 else
784 *str++ = 'x';
785 *str++ = Py_hexdigits[(ch>>4)&0xf];
786 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200787 }
788 return str;
789}
790
791/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
792 ASCII, Latin1, UTF-8, etc. */
793static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200794xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200795 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
796{
Victor Stinnerad771582015-10-09 12:38:53 +0200797 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200798 Py_UCS4 ch;
799 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300800 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200801
802 assert(PyUnicode_IS_READY(unicode));
803 kind = PyUnicode_KIND(unicode);
804 data = PyUnicode_DATA(unicode);
805
806 size = 0;
807 /* determine replacement size */
808 for (i = collstart; i < collend; ++i) {
809 Py_ssize_t incr;
810
811 ch = PyUnicode_READ(kind, data, i);
812 if (ch < 10)
813 incr = 2+1+1;
814 else if (ch < 100)
815 incr = 2+2+1;
816 else if (ch < 1000)
817 incr = 2+3+1;
818 else if (ch < 10000)
819 incr = 2+4+1;
820 else if (ch < 100000)
821 incr = 2+5+1;
822 else if (ch < 1000000)
823 incr = 2+6+1;
824 else {
825 assert(ch <= MAX_UNICODE);
826 incr = 2+7+1;
827 }
828 if (size > PY_SSIZE_T_MAX - incr) {
829 PyErr_SetString(PyExc_OverflowError,
830 "encoded result is too long for a Python string");
831 return NULL;
832 }
833 size += incr;
834 }
835
Victor Stinnerad771582015-10-09 12:38:53 +0200836 str = _PyBytesWriter_Prepare(writer, str, size);
837 if (str == NULL)
838 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200839
840 /* generate replacement */
841 for (i = collstart; i < collend; ++i) {
Christian Heimes07f2ade2020-11-18 16:38:53 +0100842 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
843 if (size < 0) {
844 return NULL;
845 }
846 str += size;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200847 }
848 return str;
849}
850
Thomas Wouters477c8d52006-05-27 19:21:47 +0000851/* --- Bloom Filters ----------------------------------------------------- */
852
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order bits
   of each Unicode character (5, 6 or 7 bits, depending on BLOOM_WIDTH) as
   the bit index. */
856
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200857/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000858
Antoine Pitrouf068f942010-01-13 14:19:12 +0000859#if LONG_BIT >= 128
860#define BLOOM_WIDTH 128
861#elif LONG_BIT >= 64
862#define BLOOM_WIDTH 64
863#elif LONG_BIT >= 32
864#define BLOOM_WIDTH 32
865#else
866#error "LONG_BIT is smaller than 32"
867#endif
868
Thomas Wouters477c8d52006-05-27 19:21:47 +0000869#define BLOOM_MASK unsigned long
870
Serhiy Storchaka05997252013-01-26 12:14:02 +0200871static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000872
Antoine Pitrouf068f942010-01-13 14:19:12 +0000873#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000874
Benjamin Peterson29060642009-01-31 22:14:21 +0000875#define BLOOM_LINEBREAK(ch) \
876 ((ch) < 128U ? ascii_linebreak[(ch)] : \
877 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000878
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700879static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300880make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000881{
Victor Stinnera85af502013-04-09 21:53:54 +0200882#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
883 do { \
884 TYPE *data = (TYPE *)PTR; \
885 TYPE *end = data + LEN; \
886 Py_UCS4 ch; \
887 for (; data != end; data++) { \
888 ch = *data; \
889 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
890 } \
891 break; \
892 } while (0)
893
Thomas Wouters477c8d52006-05-27 19:21:47 +0000894 /* calculate simple bloom-style bitmask for a given unicode string */
895
Antoine Pitrouf068f942010-01-13 14:19:12 +0000896 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000897
898 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200899 switch (kind) {
900 case PyUnicode_1BYTE_KIND:
901 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
902 break;
903 case PyUnicode_2BYTE_KIND:
904 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
905 break;
906 case PyUnicode_4BYTE_KIND:
907 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
908 break;
909 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700910 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200911 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000912 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200913
914#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000915}
916
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300917static int
918ensure_unicode(PyObject *obj)
919{
920 if (!PyUnicode_Check(obj)) {
921 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200922 "must be str, not %.100s",
923 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300924 return -1;
925 }
926 return PyUnicode_READY(obj);
927}
928
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200929/* Compilation of templated routines */
930
Victor Stinner90ed8a62020-06-24 00:34:07 +0200931#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200932
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200933#include "stringlib/asciilib.h"
934#include "stringlib/fastsearch.h"
935#include "stringlib/partition.h"
936#include "stringlib/split.h"
937#include "stringlib/count.h"
938#include "stringlib/find.h"
939#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200940#include "stringlib/undef.h"
941
942#include "stringlib/ucs1lib.h"
943#include "stringlib/fastsearch.h"
944#include "stringlib/partition.h"
945#include "stringlib/split.h"
946#include "stringlib/count.h"
947#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300948#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200949#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200950#include "stringlib/undef.h"
951
952#include "stringlib/ucs2lib.h"
953#include "stringlib/fastsearch.h"
954#include "stringlib/partition.h"
955#include "stringlib/split.h"
956#include "stringlib/count.h"
957#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300958#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200959#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200960#include "stringlib/undef.h"
961
962#include "stringlib/ucs4lib.h"
963#include "stringlib/fastsearch.h"
964#include "stringlib/partition.h"
965#include "stringlib/split.h"
966#include "stringlib/count.h"
967#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300968#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200969#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200970#include "stringlib/undef.h"
971
Inada Naoki2c4928d2020-06-17 20:09:44 +0900972_Py_COMP_DIAG_PUSH
973_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200974#include "stringlib/unicodedefs.h"
975#include "stringlib/fastsearch.h"
976#include "stringlib/count.h"
977#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100978#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900979_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200980
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200981#undef STRINGLIB_GET_EMPTY
982
Guido van Rossumd57fd912000-03-10 22:53:23 +0000983/* --- Unicode Object ----------------------------------------------------- */
984
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700985static inline Py_ssize_t
986findchar(const void *s, int kind,
987 Py_ssize_t size, Py_UCS4 ch,
988 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200990 switch (kind) {
991 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 if ((Py_UCS1) ch != ch)
993 return -1;
994 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600995 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200996 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600997 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200998 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 if ((Py_UCS2) ch != ch)
1000 return -1;
1001 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001002 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001003 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001004 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001005 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001006 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001007 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001008 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001009 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001010 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001011 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001013}
1014
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters added by a resize, i.e. indexes
   [old_length, current length), with 0xFF bytes so that code which forgets
   to initialize them is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Does nothing when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    Py_ssize_t char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* Offsets and sizes are in bytes: characters times bytes per char. */
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1033
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034static PyObject*
1035resize_compact(PyObject *unicode, Py_ssize_t length)
1036{
1037 Py_ssize_t char_size;
1038 Py_ssize_t struct_size;
1039 Py_ssize_t new_size;
1040 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001041 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001042#ifdef Py_DEBUG
1043 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1044#endif
1045
Victor Stinner79891572012-05-03 13:43:07 +02001046 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001048 assert(PyUnicode_IS_COMPACT(unicode));
1049
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001050 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001051 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052 struct_size = sizeof(PyASCIIObject);
1053 else
1054 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001055 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056
Victor Stinnerfe226c02011-10-03 03:52:20 +02001057 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1058 PyErr_NoMemory();
1059 return NULL;
1060 }
1061 new_size = (struct_size + (length + 1) * char_size);
1062
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001063 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1064 PyObject_DEL(_PyUnicode_UTF8(unicode));
1065 _PyUnicode_UTF8(unicode) = NULL;
1066 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1067 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001068#ifdef Py_REF_DEBUG
1069 _Py_RefTotal--;
1070#endif
1071#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001072 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001073#endif
Victor Stinner84def372011-12-11 20:04:56 +01001074
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001075 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001076 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001077 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 PyErr_NoMemory();
1079 return NULL;
1080 }
Victor Stinner84def372011-12-11 20:04:56 +01001081 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001083
Victor Stinnerfe226c02011-10-03 03:52:20 +02001084 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001085 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001086 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001087 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001088 _PyUnicode_WSTR_LENGTH(unicode) = length;
1089 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001090 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1091 PyObject_DEL(_PyUnicode_WSTR(unicode));
1092 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001093 if (!PyUnicode_IS_ASCII(unicode))
1094 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001095 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001096#ifdef Py_DEBUG
1097 unicode_fill_invalid(unicode, old_length);
1098#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001099 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1100 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001101 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001102 return unicode;
1103}
1104
Alexander Belopolsky40018472011-02-26 01:02:56 +00001105static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001106resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001107{
Victor Stinner95663112011-10-04 01:03:50 +02001108 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001109 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001111 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001112
Victor Stinnerfe226c02011-10-03 03:52:20 +02001113 if (PyUnicode_IS_READY(unicode)) {
1114 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001115 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001117#ifdef Py_DEBUG
1118 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1119#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001120
1121 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001122 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001123 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1124 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001125
1126 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1127 PyErr_NoMemory();
1128 return -1;
1129 }
1130 new_size = (length + 1) * char_size;
1131
Victor Stinner7a9105a2011-12-12 00:13:42 +01001132 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1133 {
1134 PyObject_DEL(_PyUnicode_UTF8(unicode));
1135 _PyUnicode_UTF8(unicode) = NULL;
1136 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1137 }
1138
Victor Stinnerfe226c02011-10-03 03:52:20 +02001139 data = (PyObject *)PyObject_REALLOC(data, new_size);
1140 if (data == NULL) {
1141 PyErr_NoMemory();
1142 return -1;
1143 }
1144 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001145 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001146 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_WSTR_LENGTH(unicode) = length;
1148 }
1149 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001150 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001151 _PyUnicode_UTF8_LENGTH(unicode) = length;
1152 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001153 _PyUnicode_LENGTH(unicode) = length;
1154 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001155#ifdef Py_DEBUG
1156 unicode_fill_invalid(unicode, old_length);
1157#endif
Victor Stinner95663112011-10-04 01:03:50 +02001158 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001159 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001160 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001161 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001162 }
Victor Stinner95663112011-10-04 01:03:50 +02001163 assert(_PyUnicode_WSTR(unicode) != NULL);
1164
1165 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001166 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001167 PyErr_NoMemory();
1168 return -1;
1169 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001170 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001171 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001172 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001173 if (!wstr) {
1174 PyErr_NoMemory();
1175 return -1;
1176 }
1177 _PyUnicode_WSTR(unicode) = wstr;
1178 _PyUnicode_WSTR(unicode)[length] = 0;
1179 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001180 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 return 0;
1182}
1183
Victor Stinnerfe226c02011-10-03 03:52:20 +02001184static PyObject*
1185resize_copy(PyObject *unicode, Py_ssize_t length)
1186{
1187 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001188 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001189 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001190
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001191 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001192
1193 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1194 if (copy == NULL)
1195 return NULL;
1196
1197 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001198 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001199 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001200 }
1201 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001202 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001203
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001204 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001205 if (w == NULL)
1206 return NULL;
1207 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1208 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001209 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001210 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001211 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001212 }
1213}
1214
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001216 Ux0000 terminated; some code (e.g. new_identifier)
1217 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218
1219 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001220 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221
1222*/
1223
Alexander Belopolsky40018472011-02-26 01:02:56 +00001224static PyUnicodeObject *
1225_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001227 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229
Thomas Wouters477c8d52006-05-27 19:21:47 +00001230 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001231 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001232 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 }
1234
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001235 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001236 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001237 return (PyUnicodeObject *)PyErr_NoMemory();
1238 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239 if (length < 0) {
1240 PyErr_SetString(PyExc_SystemError,
1241 "Negative size passed to _PyUnicode_New");
1242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 }
1244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001245 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1246 if (unicode == NULL)
1247 return NULL;
1248 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001249
1250 _PyUnicode_WSTR_LENGTH(unicode) = length;
1251 _PyUnicode_HASH(unicode) = -1;
1252 _PyUnicode_STATE(unicode).interned = 0;
1253 _PyUnicode_STATE(unicode).kind = 0;
1254 _PyUnicode_STATE(unicode).compact = 0;
1255 _PyUnicode_STATE(unicode).ready = 0;
1256 _PyUnicode_STATE(unicode).ascii = 0;
1257 _PyUnicode_DATA_ANY(unicode) = NULL;
1258 _PyUnicode_LENGTH(unicode) = 0;
1259 _PyUnicode_UTF8(unicode) = NULL;
1260 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1263 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001264 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001265 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001266 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268
Jeremy Hyltond8082792003-09-16 19:41:39 +00001269 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001270 * the caller fails before initializing str -- unicode_resize()
1271 * reads str[0], and the Keep-Alive optimization can keep memory
1272 * allocated for str alive across a call to unicode_dealloc(unicode).
1273 * We don't want unicode_resize to read uninitialized memory in
1274 * that case.
1275 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001276 _PyUnicode_WSTR(unicode)[0] = 0;
1277 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001278
Victor Stinner7931d9a2011-11-04 00:22:48 +01001279 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 return unicode;
1281}
1282
Victor Stinnerf42dc442011-10-02 23:33:16 +02001283static const char*
1284unicode_kind_name(PyObject *unicode)
1285{
Victor Stinner42dfd712011-10-03 14:41:45 +02001286 /* don't check consistency: unicode_kind_name() is called from
1287 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001288 if (!PyUnicode_IS_COMPACT(unicode))
1289 {
1290 if (!PyUnicode_IS_READY(unicode))
1291 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001292 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001293 {
1294 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001295 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001296 return "legacy ascii";
1297 else
1298 return "legacy latin1";
1299 case PyUnicode_2BYTE_KIND:
1300 return "legacy UCS2";
1301 case PyUnicode_4BYTE_KIND:
1302 return "legacy UCS4";
1303 default:
1304 return "<legacy invalid kind>";
1305 }
1306 }
1307 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001308 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001309 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001310 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001311 return "ascii";
1312 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001313 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001314 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001315 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001316 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001317 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001318 default:
1319 return "<invalid compact kind>";
1320 }
1321}
1322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001325const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001326 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001327 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328}
1329
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001330const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001331 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 return _PyUnicode_COMPACT_DATA(unicode);
1333}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001334const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001335 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001336 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1338 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1339 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1340 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1341 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1342 return PyUnicode_DATA(unicode);
1343}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001344
1345void
1346_PyUnicode_Dump(PyObject *op)
1347{
1348 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001349 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1350 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001351 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001352
Victor Stinnera849a4b2011-10-03 12:12:11 +02001353 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001354 {
1355 if (ascii->state.ascii)
1356 data = (ascii + 1);
1357 else
1358 data = (compact + 1);
1359 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001360 else
1361 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001362 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001363
Victor Stinnera849a4b2011-10-03 12:12:11 +02001364 if (ascii->wstr == data)
1365 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001366 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001367
Victor Stinnera3b334d2011-10-03 13:53:37 +02001368 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001369 printf(" (%zu), ", compact->wstr_length);
1370 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001371 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001372 }
1373 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001374 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001375 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001376}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377#endif
1378
Victor Stinner91698d82020-06-25 14:07:40 +02001379static int
1380unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1381{
1382 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1383 // optimized to always use state->empty_string without having to check if
1384 // it is NULL or not.
1385 PyObject *empty = PyUnicode_New(1, 0);
1386 if (empty == NULL) {
1387 return -1;
1388 }
1389 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1390 _PyUnicode_LENGTH(empty) = 0;
1391 assert(_PyUnicode_CheckConsistency(empty, 1));
1392
1393 assert(state->empty_string == NULL);
1394 state->empty_string = empty;
1395 return 0;
1396}
1397
1398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399PyObject *
1400PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1401{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001402 /* Optimization for empty strings */
1403 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001404 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001405 }
1406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 PyObject *obj;
1408 PyCompactUnicodeObject *unicode;
1409 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001410 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001411 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 Py_ssize_t char_size;
1413 Py_ssize_t struct_size;
1414
Victor Stinner9e9d6892011-10-04 01:02:02 +02001415 is_ascii = 0;
1416 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 struct_size = sizeof(PyCompactUnicodeObject);
1418 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001419 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 char_size = 1;
1421 is_ascii = 1;
1422 struct_size = sizeof(PyASCIIObject);
1423 }
1424 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001425 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 char_size = 1;
1427 }
1428 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001429 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 char_size = 2;
1431 if (sizeof(wchar_t) == 2)
1432 is_sharing = 1;
1433 }
1434 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001435 if (maxchar > MAX_UNICODE) {
1436 PyErr_SetString(PyExc_SystemError,
1437 "invalid maximum character passed to PyUnicode_New");
1438 return NULL;
1439 }
Victor Stinner8f825062012-04-27 13:55:39 +02001440 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 char_size = 4;
1442 if (sizeof(wchar_t) == 4)
1443 is_sharing = 1;
1444 }
1445
1446 /* Ensure we won't overflow the size. */
1447 if (size < 0) {
1448 PyErr_SetString(PyExc_SystemError,
1449 "Negative size passed to PyUnicode_New");
1450 return NULL;
1451 }
1452 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1453 return PyErr_NoMemory();
1454
1455 /* Duplicated allocation code from _PyObject_New() instead of a call to
1456 * PyObject_New() so we are able to allocate space for the object and
1457 * it's data buffer.
1458 */
1459 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001460 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001462 }
1463 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464
1465 unicode = (PyCompactUnicodeObject *)obj;
1466 if (is_ascii)
1467 data = ((PyASCIIObject*)obj) + 1;
1468 else
1469 data = unicode + 1;
1470 _PyUnicode_LENGTH(unicode) = size;
1471 _PyUnicode_HASH(unicode) = -1;
1472 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001473 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 _PyUnicode_STATE(unicode).compact = 1;
1475 _PyUnicode_STATE(unicode).ready = 1;
1476 _PyUnicode_STATE(unicode).ascii = is_ascii;
1477 if (is_ascii) {
1478 ((char*)data)[size] = 0;
1479 _PyUnicode_WSTR(unicode) = NULL;
1480 }
Victor Stinner8f825062012-04-27 13:55:39 +02001481 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 ((char*)data)[size] = 0;
1483 _PyUnicode_WSTR(unicode) = NULL;
1484 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001486 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 else {
1489 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001490 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001491 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001493 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494 ((Py_UCS4*)data)[size] = 0;
1495 if (is_sharing) {
1496 _PyUnicode_WSTR_LENGTH(unicode) = size;
1497 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1498 }
1499 else {
1500 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1501 _PyUnicode_WSTR(unicode) = NULL;
1502 }
1503 }
Victor Stinner8f825062012-04-27 13:55:39 +02001504#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001505 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001506#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001507 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001508 return obj;
1509}
1510
1511#if SIZEOF_WCHAR_T == 2
1512/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1513 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001514 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515
1516 This function assumes that unicode can hold one more code point than wstr
1517 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001518static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001519unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001520 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521{
1522 const wchar_t *iter;
1523 Py_UCS4 *ucs4_out;
1524
Victor Stinner910337b2011-10-03 03:20:16 +02001525 assert(unicode != NULL);
1526 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1528 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1529
1530 for (iter = begin; iter < end; ) {
1531 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1532 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001533 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1534 && (iter+1) < end
1535 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 {
Victor Stinner551ac952011-11-29 22:58:13 +01001537 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 iter += 2;
1539 }
1540 else {
1541 *ucs4_out++ = *iter;
1542 iter++;
1543 }
1544 }
1545 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1546 _PyUnicode_GET_LENGTH(unicode)));
1547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548}
1549#endif
1550
Victor Stinnercd9950f2011-10-02 00:34:53 +02001551static int
Victor Stinner488fa492011-12-12 00:01:39 +01001552unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001553{
Victor Stinner488fa492011-12-12 00:01:39 +01001554 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001555 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001556 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001557 return -1;
1558 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001559 return 0;
1560}
1561
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001562static int
1563_copy_characters(PyObject *to, Py_ssize_t to_start,
1564 PyObject *from, Py_ssize_t from_start,
1565 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001567 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001568 const void *from_data;
1569 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570
Victor Stinneree4544c2012-05-09 22:24:08 +02001571 assert(0 <= how_many);
1572 assert(0 <= from_start);
1573 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001574 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001575 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001576 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577
Victor Stinnerd3f08822012-05-29 12:57:52 +02001578 assert(PyUnicode_Check(to));
1579 assert(PyUnicode_IS_READY(to));
1580 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1581
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001582 if (how_many == 0)
1583 return 0;
1584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001586 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001588 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589
Victor Stinnerf1852262012-06-16 16:38:26 +02001590#ifdef Py_DEBUG
1591 if (!check_maxchar
1592 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1593 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001594 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001595 Py_UCS4 ch;
1596 Py_ssize_t i;
1597 for (i=0; i < how_many; i++) {
1598 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1599 assert(ch <= to_maxchar);
1600 }
1601 }
1602#endif
1603
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001604 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001605 if (check_maxchar
1606 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1607 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001608 /* Writing Latin-1 characters into an ASCII string requires to
1609 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001610 Py_UCS4 max_char;
1611 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001612 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001613 if (max_char >= 128)
1614 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001615 }
Christian Heimesf051e432016-09-13 20:22:02 +02001616 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001617 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001618 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001620 else if (from_kind == PyUnicode_1BYTE_KIND
1621 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001622 {
1623 _PyUnicode_CONVERT_BYTES(
1624 Py_UCS1, Py_UCS2,
1625 PyUnicode_1BYTE_DATA(from) + from_start,
1626 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1627 PyUnicode_2BYTE_DATA(to) + to_start
1628 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001629 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001630 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001631 && to_kind == PyUnicode_4BYTE_KIND)
1632 {
1633 _PyUnicode_CONVERT_BYTES(
1634 Py_UCS1, Py_UCS4,
1635 PyUnicode_1BYTE_DATA(from) + from_start,
1636 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1637 PyUnicode_4BYTE_DATA(to) + to_start
1638 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001639 }
1640 else if (from_kind == PyUnicode_2BYTE_KIND
1641 && to_kind == PyUnicode_4BYTE_KIND)
1642 {
1643 _PyUnicode_CONVERT_BYTES(
1644 Py_UCS2, Py_UCS4,
1645 PyUnicode_2BYTE_DATA(from) + from_start,
1646 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1647 PyUnicode_4BYTE_DATA(to) + to_start
1648 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001649 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001650 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001651 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1652
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001653 if (!check_maxchar) {
1654 if (from_kind == PyUnicode_2BYTE_KIND
1655 && to_kind == PyUnicode_1BYTE_KIND)
1656 {
1657 _PyUnicode_CONVERT_BYTES(
1658 Py_UCS2, Py_UCS1,
1659 PyUnicode_2BYTE_DATA(from) + from_start,
1660 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1661 PyUnicode_1BYTE_DATA(to) + to_start
1662 );
1663 }
1664 else if (from_kind == PyUnicode_4BYTE_KIND
1665 && to_kind == PyUnicode_1BYTE_KIND)
1666 {
1667 _PyUnicode_CONVERT_BYTES(
1668 Py_UCS4, Py_UCS1,
1669 PyUnicode_4BYTE_DATA(from) + from_start,
1670 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1671 PyUnicode_1BYTE_DATA(to) + to_start
1672 );
1673 }
1674 else if (from_kind == PyUnicode_4BYTE_KIND
1675 && to_kind == PyUnicode_2BYTE_KIND)
1676 {
1677 _PyUnicode_CONVERT_BYTES(
1678 Py_UCS4, Py_UCS2,
1679 PyUnicode_4BYTE_DATA(from) + from_start,
1680 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1681 PyUnicode_2BYTE_DATA(to) + to_start
1682 );
1683 }
1684 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001685 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001686 }
1687 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001688 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001689 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001690 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001691 Py_ssize_t i;
1692
Victor Stinnera0702ab2011-09-29 14:14:38 +02001693 for (i=0; i < how_many; i++) {
1694 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001695 if (ch > to_maxchar)
1696 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001697 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1698 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001699 }
1700 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001701 return 0;
1702}
1703
Victor Stinnerd3f08822012-05-29 12:57:52 +02001704void
1705_PyUnicode_FastCopyCharacters(
1706 PyObject *to, Py_ssize_t to_start,
1707 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001708{
1709 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1710}
1711
1712Py_ssize_t
1713PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1714 PyObject *from, Py_ssize_t from_start,
1715 Py_ssize_t how_many)
1716{
1717 int err;
1718
1719 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1720 PyErr_BadInternalCall();
1721 return -1;
1722 }
1723
Benjamin Petersonbac79492012-01-14 13:34:47 -05001724 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001725 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001726 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001727 return -1;
1728
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001729 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001730 PyErr_SetString(PyExc_IndexError, "string index out of range");
1731 return -1;
1732 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001733 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001734 PyErr_SetString(PyExc_IndexError, "string index out of range");
1735 return -1;
1736 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001737 if (how_many < 0) {
1738 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1739 return -1;
1740 }
1741 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001742 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1743 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001744 "Cannot write %zi characters at %zi "
1745 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001746 how_many, to_start, PyUnicode_GET_LENGTH(to));
1747 return -1;
1748 }
1749
1750 if (how_many == 0)
1751 return 0;
1752
Victor Stinner488fa492011-12-12 00:01:39 +01001753 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001754 return -1;
1755
1756 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1757 if (err) {
1758 PyErr_Format(PyExc_SystemError,
1759 "Cannot copy %s characters "
1760 "into a string of %s characters",
1761 unicode_kind_name(from),
1762 unicode_kind_name(to));
1763 return -1;
1764 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001765 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766}
1767
Victor Stinner17222162011-09-28 22:15:37 +02001768/* Find the maximum code point and count the number of surrogate pairs so a
1769 correct string length can be computed before converting a string to UCS4.
1770 This function counts single surrogates as a character and not as a pair.
1771
1772 Return 0 on success, or -1 on error. */
1773static int
1774find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1775 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776{
1777 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001778 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779
Victor Stinnerc53be962011-10-02 21:33:54 +02001780 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 *num_surrogates = 0;
1782 *maxchar = 0;
1783
1784 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001786 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1787 && (iter+1) < end
1788 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1789 {
1790 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1791 ++(*num_surrogates);
1792 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 }
1794 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001796 {
1797 ch = *iter;
1798 iter++;
1799 }
1800 if (ch > *maxchar) {
1801 *maxchar = ch;
1802 if (*maxchar > MAX_UNICODE) {
1803 PyErr_Format(PyExc_ValueError,
1804 "character U+%x is not in range [U+0000; U+10ffff]",
1805 ch);
1806 return -1;
1807 }
1808 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 }
1810 return 0;
1811}
1812
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001813int
1814_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815{
1816 wchar_t *end;
1817 Py_UCS4 maxchar = 0;
1818 Py_ssize_t num_surrogates;
1819#if SIZEOF_WCHAR_T == 2
1820 Py_ssize_t length_wo_surrogates;
1821#endif
1822
Georg Brandl7597add2011-10-05 16:36:47 +02001823 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001824 strings were created using _PyObject_New() and where no canonical
1825 representation (the str field) has been set yet aka strings
1826 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001827 assert(_PyUnicode_CHECK(unicode));
1828 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001830 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001831 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001832 /* Actually, it should neither be interned nor be anything else: */
1833 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001836 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001837 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839
1840 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001841 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1842 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843 PyErr_NoMemory();
1844 return -1;
1845 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001846 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001847 _PyUnicode_WSTR(unicode), end,
1848 PyUnicode_1BYTE_DATA(unicode));
1849 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1850 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1851 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1852 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001853 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001854 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001855 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001856 }
1857 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001858 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001859 _PyUnicode_UTF8(unicode) = NULL;
1860 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001861 }
1862 PyObject_FREE(_PyUnicode_WSTR(unicode));
1863 _PyUnicode_WSTR(unicode) = NULL;
1864 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1865 }
1866 /* In this case we might have to convert down from 4-byte native
1867 wchar_t to 2-byte unicode. */
1868 else if (maxchar < 65536) {
1869 assert(num_surrogates == 0 &&
1870 "FindMaxCharAndNumSurrogatePairs() messed up");
1871
Victor Stinner506f5922011-09-28 22:34:18 +02001872#if SIZEOF_WCHAR_T == 2
1873 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001874 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001875 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1876 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1877 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001878 _PyUnicode_UTF8(unicode) = NULL;
1879 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001880#else
1881 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001882 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001883 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001884 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001885 PyErr_NoMemory();
1886 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001887 }
Victor Stinner506f5922011-09-28 22:34:18 +02001888 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1889 _PyUnicode_WSTR(unicode), end,
1890 PyUnicode_2BYTE_DATA(unicode));
1891 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1892 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1893 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001894 _PyUnicode_UTF8(unicode) = NULL;
1895 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001896 PyObject_FREE(_PyUnicode_WSTR(unicode));
1897 _PyUnicode_WSTR(unicode) = NULL;
1898 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1899#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 }
Ikko Ashimine38811d62020-11-10 14:57:34 +09001901 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 else {
1903#if SIZEOF_WCHAR_T == 2
1904 /* in case the native representation is 2-bytes, we need to allocate a
1905 new normalized 4-byte version. */
1906 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001907 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1908 PyErr_NoMemory();
1909 return -1;
1910 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001911 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1912 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 PyErr_NoMemory();
1914 return -1;
1915 }
1916 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1917 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001918 _PyUnicode_UTF8(unicode) = NULL;
1919 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001920 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1921 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001922 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 PyObject_FREE(_PyUnicode_WSTR(unicode));
1924 _PyUnicode_WSTR(unicode) = NULL;
1925 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1926#else
1927 assert(num_surrogates == 0);
1928
Victor Stinnerc3c74152011-10-02 20:39:55 +02001929 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001931 _PyUnicode_UTF8(unicode) = NULL;
1932 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001933 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1934#endif
1935 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1936 }
1937 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001938 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 return 0;
1940}
1941
Alexander Belopolsky40018472011-02-26 01:02:56 +00001942static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001943unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001944{
Walter Dörwald16807132007-05-25 13:52:07 +00001945 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001946 case SSTATE_NOT_INTERNED:
1947 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001948
Benjamin Peterson29060642009-01-31 22:14:21 +00001949 case SSTATE_INTERNED_MORTAL:
Victor Stinner607b1022020-05-05 18:50:30 +02001950#ifdef INTERNED_STRINGS
Victor Stinner3549ca32020-07-03 16:59:12 +02001951 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1952 references (key and value) which were ignored by
1953 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1954 to prevent calling unicode_dealloc() again. Adjust refcnt after
1955 PyDict_DelItem(). */
1956 assert(Py_REFCNT(unicode) == 0);
1957 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001958 if (PyDict_DelItem(interned, unicode) != 0) {
1959 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1960 NULL);
1961 }
Victor Stinner3549ca32020-07-03 16:59:12 +02001962 assert(Py_REFCNT(unicode) == 1);
1963 Py_SET_REFCNT(unicode, 0);
Victor Stinner607b1022020-05-05 18:50:30 +02001964#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001965 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001966
Benjamin Peterson29060642009-01-31 22:14:21 +00001967 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001968 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1969 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001970
Benjamin Peterson29060642009-01-31 22:14:21 +00001971 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001972 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001973 }
1974
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001975 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001976 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001977 }
1978 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001979 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001980 }
1981 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001982 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001983 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001985 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986}
1987
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects
   cached in the unicode state (the empty string, or the cached one-character
   string stored in the latin1[] table), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();
    PyASCIIObject *ascii;

    if (state->empty_string == unicode) {
        return 1;
    }

    ascii = (PyASCIIObject *)unicode;
    /* Only one-character strings that are not in the legacy wchar_t
       representation can live in the latin1[] cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && state->latin1[ch] == unicode) {
        return 1;
    }
    return 0;
}
#endif
2007
Alexander Belopolsky40018472011-02-26 01:02:56 +00002008static int
Victor Stinner488fa492011-12-12 00:01:39 +01002009unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002010{
Victor Stinner488fa492011-12-12 00:01:39 +01002011 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002012 if (Py_REFCNT(unicode) != 1)
2013 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002014 if (_PyUnicode_HASH(unicode) != -1)
2015 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002016 if (PyUnicode_CHECK_INTERNED(unicode))
2017 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002018 if (!PyUnicode_CheckExact(unicode))
2019 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002020#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002021 /* singleton refcount is greater than 1 */
2022 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002023#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002024 return 1;
2025}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002026
Victor Stinnerfe226c02011-10-03 03:52:20 +02002027static int
2028unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2029{
2030 PyObject *unicode;
2031 Py_ssize_t old_length;
2032
2033 assert(p_unicode != NULL);
2034 unicode = *p_unicode;
2035
2036 assert(unicode != NULL);
2037 assert(PyUnicode_Check(unicode));
2038 assert(0 <= length);
2039
Victor Stinner910337b2011-10-03 03:20:16 +02002040 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002041 old_length = PyUnicode_WSTR_LENGTH(unicode);
2042 else
2043 old_length = PyUnicode_GET_LENGTH(unicode);
2044 if (old_length == length)
2045 return 0;
2046
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002047 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002048 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002049 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002050 return 0;
2051 }
2052
Victor Stinner488fa492011-12-12 00:01:39 +01002053 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002054 PyObject *copy = resize_copy(unicode, length);
2055 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002056 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002057 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002058 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002059 }
2060
Victor Stinnerfe226c02011-10-03 03:52:20 +02002061 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002062 PyObject *new_unicode = resize_compact(unicode, length);
2063 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002064 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002065 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002066 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002067 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002068 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002069}
2070
Alexander Belopolsky40018472011-02-26 01:02:56 +00002071int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002072PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002073{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002074 PyObject *unicode;
2075 if (p_unicode == NULL) {
2076 PyErr_BadInternalCall();
2077 return -1;
2078 }
2079 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002080 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002081 {
2082 PyErr_BadInternalCall();
2083 return -1;
2084 }
2085 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002086}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002087
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002088/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002089
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002090 WARNING: The function doesn't copy the terminating null character and
2091 doesn't check the maximum character (may write a latin1 character in an
2092 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002093static void
2094unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2095 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002096{
2097 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002098 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002099 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002100
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002101 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002102 switch (kind) {
2103 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002104#ifdef Py_DEBUG
2105 if (PyUnicode_IS_ASCII(unicode)) {
2106 Py_UCS4 maxchar = ucs1lib_find_max_char(
2107 (const Py_UCS1*)str,
2108 (const Py_UCS1*)str + len);
2109 assert(maxchar < 128);
2110 }
2111#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002112 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002113 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002114 }
2115 case PyUnicode_2BYTE_KIND: {
2116 Py_UCS2 *start = (Py_UCS2 *)data + index;
2117 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002118
Victor Stinner184252a2012-06-16 02:57:41 +02002119 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002120 *ucs2 = (Py_UCS2)*str;
2121
2122 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002123 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002124 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002125 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002126 Py_UCS4 *start = (Py_UCS4 *)data + index;
2127 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002128
Victor Stinner184252a2012-06-16 02:57:41 +02002129 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002130 *ucs4 = (Py_UCS4)*str;
2131
2132 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002133 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002134 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002135 default:
2136 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002137 }
2138}
2139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002141get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002142{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002143 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002144
Victor Stinner2f9ada92020-06-24 02:22:21 +02002145 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002146 if (unicode) {
2147 Py_INCREF(unicode);
2148 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 }
Victor Stinner607b1022020-05-05 18:50:30 +02002150
2151 unicode = PyUnicode_New(1, ch);
2152 if (!unicode) {
2153 return NULL;
2154 }
2155
2156 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2157 assert(_PyUnicode_CheckConsistency(unicode, 1));
2158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002160 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002161 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162}
2163
Victor Stinner985a82a2014-01-03 12:53:47 +01002164static PyObject*
2165unicode_char(Py_UCS4 ch)
2166{
2167 PyObject *unicode;
2168
2169 assert(ch <= MAX_UNICODE);
2170
Victor Stinner2f9ada92020-06-24 02:22:21 +02002171 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002172 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002173 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002174
Victor Stinner985a82a2014-01-03 12:53:47 +01002175 unicode = PyUnicode_New(1, ch);
2176 if (unicode == NULL)
2177 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002178
2179 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2180 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002181 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002182 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002183 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2184 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2185 }
2186 assert(_PyUnicode_CheckConsistency(unicode, 1));
2187 return unicode;
2188}
2189
Alexander Belopolsky40018472011-02-26 01:02:56 +00002190PyObject *
2191PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002192{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002193 if (u == NULL) {
2194 if (size > 0) {
2195 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2196 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2197 "use PyUnicode_New() instead", 1) < 0) {
2198 return NULL;
2199 }
2200 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002201 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002202 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002203
2204 if (size < 0) {
2205 PyErr_BadInternalCall();
2206 return NULL;
2207 }
2208
2209 return PyUnicode_FromWideChar(u, size);
2210}
2211
2212PyObject *
2213PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2214{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002215 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 Py_UCS4 maxchar = 0;
2217 Py_ssize_t num_surrogates;
2218
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002219 if (u == NULL && size != 0) {
2220 PyErr_BadInternalCall();
2221 return NULL;
2222 }
2223
2224 if (size == -1) {
2225 size = wcslen(u);
2226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002227
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002228 /* If the Unicode data is known at construction time, we can apply
2229 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002232 if (size == 0)
2233 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 /* Single character Unicode objects in the Latin-1 range are
2236 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002237 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 return get_latin1_char((unsigned char)*u);
2239
2240 /* If not empty and not single character, copy the Unicode data
2241 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002242 if (find_maxchar_surrogates(u, u + size,
2243 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 return NULL;
2245
Victor Stinner8faf8212011-12-08 22:14:11 +01002246 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 if (!unicode)
2248 return NULL;
2249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002250 switch (PyUnicode_KIND(unicode)) {
2251 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002252 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2254 break;
2255 case PyUnicode_2BYTE_KIND:
2256#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002257 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002259 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2261#endif
2262 break;
2263 case PyUnicode_4BYTE_KIND:
2264#if SIZEOF_WCHAR_T == 2
2265 /* This is the only case which has to process surrogates, thus
2266 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002267 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268#else
2269 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002270 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271#endif
2272 break;
2273 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002274 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002277 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002278}
2279
Alexander Belopolsky40018472011-02-26 01:02:56 +00002280PyObject *
2281PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002282{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002283 if (size < 0) {
2284 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002285 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002286 return NULL;
2287 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002288 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002289 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002290 }
2291 else {
2292 if (size > 0) {
2293 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2294 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2295 "use PyUnicode_New() instead", 1) < 0) {
2296 return NULL;
2297 }
2298 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002299 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002300 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002301}
2302
Alexander Belopolsky40018472011-02-26 01:02:56 +00002303PyObject *
2304PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002305{
2306 size_t size = strlen(u);
2307 if (size > PY_SSIZE_T_MAX) {
2308 PyErr_SetString(PyExc_OverflowError, "input too long");
2309 return NULL;
2310 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002311 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002312}
2313
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002314PyObject *
2315_PyUnicode_FromId(_Py_Identifier *id)
2316{
Victor Stinner297257f2020-06-02 14:39:45 +02002317 if (id->object) {
2318 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002319 }
Victor Stinner297257f2020-06-02 14:39:45 +02002320
2321 PyObject *obj;
2322 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2323 strlen(id->string),
2324 NULL, NULL);
2325 if (!obj) {
2326 return NULL;
2327 }
2328 PyUnicode_InternInPlace(&obj);
2329
2330 assert(!id->next);
2331 id->object = obj;
2332 id->next = static_strings;
2333 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002334 return id->object;
2335}
2336
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002337static void
2338unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002339{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002340 _Py_Identifier *tmp, *s = static_strings;
2341 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002342 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002343 tmp = s->next;
2344 s->next = NULL;
2345 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002346 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002347 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002348}
2349
Benjamin Peterson0df54292012-03-26 14:50:32 -04002350/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002351
Victor Stinnerd3f08822012-05-29 12:57:52 +02002352PyObject*
2353_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002354{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002355 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002356 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002357 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002358#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002359 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002360#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002361 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002362 }
Victor Stinner785938e2011-12-11 20:09:03 +01002363 unicode = PyUnicode_New(size, 127);
2364 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002365 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002366 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2367 assert(_PyUnicode_CheckConsistency(unicode, 1));
2368 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002369}
2370
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002371static Py_UCS4
2372kind_maxchar_limit(unsigned int kind)
2373{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002374 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002375 case PyUnicode_1BYTE_KIND:
2376 return 0x80;
2377 case PyUnicode_2BYTE_KIND:
2378 return 0x100;
2379 case PyUnicode_4BYTE_KIND:
2380 return 0x10000;
2381 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002382 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002383 }
2384}
2385
Victor Stinner702c7342011-10-05 13:50:52 +02002386static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002387_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002388{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002390 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002391
Victor Stinner2f9ada92020-06-24 02:22:21 +02002392 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002393 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002394 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002395 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002396 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002397 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002398 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002399
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002400 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002401 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 if (!res)
2403 return NULL;
2404 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002405 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002407}
2408
Victor Stinnere57b1c02011-09-28 22:20:48 +02002409static PyObject*
2410_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411{
2412 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002413 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002414
Serhiy Storchaka678db842013-01-26 12:16:36 +02002415 if (size == 0)
2416 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002417 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002418 if (size == 1)
2419 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002420
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002421 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002422 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423 if (!res)
2424 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002425 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002427 else {
2428 _PyUnicode_CONVERT_BYTES(
2429 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2430 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002431 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 return res;
2433}
2434
Victor Stinnere57b1c02011-09-28 22:20:48 +02002435static PyObject*
2436_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437{
2438 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002439 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002440
Serhiy Storchaka678db842013-01-26 12:16:36 +02002441 if (size == 0)
2442 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002443 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002444 if (size == 1)
2445 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002446
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002447 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002448 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 if (!res)
2450 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002451 if (max_char < 256)
2452 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2453 PyUnicode_1BYTE_DATA(res));
2454 else if (max_char < 0x10000)
2455 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2456 PyUnicode_2BYTE_DATA(res));
2457 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002459 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460 return res;
2461}
2462
2463PyObject*
2464PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2465{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002466 if (size < 0) {
2467 PyErr_SetString(PyExc_ValueError, "size must be positive");
2468 return NULL;
2469 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002470 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002472 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002474 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002475 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002476 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002477 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002478 PyErr_SetString(PyExc_SystemError, "invalid kind");
2479 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002481}
2482
Victor Stinnerece58de2012-04-23 23:36:38 +02002483Py_UCS4
2484_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2485{
2486 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002487 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002488
2489 assert(PyUnicode_IS_READY(unicode));
2490 assert(0 <= start);
2491 assert(end <= PyUnicode_GET_LENGTH(unicode));
2492 assert(start <= end);
2493
2494 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2495 return PyUnicode_MAX_CHAR_VALUE(unicode);
2496
2497 if (start == end)
2498 return 127;
2499
Victor Stinner94d558b2012-04-27 22:26:58 +02002500 if (PyUnicode_IS_ASCII(unicode))
2501 return 127;
2502
Victor Stinnerece58de2012-04-23 23:36:38 +02002503 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002504 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002505 endptr = (char *)startptr + end * kind;
2506 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002507 switch(kind) {
2508 case PyUnicode_1BYTE_KIND:
2509 return ucs1lib_find_max_char(startptr, endptr);
2510 case PyUnicode_2BYTE_KIND:
2511 return ucs2lib_find_max_char(startptr, endptr);
2512 case PyUnicode_4BYTE_KIND:
2513 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002514 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002515 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002516 }
2517}
2518
Victor Stinner25a4b292011-10-06 12:31:55 +02002519/* Ensure that a string uses the most efficient storage, if it is not the
2520 case: create a new string with of the right kind. Write NULL into *p_unicode
2521 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002522static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002523unicode_adjust_maxchar(PyObject **p_unicode)
2524{
2525 PyObject *unicode, *copy;
2526 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002527 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002528 unsigned int kind;
2529
2530 assert(p_unicode != NULL);
2531 unicode = *p_unicode;
2532 assert(PyUnicode_IS_READY(unicode));
2533 if (PyUnicode_IS_ASCII(unicode))
2534 return;
2535
2536 len = PyUnicode_GET_LENGTH(unicode);
2537 kind = PyUnicode_KIND(unicode);
2538 if (kind == PyUnicode_1BYTE_KIND) {
2539 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002540 max_char = ucs1lib_find_max_char(u, u + len);
2541 if (max_char >= 128)
2542 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002543 }
2544 else if (kind == PyUnicode_2BYTE_KIND) {
2545 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002546 max_char = ucs2lib_find_max_char(u, u + len);
2547 if (max_char >= 256)
2548 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002549 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002550 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002551 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002552 max_char = ucs4lib_find_max_char(u, u + len);
2553 if (max_char >= 0x10000)
2554 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002555 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002556 else
2557 Py_UNREACHABLE();
2558
Victor Stinner25a4b292011-10-06 12:31:55 +02002559 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002560 if (copy != NULL)
2561 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002562 Py_DECREF(unicode);
2563 *p_unicode = copy;
2564}
2565
Victor Stinner034f6cf2011-09-30 02:26:44 +02002566PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002567_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002568{
Victor Stinner87af4f22011-11-21 23:03:47 +01002569 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002570 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002571
Victor Stinner034f6cf2011-09-30 02:26:44 +02002572 if (!PyUnicode_Check(unicode)) {
2573 PyErr_BadInternalCall();
2574 return NULL;
2575 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002576 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002577 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002578
Victor Stinner87af4f22011-11-21 23:03:47 +01002579 length = PyUnicode_GET_LENGTH(unicode);
2580 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002581 if (!copy)
2582 return NULL;
2583 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2584
Christian Heimesf051e432016-09-13 20:22:02 +02002585 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002586 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002587 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002588 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002589}
2590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591
Victor Stinnerbc603d12011-10-02 01:00:40 +02002592/* Widen Unicode objects to larger buffers. Don't write terminating null
2593 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002595static void*
2596unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002598 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002599
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002600 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002601 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002602 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002603 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002604 if (!result)
2605 return PyErr_NoMemory();
2606 assert(skind == PyUnicode_1BYTE_KIND);
2607 _PyUnicode_CONVERT_BYTES(
2608 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002609 (const Py_UCS1 *)data,
2610 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002611 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002613 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002614 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002615 if (!result)
2616 return PyErr_NoMemory();
2617 if (skind == PyUnicode_2BYTE_KIND) {
2618 _PyUnicode_CONVERT_BYTES(
2619 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002620 (const Py_UCS2 *)data,
2621 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002622 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002624 else {
2625 assert(skind == PyUnicode_1BYTE_KIND);
2626 _PyUnicode_CONVERT_BYTES(
2627 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002628 (const Py_UCS1 *)data,
2629 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002630 result);
2631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002633 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002634 Py_UNREACHABLE();
2635 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637}
2638
2639static Py_UCS4*
2640as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2641 int copy_null)
2642{
2643 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002644 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645 Py_ssize_t len, targetlen;
2646 if (PyUnicode_READY(string) == -1)
2647 return NULL;
2648 kind = PyUnicode_KIND(string);
2649 data = PyUnicode_DATA(string);
2650 len = PyUnicode_GET_LENGTH(string);
2651 targetlen = len;
2652 if (copy_null)
2653 targetlen++;
2654 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002655 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 if (!target) {
2657 PyErr_NoMemory();
2658 return NULL;
2659 }
2660 }
2661 else {
2662 if (targetsize < targetlen) {
2663 PyErr_Format(PyExc_SystemError,
2664 "string is longer than the buffer");
2665 if (copy_null && 0 < targetsize)
2666 target[0] = 0;
2667 return NULL;
2668 }
2669 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002670 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002671 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002672 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002674 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002675 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002676 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2677 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002678 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002679 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002680 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002681 else {
2682 Py_UNREACHABLE();
2683 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002684 if (copy_null)
2685 target[len] = 0;
2686 return target;
2687}
2688
2689Py_UCS4*
2690PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2691 int copy_null)
2692{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002693 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002694 PyErr_BadInternalCall();
2695 return NULL;
2696 }
2697 return as_ucs4(string, target, targetsize, copy_null);
2698}
2699
2700Py_UCS4*
2701PyUnicode_AsUCS4Copy(PyObject *string)
2702{
2703 return as_ucs4(string, NULL, 0, 1);
2704}
2705
/* Size of the char buffers used to format %lld or %p.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) digits;
   53/22 is an upper bound for log10(256).  The leading "2 +" adds one
   character for the sign and one more (used by the callers' sprintf() for
   the terminating null character). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002710
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002711static int
2712unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2713 Py_ssize_t width, Py_ssize_t precision)
2714{
2715 Py_ssize_t length, fill, arglen;
2716 Py_UCS4 maxchar;
2717
2718 if (PyUnicode_READY(str) == -1)
2719 return -1;
2720
2721 length = PyUnicode_GET_LENGTH(str);
2722 if ((precision == -1 || precision >= length)
2723 && width <= length)
2724 return _PyUnicodeWriter_WriteStr(writer, str);
2725
2726 if (precision != -1)
2727 length = Py_MIN(precision, length);
2728
2729 arglen = Py_MAX(length, width);
2730 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2731 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2732 else
2733 maxchar = writer->maxchar;
2734
2735 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2736 return -1;
2737
2738 if (width > length) {
2739 fill = width - length;
2740 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2741 return -1;
2742 writer->pos += fill;
2743 }
2744
2745 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2746 str, 0, length);
2747 writer->pos += length;
2748 return 0;
2749}
2750
2751static int
Victor Stinner998b8062018-09-12 00:23:25 +02002752unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002753 Py_ssize_t width, Py_ssize_t precision)
2754{
2755 /* UTF-8 */
2756 Py_ssize_t length;
2757 PyObject *unicode;
2758 int res;
2759
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002760 if (precision == -1) {
2761 length = strlen(str);
2762 }
2763 else {
2764 length = 0;
2765 while (length < precision && str[length]) {
2766 length++;
2767 }
2768 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002769 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2770 if (unicode == NULL)
2771 return -1;
2772
2773 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2774 Py_DECREF(unicode);
2775 return res;
2776}
2777
Victor Stinner96865452011-03-01 23:44:09 +00002778static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002779unicode_fromformat_arg(_PyUnicodeWriter *writer,
2780 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002781{
Victor Stinnere215d962012-10-06 23:03:36 +02002782 const char *p;
2783 Py_ssize_t len;
2784 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002785 Py_ssize_t width;
2786 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002787 int longflag;
2788 int longlongflag;
2789 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002791
2792 p = f;
2793 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002794 zeropad = 0;
2795 if (*f == '0') {
2796 zeropad = 1;
2797 f++;
2798 }
Victor Stinner96865452011-03-01 23:44:09 +00002799
2800 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801 width = -1;
2802 if (Py_ISDIGIT((unsigned)*f)) {
2803 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002804 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002805 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002806 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002807 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002808 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002809 return NULL;
2810 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002811 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002812 f++;
2813 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002814 }
2815 precision = -1;
2816 if (*f == '.') {
2817 f++;
2818 if (Py_ISDIGIT((unsigned)*f)) {
2819 precision = (*f - '0');
2820 f++;
2821 while (Py_ISDIGIT((unsigned)*f)) {
2822 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2823 PyErr_SetString(PyExc_ValueError,
2824 "precision too big");
2825 return NULL;
2826 }
2827 precision = (precision * 10) + (*f - '0');
2828 f++;
2829 }
2830 }
Victor Stinner96865452011-03-01 23:44:09 +00002831 if (*f == '%') {
2832 /* "%.3%s" => f points to "3" */
2833 f--;
2834 }
2835 }
2836 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002837 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002838 f--;
2839 }
Victor Stinner96865452011-03-01 23:44:09 +00002840
2841 /* Handle %ld, %lu, %lld and %llu. */
2842 longflag = 0;
2843 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002844 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002845 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002846 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002847 longflag = 1;
2848 ++f;
2849 }
Victor Stinner96865452011-03-01 23:44:09 +00002850 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002851 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002852 longlongflag = 1;
2853 f += 2;
2854 }
Victor Stinner96865452011-03-01 23:44:09 +00002855 }
2856 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002857 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002858 size_tflag = 1;
2859 ++f;
2860 }
Victor Stinnere215d962012-10-06 23:03:36 +02002861
2862 if (f[1] == '\0')
2863 writer->overallocate = 0;
2864
2865 switch (*f) {
2866 case 'c':
2867 {
2868 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002869 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002870 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002871 "character argument not in range(0x110000)");
2872 return NULL;
2873 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002874 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002875 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002876 break;
2877 }
2878
2879 case 'i':
2880 case 'd':
2881 case 'u':
2882 case 'x':
2883 {
2884 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002885 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002886 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002887
2888 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002889 if (longflag) {
2890 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2891 }
2892 else if (longlongflag) {
2893 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2894 }
2895 else if (size_tflag) {
2896 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2897 }
2898 else {
2899 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2900 }
Victor Stinnere215d962012-10-06 23:03:36 +02002901 }
2902 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002903 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002904 }
2905 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002906 if (longflag) {
2907 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2908 }
2909 else if (longlongflag) {
2910 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2911 }
2912 else if (size_tflag) {
2913 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2914 }
2915 else {
2916 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2917 }
Victor Stinnere215d962012-10-06 23:03:36 +02002918 }
2919 assert(len >= 0);
2920
Victor Stinnere215d962012-10-06 23:03:36 +02002921 if (precision < len)
2922 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002923
2924 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002925 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2926 return NULL;
2927
Victor Stinnere215d962012-10-06 23:03:36 +02002928 if (width > precision) {
2929 Py_UCS4 fillchar;
2930 fill = width - precision;
2931 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002932 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2933 return NULL;
2934 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002935 }
Victor Stinner15a11362012-10-06 23:48:20 +02002936 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002937 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002938 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2939 return NULL;
2940 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002941 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002942
Victor Stinner4a587072013-11-19 12:54:53 +01002943 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2944 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002945 break;
2946 }
2947
2948 case 'p':
2949 {
2950 char number[MAX_LONG_LONG_CHARS];
2951
2952 len = sprintf(number, "%p", va_arg(*vargs, void*));
2953 assert(len >= 0);
2954
2955 /* %p is ill-defined: ensure leading 0x. */
2956 if (number[1] == 'X')
2957 number[1] = 'x';
2958 else if (number[1] != 'x') {
2959 memmove(number + 2, number,
2960 strlen(number) + 1);
2961 number[0] = '0';
2962 number[1] = 'x';
2963 len += 2;
2964 }
2965
Victor Stinner4a587072013-11-19 12:54:53 +01002966 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002967 return NULL;
2968 break;
2969 }
2970
2971 case 's':
2972 {
2973 /* UTF-8 */
2974 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002975 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002976 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002977 break;
2978 }
2979
2980 case 'U':
2981 {
2982 PyObject *obj = va_arg(*vargs, PyObject *);
2983 assert(obj && _PyUnicode_CHECK(obj));
2984
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002985 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002986 return NULL;
2987 break;
2988 }
2989
2990 case 'V':
2991 {
2992 PyObject *obj = va_arg(*vargs, PyObject *);
2993 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002994 if (obj) {
2995 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002996 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002997 return NULL;
2998 }
2999 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003000 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02003001 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003002 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003003 }
3004 break;
3005 }
3006
3007 case 'S':
3008 {
3009 PyObject *obj = va_arg(*vargs, PyObject *);
3010 PyObject *str;
3011 assert(obj);
3012 str = PyObject_Str(obj);
3013 if (!str)
3014 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003015 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003016 Py_DECREF(str);
3017 return NULL;
3018 }
3019 Py_DECREF(str);
3020 break;
3021 }
3022
3023 case 'R':
3024 {
3025 PyObject *obj = va_arg(*vargs, PyObject *);
3026 PyObject *repr;
3027 assert(obj);
3028 repr = PyObject_Repr(obj);
3029 if (!repr)
3030 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003031 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003032 Py_DECREF(repr);
3033 return NULL;
3034 }
3035 Py_DECREF(repr);
3036 break;
3037 }
3038
3039 case 'A':
3040 {
3041 PyObject *obj = va_arg(*vargs, PyObject *);
3042 PyObject *ascii;
3043 assert(obj);
3044 ascii = PyObject_ASCII(obj);
3045 if (!ascii)
3046 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003047 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003048 Py_DECREF(ascii);
3049 return NULL;
3050 }
3051 Py_DECREF(ascii);
3052 break;
3053 }
3054
3055 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003056 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003057 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003058 break;
3059
3060 default:
3061 /* if we stumble upon an unknown formatting code, copy the rest
3062 of the format string to the output string. (we cannot just
3063 skip the code, since there's no way to know what's in the
3064 argument list) */
3065 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003066 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003067 return NULL;
3068 f = p+len;
3069 return f;
3070 }
3071
3072 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003073 return f;
3074}
3075
Walter Dörwaldd2034312007-05-18 16:29:38 +00003076PyObject *
3077PyUnicode_FromFormatV(const char *format, va_list vargs)
3078{
Victor Stinnere215d962012-10-06 23:03:36 +02003079 va_list vargs2;
3080 const char *f;
3081 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003082
Victor Stinner8f674cc2013-04-17 23:02:17 +02003083 _PyUnicodeWriter_Init(&writer);
3084 writer.min_length = strlen(format) + 100;
3085 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003086
Benjamin Peterson0c212142016-09-20 20:39:33 -07003087 // Copy varags to be able to pass a reference to a subfunction.
3088 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003089
3090 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003091 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003092 f = unicode_fromformat_arg(&writer, f, &vargs2);
3093 if (f == NULL)
3094 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003095 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003096 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003097 const char *p;
3098 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003099
Victor Stinnere215d962012-10-06 23:03:36 +02003100 p = f;
3101 do
3102 {
3103 if ((unsigned char)*p > 127) {
3104 PyErr_Format(PyExc_ValueError,
3105 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3106 "string, got a non-ASCII byte: 0x%02x",
3107 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003108 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003109 }
3110 p++;
3111 }
3112 while (*p != '\0' && *p != '%');
3113 len = p - f;
3114
3115 if (*p == '\0')
3116 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003117
3118 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003119 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003120
3121 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003122 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003123 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003124 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003125 return _PyUnicodeWriter_Finish(&writer);
3126
3127 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003128 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003129 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003130 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003131}
3132
Walter Dörwaldd2034312007-05-18 16:29:38 +00003133PyObject *
3134PyUnicode_FromFormat(const char *format, ...)
3135{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003136 PyObject* ret;
3137 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003138
3139#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003140 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003141#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003142 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003143#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003144 ret = PyUnicode_FromFormatV(format, vargs);
3145 va_end(vargs);
3146 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003147}
3148
Serhiy Storchakac46db922018-10-23 22:58:24 +03003149static Py_ssize_t
3150unicode_get_widechar_size(PyObject *unicode)
3151{
3152 Py_ssize_t res;
3153
3154 assert(unicode != NULL);
3155 assert(_PyUnicode_CHECK(unicode));
3156
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003157#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchakac46db922018-10-23 22:58:24 +03003158 if (_PyUnicode_WSTR(unicode) != NULL) {
3159 return PyUnicode_WSTR_LENGTH(unicode);
3160 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003161#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003162 assert(PyUnicode_IS_READY(unicode));
3163
3164 res = _PyUnicode_LENGTH(unicode);
3165#if SIZEOF_WCHAR_T == 2
3166 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3167 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3168 const Py_UCS4 *end = s + res;
3169 for (; s < end; ++s) {
3170 if (*s > 0xFFFF) {
3171 ++res;
3172 }
3173 }
3174 }
3175#endif
3176 return res;
3177}
3178
3179static void
3180unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3181{
Serhiy Storchakac46db922018-10-23 22:58:24 +03003182 assert(unicode != NULL);
3183 assert(_PyUnicode_CHECK(unicode));
3184
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003185#if USE_UNICODE_WCHAR_CACHE
3186 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003187 if (wstr != NULL) {
3188 memcpy(w, wstr, size * sizeof(wchar_t));
3189 return;
3190 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003191#else /* USE_UNICODE_WCHAR_CACHE */
3192 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3193 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3194 return;
3195 }
3196#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003197 assert(PyUnicode_IS_READY(unicode));
3198
3199 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3200 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3201 for (; size--; ++s, ++w) {
3202 *w = *s;
3203 }
3204 }
3205 else {
3206#if SIZEOF_WCHAR_T == 4
3207 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3208 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3209 for (; size--; ++s, ++w) {
3210 *w = *s;
3211 }
3212#else
3213 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3214 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3215 for (; size--; ++s, ++w) {
3216 Py_UCS4 ch = *s;
3217 if (ch > 0xFFFF) {
3218 assert(ch <= MAX_UNICODE);
3219 /* encode surrogate pair in this case */
3220 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3221 if (!size--)
3222 break;
3223 *w = Py_UNICODE_LOW_SURROGATE(ch);
3224 }
3225 else {
3226 *w = ch;
3227 }
3228 }
3229#endif
3230 }
3231}
3232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003233#ifdef HAVE_WCHAR_H
3234
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003235/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003236
Victor Stinnerd88d9832011-09-06 02:00:05 +02003237 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003238 character) required to convert the unicode object. Ignore size argument.
3239
Victor Stinnerd88d9832011-09-06 02:00:05 +02003240 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003241 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003242 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003243Py_ssize_t
3244PyUnicode_AsWideChar(PyObject *unicode,
3245 wchar_t *w,
3246 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003247{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003248 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003249
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003250 if (unicode == NULL) {
3251 PyErr_BadInternalCall();
3252 return -1;
3253 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003254 if (!PyUnicode_Check(unicode)) {
3255 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003256 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003257 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003258
3259 res = unicode_get_widechar_size(unicode);
3260 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003261 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003262 }
3263
3264 if (size > res) {
3265 size = res + 1;
3266 }
3267 else {
3268 res = size;
3269 }
3270 unicode_copy_as_widechar(unicode, w, size);
3271 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003272}
3273
Victor Stinner137c34c2010-09-29 10:25:54 +00003274wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003275PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003276 Py_ssize_t *size)
3277{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003278 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003279 Py_ssize_t buflen;
3280
3281 if (unicode == NULL) {
3282 PyErr_BadInternalCall();
3283 return NULL;
3284 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003285 if (!PyUnicode_Check(unicode)) {
3286 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003287 return NULL;
3288 }
3289
Serhiy Storchakac46db922018-10-23 22:58:24 +03003290 buflen = unicode_get_widechar_size(unicode);
3291 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003292 if (buffer == NULL) {
3293 PyErr_NoMemory();
3294 return NULL;
3295 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003296 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3297 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003298 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003299 }
3300 else if (wcslen(buffer) != (size_t)buflen) {
3301 PyMem_FREE(buffer);
3302 PyErr_SetString(PyExc_ValueError,
3303 "embedded null character");
3304 return NULL;
3305 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003306 return buffer;
3307}
3308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003309#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003311int
3312_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3313{
3314 wchar_t **p = (wchar_t **)ptr;
3315 if (obj == NULL) {
3316#if !USE_UNICODE_WCHAR_CACHE
3317 PyMem_Free(*p);
3318#endif /* USE_UNICODE_WCHAR_CACHE */
3319 *p = NULL;
3320 return 1;
3321 }
3322 if (PyUnicode_Check(obj)) {
3323#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003324 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3325 if (*p == NULL) {
3326 return 0;
3327 }
3328 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003329#else /* USE_UNICODE_WCHAR_CACHE */
3330 *p = PyUnicode_AsWideCharString(obj, NULL);
3331 if (*p == NULL) {
3332 return 0;
3333 }
3334 return Py_CLEANUP_SUPPORTED;
3335#endif /* USE_UNICODE_WCHAR_CACHE */
3336 }
3337 PyErr_Format(PyExc_TypeError,
3338 "argument must be str, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003339 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003340 return 0;
3341}
3342
3343int
3344_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3345{
3346 wchar_t **p = (wchar_t **)ptr;
3347 if (obj == NULL) {
3348#if !USE_UNICODE_WCHAR_CACHE
3349 PyMem_Free(*p);
3350#endif /* USE_UNICODE_WCHAR_CACHE */
3351 *p = NULL;
3352 return 1;
3353 }
3354 if (obj == Py_None) {
3355 *p = NULL;
3356 return 1;
3357 }
3358 if (PyUnicode_Check(obj)) {
3359#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003360 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3361 if (*p == NULL) {
3362 return 0;
3363 }
3364 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003365#else /* USE_UNICODE_WCHAR_CACHE */
3366 *p = PyUnicode_AsWideCharString(obj, NULL);
3367 if (*p == NULL) {
3368 return 0;
3369 }
3370 return Py_CLEANUP_SUPPORTED;
3371#endif /* USE_UNICODE_WCHAR_CACHE */
3372 }
3373 PyErr_Format(PyExc_TypeError,
3374 "argument must be str or None, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003375 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003376 return 0;
3377}
3378
Alexander Belopolsky40018472011-02-26 01:02:56 +00003379PyObject *
3380PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003381{
Victor Stinner8faf8212011-12-08 22:14:11 +01003382 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003383 PyErr_SetString(PyExc_ValueError,
3384 "chr() arg not in range(0x110000)");
3385 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003386 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003387
Victor Stinner985a82a2014-01-03 12:53:47 +01003388 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003389}
3390
Alexander Belopolsky40018472011-02-26 01:02:56 +00003391PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003392PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003394 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003396 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003397 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003398 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003399 Py_INCREF(obj);
3400 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003401 }
3402 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003403 /* For a Unicode subtype that's not a Unicode object,
3404 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003405 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003406 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003407 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003408 "Can't convert '%.100s' object to str implicitly",
3409 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003410 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003411}
3412
Alexander Belopolsky40018472011-02-26 01:02:56 +00003413PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003414PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003415 const char *encoding,
3416 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003417{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003418 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003419 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003420
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003422 PyErr_BadInternalCall();
3423 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003424 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003425
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003426 /* Decoding bytes objects is the most common case and should be fast */
3427 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003428 if (PyBytes_GET_SIZE(obj) == 0) {
3429 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3430 return NULL;
3431 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003433 }
3434 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003435 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3436 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003437 }
3438
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003439 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003440 PyErr_SetString(PyExc_TypeError,
3441 "decoding str is not supported");
3442 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003443 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003444
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003445 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3446 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3447 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003448 "decoding to str: need a bytes-like object, %.80s found",
3449 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003450 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003451 }
Tim Petersced69f82003-09-16 20:30:58 +00003452
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003453 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003454 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003455 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3456 return NULL;
3457 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003458 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003460
Serhiy Storchaka05997252013-01-26 12:14:02 +02003461 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003462 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003463 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464}
3465
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   encoding:  NUL-terminated name to normalize (must not be NULL).
   lower:     output buffer receiving the NUL-terminated normalized name.
   lower_len: size of `lower` in bytes, including room for the NUL.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters is collapsed into a single '_' placed before the next
   copied character; such runs at the very start or at the very end of the
   name produce nothing.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0 so that not even the NUL terminator fits). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* Without this guard, lower_len - 1 wraps around: l_end would point far
       outside the buffer, the bounds checks below would never trigger and
       the final NUL store would write out of bounds. */
    if (lower_len == 0) {
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    /* Set while a run of separator characters is pending. */
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the pending separator run, except at the
               start of the output. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3514
Alexander Belopolsky40018472011-02-26 01:02:56 +00003515PyObject *
3516PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003517 Py_ssize_t size,
3518 const char *encoding,
3519 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003520{
3521 PyObject *buffer = NULL, *unicode;
3522 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003523 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3524
Victor Stinner22eb6892019-06-26 00:51:05 +02003525 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3526 return NULL;
3527 }
3528
Victor Stinnered076ed2019-06-26 01:49:32 +02003529 if (size == 0) {
3530 _Py_RETURN_UNICODE_EMPTY();
3531 }
3532
Victor Stinner942889a2016-09-05 15:40:10 -07003533 if (encoding == NULL) {
3534 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3535 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003536
Fred Drakee4315f52000-05-09 19:53:39 +00003537 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003538 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3539 char *lower = buflower;
3540
3541 /* Fast paths */
3542 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3543 lower += 3;
3544 if (*lower == '_') {
3545 /* Match "utf8" and "utf_8" */
3546 lower++;
3547 }
3548
3549 if (lower[0] == '8' && lower[1] == 0) {
3550 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3551 }
3552 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3553 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3554 }
3555 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3556 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3557 }
3558 }
3559 else {
3560 if (strcmp(lower, "ascii") == 0
3561 || strcmp(lower, "us_ascii") == 0) {
3562 return PyUnicode_DecodeASCII(s, size, errors);
3563 }
Steve Dowercc16be82016-09-08 10:35:16 -07003564 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003565 else if (strcmp(lower, "mbcs") == 0) {
3566 return PyUnicode_DecodeMBCS(s, size, errors);
3567 }
3568 #endif
3569 else if (strcmp(lower, "latin1") == 0
3570 || strcmp(lower, "latin_1") == 0
3571 || strcmp(lower, "iso_8859_1") == 0
3572 || strcmp(lower, "iso8859_1") == 0) {
3573 return PyUnicode_DecodeLatin1(s, size, errors);
3574 }
3575 }
Victor Stinner37296e82010-06-10 13:36:23 +00003576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577
3578 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003579 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003580 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003581 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003582 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 if (buffer == NULL)
3584 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003585 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 if (unicode == NULL)
3587 goto onError;
3588 if (!PyUnicode_Check(unicode)) {
3589 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003590 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003591 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003592 encoding,
3593 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 Py_DECREF(unicode);
3595 goto onError;
3596 }
3597 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003598 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003599
Benjamin Peterson29060642009-01-31 22:14:21 +00003600 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601 Py_XDECREF(buffer);
3602 return NULL;
3603}
3604
Alexander Belopolsky40018472011-02-26 01:02:56 +00003605PyObject *
3606PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003607 const char *encoding,
3608 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003609{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003610 if (!PyUnicode_Check(unicode)) {
3611 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003612 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003613 }
3614
Serhiy Storchaka00939072016-10-27 21:05:49 +03003615 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3616 "PyUnicode_AsDecodedObject() is deprecated; "
3617 "use PyCodec_Decode() to decode from str", 1) < 0)
3618 return NULL;
3619
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003620 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003621 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003622
3623 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003624 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003625}
3626
Alexander Belopolsky40018472011-02-26 01:02:56 +00003627PyObject *
3628PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003629 const char *encoding,
3630 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003631{
3632 PyObject *v;
3633
3634 if (!PyUnicode_Check(unicode)) {
3635 PyErr_BadArgument();
3636 goto onError;
3637 }
3638
Serhiy Storchaka00939072016-10-27 21:05:49 +03003639 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3640 "PyUnicode_AsDecodedUnicode() is deprecated; "
3641 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3642 return NULL;
3643
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003644 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003646
3647 /* Decode via the codec registry */
3648 v = PyCodec_Decode(unicode, encoding, errors);
3649 if (v == NULL)
3650 goto onError;
3651 if (!PyUnicode_Check(v)) {
3652 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003653 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003654 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003655 encoding,
3656 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003657 Py_DECREF(v);
3658 goto onError;
3659 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003660 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003661
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003663 return NULL;
3664}
3665
Alexander Belopolsky40018472011-02-26 01:02:56 +00003666PyObject *
3667PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003668 Py_ssize_t size,
3669 const char *encoding,
3670 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671{
3672 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003673
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003674 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003675 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3678 Py_DECREF(unicode);
3679 return v;
3680}
3681
Alexander Belopolsky40018472011-02-26 01:02:56 +00003682PyObject *
3683PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003684 const char *encoding,
3685 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003686{
3687 PyObject *v;
3688
3689 if (!PyUnicode_Check(unicode)) {
3690 PyErr_BadArgument();
3691 goto onError;
3692 }
3693
Serhiy Storchaka00939072016-10-27 21:05:49 +03003694 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3695 "PyUnicode_AsEncodedObject() is deprecated; "
3696 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3697 "or PyCodec_Encode() for generic encoding", 1) < 0)
3698 return NULL;
3699
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003700 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003701 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003702
3703 /* Encode via the codec registry */
3704 v = PyCodec_Encode(unicode, encoding, errors);
3705 if (v == NULL)
3706 goto onError;
3707 return v;
3708
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003710 return NULL;
3711}
3712
Victor Stinner1b579672011-12-17 05:47:23 +01003713
Victor Stinner2cba6b82018-01-10 22:46:15 +01003714static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003715unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003716 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003717{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003718 Py_ssize_t wlen;
3719 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3720 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003721 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003722 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003723
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003724 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003725 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003726 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003727 return NULL;
3728 }
3729
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003730 char *str;
3731 size_t error_pos;
3732 const char *reason;
3733 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003734 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003735 PyMem_Free(wstr);
3736
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003737 if (res != 0) {
3738 if (res == -2) {
3739 PyObject *exc;
3740 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3741 "locale", unicode,
3742 (Py_ssize_t)error_pos,
3743 (Py_ssize_t)(error_pos+1),
3744 reason);
3745 if (exc != NULL) {
3746 PyCodec_StrictErrors(exc);
3747 Py_DECREF(exc);
3748 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003749 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003750 else if (res == -3) {
3751 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3752 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003753 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003754 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003755 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003756 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003757 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003758
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003759 PyObject *bytes = PyBytes_FromString(str);
3760 PyMem_RawFree(str);
3761 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003762}
3763
Victor Stinnerad158722010-10-27 00:25:46 +00003764PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003765PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3766{
Victor Stinner709d23d2019-05-02 14:56:30 -04003767 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3768 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003769}
3770
3771PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003772PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003773{
Victor Stinner81a7be32020-04-14 15:14:01 +02003774 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003775 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3776 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003777 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003778 fs_codec->error_handler,
3779 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003780 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003781#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003782 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003783 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003784 fs_codec->encoding,
3785 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003786 }
Victor Stinnerad158722010-10-27 00:25:46 +00003787#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003788 else {
3789 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3790 machinery is not ready and so cannot be used:
3791 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003792 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3793 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003794 assert(filesystem_errors != NULL);
3795 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3796 assert(errors != _Py_ERROR_UNKNOWN);
3797#ifdef _Py_FORCE_UTF8_FS_ENCODING
3798 return unicode_encode_utf8(unicode, errors, NULL);
3799#else
3800 return unicode_encode_locale(unicode, errors, 0);
3801#endif
3802 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003803}
3804
Alexander Belopolsky40018472011-02-26 01:02:56 +00003805PyObject *
3806PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003807 const char *encoding,
3808 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809{
3810 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003811 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003812
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 if (!PyUnicode_Check(unicode)) {
3814 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 }
Fred Drakee4315f52000-05-09 19:53:39 +00003817
Victor Stinner22eb6892019-06-26 00:51:05 +02003818 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3819 return NULL;
3820 }
3821
Victor Stinner942889a2016-09-05 15:40:10 -07003822 if (encoding == NULL) {
3823 return _PyUnicode_AsUTF8String(unicode, errors);
3824 }
3825
Fred Drakee4315f52000-05-09 19:53:39 +00003826 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003827 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3828 char *lower = buflower;
3829
3830 /* Fast paths */
3831 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3832 lower += 3;
3833 if (*lower == '_') {
3834 /* Match "utf8" and "utf_8" */
3835 lower++;
3836 }
3837
3838 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003839 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003840 }
3841 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3842 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3843 }
3844 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3845 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3846 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003847 }
Victor Stinner942889a2016-09-05 15:40:10 -07003848 else {
3849 if (strcmp(lower, "ascii") == 0
3850 || strcmp(lower, "us_ascii") == 0) {
3851 return _PyUnicode_AsASCIIString(unicode, errors);
3852 }
Steve Dowercc16be82016-09-08 10:35:16 -07003853#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003854 else if (strcmp(lower, "mbcs") == 0) {
3855 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3856 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003857#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003858 else if (strcmp(lower, "latin1") == 0 ||
3859 strcmp(lower, "latin_1") == 0 ||
3860 strcmp(lower, "iso_8859_1") == 0 ||
3861 strcmp(lower, "iso8859_1") == 0) {
3862 return _PyUnicode_AsLatin1String(unicode, errors);
3863 }
3864 }
Victor Stinner37296e82010-06-10 13:36:23 +00003865 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866
3867 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003868 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003870 return NULL;
3871
3872 /* The normal path */
3873 if (PyBytes_Check(v))
3874 return v;
3875
3876 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003877 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003878 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003879 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003880
3881 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003882 "encoder %s returned bytearray instead of bytes; "
3883 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003884 encoding);
3885 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003886 Py_DECREF(v);
3887 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003888 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003889
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003890 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3891 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003892 Py_DECREF(v);
3893 return b;
3894 }
3895
3896 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003897 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003898 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003899 encoding,
3900 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003901 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003902 return NULL;
3903}
3904
Alexander Belopolsky40018472011-02-26 01:02:56 +00003905PyObject *
3906PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003907 const char *encoding,
3908 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003909{
3910 PyObject *v;
3911
3912 if (!PyUnicode_Check(unicode)) {
3913 PyErr_BadArgument();
3914 goto onError;
3915 }
3916
Serhiy Storchaka00939072016-10-27 21:05:49 +03003917 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3918 "PyUnicode_AsEncodedUnicode() is deprecated; "
3919 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3920 return NULL;
3921
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003922 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003923 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003924
3925 /* Encode via the codec registry */
3926 v = PyCodec_Encode(unicode, encoding, errors);
3927 if (v == NULL)
3928 goto onError;
3929 if (!PyUnicode_Check(v)) {
3930 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003931 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003932 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003933 encoding,
3934 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003935 Py_DECREF(v);
3936 goto onError;
3937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003938 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003939
Benjamin Peterson29060642009-01-31 22:14:21 +00003940 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 return NULL;
3942}
3943
Victor Stinner2cba6b82018-01-10 22:46:15 +01003944static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003945unicode_decode_locale(const char *str, Py_ssize_t len,
3946 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003947{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003948 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3949 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003950 return NULL;
3951 }
3952
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003953 wchar_t *wstr;
3954 size_t wlen;
3955 const char *reason;
3956 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003957 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003958 if (res != 0) {
3959 if (res == -2) {
3960 PyObject *exc;
3961 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3962 "locale", str, len,
3963 (Py_ssize_t)wlen,
3964 (Py_ssize_t)(wlen + 1),
3965 reason);
3966 if (exc != NULL) {
3967 PyCodec_StrictErrors(exc);
3968 Py_DECREF(exc);
3969 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003970 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003971 else if (res == -3) {
3972 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3973 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003974 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003975 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003976 }
Victor Stinner2f197072011-12-17 07:08:30 +01003977 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003978 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003979
3980 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3981 PyMem_RawFree(wstr);
3982 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003983}
3984
3985PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003986PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3987 const char *errors)
3988{
Victor Stinner709d23d2019-05-02 14:56:30 -04003989 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3990 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003991}
3992
3993PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003994PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003995{
3996 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003997 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3998 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003999}
4000
4001
4002PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00004003PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004004 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00004005 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4006}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004007
Christian Heimes5894ba72007-11-04 11:43:14 +00004008PyObject*
4009PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4010{
Victor Stinner81a7be32020-04-14 15:14:01 +02004011 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004012 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4013 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004014 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004015 fs_codec->error_handler,
4016 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004017 NULL);
4018 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004019#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004020 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004021 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004022 fs_codec->encoding,
4023 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004024 }
Victor Stinnerad158722010-10-27 00:25:46 +00004025#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004026 else {
4027 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4028 machinery is not ready and so cannot be used:
4029 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004030 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4031 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004032 assert(filesystem_errors != NULL);
4033 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4034 assert(errors != _Py_ERROR_UNKNOWN);
4035#ifdef _Py_FORCE_UTF8_FS_ENCODING
4036 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4037#else
4038 return unicode_decode_locale(s, size, errors, 0);
4039#endif
4040 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004041}
4042
Martin v. Löwis011e8422009-05-05 04:43:17 +00004043
4044int
4045PyUnicode_FSConverter(PyObject* arg, void* addr)
4046{
Brett Cannonec6ce872016-09-06 15:50:29 -07004047 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004048 PyObject *output = NULL;
4049 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004050 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004051 if (arg == NULL) {
4052 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004053 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004054 return 1;
4055 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004056 path = PyOS_FSPath(arg);
4057 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004058 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004059 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004060 if (PyBytes_Check(path)) {
4061 output = path;
4062 }
4063 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4064 output = PyUnicode_EncodeFSDefault(path);
4065 Py_DECREF(path);
4066 if (!output) {
4067 return 0;
4068 }
4069 assert(PyBytes_Check(output));
4070 }
4071
Victor Stinner0ea2a462010-04-30 00:22:08 +00004072 size = PyBytes_GET_SIZE(output);
4073 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004074 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004075 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004076 Py_DECREF(output);
4077 return 0;
4078 }
4079 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004080 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004081}
4082
4083
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004084int
4085PyUnicode_FSDecoder(PyObject* arg, void* addr)
4086{
Brett Cannona5711202016-09-06 19:36:01 -07004087 int is_buffer = 0;
4088 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004089 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004090 if (arg == NULL) {
4091 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004092 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004093 return 1;
4094 }
Brett Cannona5711202016-09-06 19:36:01 -07004095
4096 is_buffer = PyObject_CheckBuffer(arg);
4097 if (!is_buffer) {
4098 path = PyOS_FSPath(arg);
4099 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004100 return 0;
4101 }
Brett Cannona5711202016-09-06 19:36:01 -07004102 }
4103 else {
4104 path = arg;
4105 Py_INCREF(arg);
4106 }
4107
4108 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004109 output = path;
4110 }
4111 else if (PyBytes_Check(path) || is_buffer) {
4112 PyObject *path_bytes = NULL;
4113
4114 if (!PyBytes_Check(path) &&
4115 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004116 "path should be string, bytes, or os.PathLike, not %.200s",
4117 Py_TYPE(arg)->tp_name)) {
4118 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004119 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004120 }
4121 path_bytes = PyBytes_FromObject(path);
4122 Py_DECREF(path);
4123 if (!path_bytes) {
4124 return 0;
4125 }
4126 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4127 PyBytes_GET_SIZE(path_bytes));
4128 Py_DECREF(path_bytes);
4129 if (!output) {
4130 return 0;
4131 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004132 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004133 else {
4134 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004135 "path should be string, bytes, or os.PathLike, not %.200s",
4136 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004137 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004138 return 0;
4139 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004140 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004141 Py_DECREF(output);
4142 return 0;
4143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004145 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004146 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004147 Py_DECREF(output);
4148 return 0;
4149 }
4150 *(PyObject**)addr = output;
4151 return Py_CLEANUP_SUPPORTED;
4152}
4153
4154
Inada Naoki02a4d572020-02-27 13:48:59 +09004155static int unicode_fill_utf8(PyObject *unicode);
4156
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004157const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004158PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004159{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004160 if (!PyUnicode_Check(unicode)) {
4161 PyErr_BadArgument();
4162 return NULL;
4163 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004164 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004165 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004167 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004168 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004169 return NULL;
4170 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004171 }
4172
4173 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004174 *psize = PyUnicode_UTF8_LENGTH(unicode);
4175 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004176}
4177
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004178const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004179PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004181 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4182}
4183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004184Py_UNICODE *
4185PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4186{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004187 if (!PyUnicode_Check(unicode)) {
4188 PyErr_BadArgument();
4189 return NULL;
4190 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004191 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4192 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004193 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004194 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004195 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004196
Serhiy Storchakac46db922018-10-23 22:58:24 +03004197 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4198 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4199 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004200 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004201 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004202 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4203 if (w == NULL) {
4204 PyErr_NoMemory();
4205 return NULL;
4206 }
4207 unicode_copy_as_widechar(unicode, w, wlen + 1);
4208 _PyUnicode_WSTR(unicode) = w;
4209 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4210 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004211 }
4212 }
4213 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004214 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004215 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004216}
4217
Inada Naoki2c4928d2020-06-17 20:09:44 +09004218/* Deprecated APIs */
4219
4220_Py_COMP_DIAG_PUSH
4221_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4222
Alexander Belopolsky40018472011-02-26 01:02:56 +00004223Py_UNICODE *
4224PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004226 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227}
4228
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004229const Py_UNICODE *
4230_PyUnicode_AsUnicode(PyObject *unicode)
4231{
4232 Py_ssize_t size;
4233 const Py_UNICODE *wstr;
4234
4235 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4236 if (wstr && wcslen(wstr) != (size_t)size) {
4237 PyErr_SetString(PyExc_ValueError, "embedded null character");
4238 return NULL;
4239 }
4240 return wstr;
4241}
4242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004243
Alexander Belopolsky40018472011-02-26 01:02:56 +00004244Py_ssize_t
4245PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246{
4247 if (!PyUnicode_Check(unicode)) {
4248 PyErr_BadArgument();
4249 goto onError;
4250 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004251 if (_PyUnicode_WSTR(unicode) == NULL) {
4252 if (PyUnicode_AsUnicode(unicode) == NULL)
4253 goto onError;
4254 }
4255 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256
Benjamin Peterson29060642009-01-31 22:14:21 +00004257 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258 return -1;
4259}
4260
Inada Naoki2c4928d2020-06-17 20:09:44 +09004261_Py_COMP_DIAG_POP
4262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263Py_ssize_t
4264PyUnicode_GetLength(PyObject *unicode)
4265{
Victor Stinner07621332012-06-16 04:53:46 +02004266 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004267 PyErr_BadArgument();
4268 return -1;
4269 }
Victor Stinner07621332012-06-16 04:53:46 +02004270 if (PyUnicode_READY(unicode) == -1)
4271 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004272 return PyUnicode_GET_LENGTH(unicode);
4273}
4274
4275Py_UCS4
4276PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4277{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004278 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004279 int kind;
4280
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004281 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004282 PyErr_BadArgument();
4283 return (Py_UCS4)-1;
4284 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004285 if (PyUnicode_READY(unicode) == -1) {
4286 return (Py_UCS4)-1;
4287 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004288 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004289 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004290 return (Py_UCS4)-1;
4291 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004292 data = PyUnicode_DATA(unicode);
4293 kind = PyUnicode_KIND(unicode);
4294 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004295}
4296
4297int
4298PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4299{
4300 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004301 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004302 return -1;
4303 }
Victor Stinner488fa492011-12-12 00:01:39 +01004304 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004305 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004306 PyErr_SetString(PyExc_IndexError, "string index out of range");
4307 return -1;
4308 }
Victor Stinner488fa492011-12-12 00:01:39 +01004309 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004310 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004311 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4312 PyErr_SetString(PyExc_ValueError, "character out of range");
4313 return -1;
4314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004315 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4316 index, ch);
4317 return 0;
4318}
4319
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4325
Victor Stinner554f3f02010-06-16 23:33:54 +00004326/* create or adjust a UnicodeDecodeError */
4327static void
4328make_decode_exception(PyObject **exceptionObject,
4329 const char *encoding,
4330 const char *input, Py_ssize_t length,
4331 Py_ssize_t startpos, Py_ssize_t endpos,
4332 const char *reason)
4333{
4334 if (*exceptionObject == NULL) {
4335 *exceptionObject = PyUnicodeDecodeError_Create(
4336 encoding, input, length, startpos, endpos, reason);
4337 }
4338 else {
4339 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4340 goto onError;
4341 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4342 goto onError;
4343 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4344 goto onError;
4345 }
4346 return;
4347
4348onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004349 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004350}
4351
Steve Dowercc16be82016-09-08 10:35:16 -07004352#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004353static int
4354widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4355{
4356 if (newsize > *size) {
4357 wchar_t *newbuf = *buf;
4358 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4359 PyErr_NoMemory();
4360 return -1;
4361 }
4362 *buf = newbuf;
4363 }
4364 *size = newsize;
4365 return 0;
4366}
4367
/* Error handling callback helper (wchar_t output buffer variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   In/out parameters:
     errorHandler     - looked up from `errors` on first use and stored back
     exceptionObject  - created or updated by make_decode_exception()
     input, inend     - re-pointed at the exception's bytes object
     endinpos, inptr  - set to the resume position returned by the callback
     buf, bufsize     - output buffer, grown through widechar_resize()
     outpos           - advanced by the length of the replacement

   Return 0 on success, -1 on error.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    /* "Un" is the PyArg_ParseTuple format; the text after ';' is the error
       message.  &argparse[3] skips the 3-character "Un;" prefix so the same
       message can be reused for the explicit TypeError below. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    Py_ssize_t repwlen;

    /* Look the handler up once; the caller keeps it for later errors. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    /* make_decode_exception() signals failure by leaving the slot NULL. */
    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* Determine how many wchar_t units the replacement needs. */
#if USE_UNICODE_WCHAR_CACHE
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
    repwlen = PyUnicode_GetSize(repunicode);
    if (repwlen < 0)
        goto onError;
_Py_COMP_DIAG_POP
#else /* USE_UNICODE_WCHAR_CACHE */
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
    if (repwlen < 0)
        goto onError;
    /* NOTE(review): the size query with a NULL buffer presumably counts the
       terminating null, hence the decrement - confirm against the
       PyUnicode_AsWideChar() documentation. */
    repwlen--;
#endif /* USE_UNICODE_WCHAR_CACHE */
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    /* Each addition is checked against PY_SSIZE_T_MAX before it is made. */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* Grow to at least twice the current size when that is larger than
           the requirement and doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    /* Copy the replacement at the current output position; the return value
       of this call is not checked. */
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* falls through to the common cleanup */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
Steve Dowercc16be82016-09-08 10:35:16 -07004486#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004487
/* Error handling callback helper (_PyUnicodeWriter variant):
   look up the error handler if needed, create or update the exception
   object, call the handler and validate its (str, int) result, then write
   the replacement string to `writer` and move the input position to the
   index returned by the handler.

   In/out parameters:
     errorHandler     - looked up from `errors` on first use and stored back
     exceptionObject  - created or updated by make_decode_exception()
     input, inend     - re-pointed at the exception's bytes object
     endinpos, inptr  - set to the resume position returned by the callback
     writer           - receives the replacement; min_length may be raised

   Return 0 on success, -1 on error.
*/
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* "Un" is the PyArg_ParseTuple format; the text after ';' is the error
       message.  &argparse[3] skips the 3-character "Un;" prefix so the same
       message can be reused for the explicit TypeError below. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    /* Look the handler up once; the caller keeps it for later errors. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    /* make_decode_exception() signals failure by leaving the slot NULL. */
    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Bytes left after the error in the OLD input; this must be computed
       before *input and *inend are overwritten below. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* Only replacements longer than one character raise min_length here. */
    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    /* More input remains after the resume position than remained before
       the handler ran. */
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4587
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588/* --- UTF-7 Codec -------------------------------------------------------- */
4589
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590/* See RFC2152 for details. We encode conservatively and decode liberally. */
4591
4592/* Three simple macros defining base-64. */
4593
/* Is c a base-64 character?
 * True for 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/'.
 * The argument is expanded several times, so it should not have side
 * effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
4601
/* Given that c is a base-64 character, what is its base-64 value?
 * 'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61 and '+' to 62.
 * Every other argument yields 63, so '/' maps to 63 and the macro does not
 * validate its input: check IS_BASE64(c) first.
 * The argument is expanded several times, so it should not have side
 * effects. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)
4609
/* What is the base-64 character of the bottom 6 bits of n?
 * n is masked with 0x3f before indexing the 64-character alphabet, so
 * any higher bits of n are ignored. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4614
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.
 * The test is only "c <= 127 and c != '+'": there is no lower bound, so a
 * negative argument also satisfies it. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4622
4623/* The UTF-7 encoder treats ASCII characters differently according to
4624 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4625 * the above). See RFC2152. This array identifies these different
4626 * sets:
4627 * 0 : "Set D"
4628 * alphanumeric and '(),-./:?
4629 * 1 : "Set O"
4630 * !"#$%&*;<=>@[]^_`{|}
4631 * 2 : "whitespace"
4632 * ht nl cr sp
4633 * 3 : special (must be base64 encoded)
4634 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4635 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004636
/* UTF-7 category of each ASCII code point, indexed by character value:
 *   0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be base64
 *   encoded).
 * The table is read-only lookup data, hence const. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4656
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - character value; only values in 1..127 can be direct
 *   directO  - non-zero to encode Set O characters as themselves
 *   directWS - non-zero to encode whitespace as itself
 *
 * Set D characters (category 0) are always direct.  All three parameters
 * are parenthesized in the expansion so that compound arguments such as
 * "a || b" combine correctly with the surrounding "&&".  c is expanded
 * several times, so it should not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668
Alexander Belopolsky40018472011-02-26 01:02:56 +00004669PyObject *
4670PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004671 Py_ssize_t size,
4672 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004673{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004674 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4675}
4676
Antoine Pitrou244651a2009-05-04 18:56:13 +00004677/* The decoder. The only state we preserve is our read position,
4678 * i.e. how many characters we have consumed. So if we end in the
4679 * middle of a shift sequence we have to back off the read position
4680 * and the output to the beginning of the sequence, otherwise we lose
4681 * all the shift state (seen bits, number of bits seen, high
4682 * surrogate). */
4683
Alexander Belopolsky40018472011-02-26 01:02:56 +00004684PyObject *
4685PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004686 Py_ssize_t size,
4687 const char *errors,
4688 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004689{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004691 Py_ssize_t startinpos;
4692 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004693 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004694 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695 const char *errmsg = "";
4696 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004697 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004698 unsigned int base64bits = 0;
4699 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004700 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004701 PyObject *errorHandler = NULL;
4702 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004703
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004704 if (size == 0) {
4705 if (consumed)
4706 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004707 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004708 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004709
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004710 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004711 _PyUnicodeWriter_Init(&writer);
4712 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004713
4714 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004715 e = s + size;
4716
4717 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004718 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004719 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004720 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004721
Antoine Pitrou244651a2009-05-04 18:56:13 +00004722 if (inShift) { /* in a base-64 section */
4723 if (IS_BASE64(ch)) { /* consume a base-64 character */
4724 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4725 base64bits += 6;
4726 s++;
4727 if (base64bits >= 16) {
4728 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004729 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004730 base64bits -= 16;
4731 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004732 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004733 if (surrogate) {
4734 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004735 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4736 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004737 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004738 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004739 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004740 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004741 }
4742 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004743 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004744 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004745 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004746 }
4747 }
Victor Stinner551ac952011-11-29 22:58:13 +01004748 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004749 /* first surrogate */
4750 surrogate = outCh;
4751 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004752 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004753 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004754 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004755 }
4756 }
4757 }
4758 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004759 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004760 if (base64bits > 0) { /* left-over bits */
4761 if (base64bits >= 6) {
4762 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004763 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004764 errmsg = "partial character in shift sequence";
4765 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004766 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004767 else {
4768 /* Some bits remain; they should be zero */
4769 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004770 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004771 errmsg = "non-zero padding bits in shift sequence";
4772 goto utf7Error;
4773 }
4774 }
4775 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004776 if (surrogate && DECODE_DIRECT(ch)) {
4777 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4778 goto onError;
4779 }
4780 surrogate = 0;
4781 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004782 /* '-' is absorbed; other terminating
4783 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004784 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004785 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004786 }
4787 }
4788 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004790 s++; /* consume '+' */
4791 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004792 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004793 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004794 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004795 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004796 else if (s < e && !IS_BASE64(*s)) {
4797 s++;
4798 errmsg = "ill-formed sequence";
4799 goto utf7Error;
4800 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004802 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004803 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004804 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004805 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004806 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004807 }
4808 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004809 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004810 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004811 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004812 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004813 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004814 else {
4815 startinpos = s-starts;
4816 s++;
4817 errmsg = "unexpected special character";
4818 goto utf7Error;
4819 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004820 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004821utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004823 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004824 errors, &errorHandler,
4825 "utf7", errmsg,
4826 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004827 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004828 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004829 }
4830
Antoine Pitrou244651a2009-05-04 18:56:13 +00004831 /* end of string */
4832
4833 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4834 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004835 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004836 if (surrogate ||
4837 (base64bits >= 6) ||
4838 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004839 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004840 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004841 errors, &errorHandler,
4842 "utf7", "unterminated shift sequence",
4843 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004844 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004845 goto onError;
4846 if (s < e)
4847 goto restart;
4848 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004849 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004850
4851 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004852 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004853 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004854 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004855 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004856 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004857 writer.kind, writer.data, shiftOutStart);
4858 Py_XDECREF(errorHandler);
4859 Py_XDECREF(exc);
4860 _PyUnicodeWriter_Dealloc(&writer);
4861 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004862 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004863 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004864 }
4865 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004866 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004867 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004868 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004869
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870 Py_XDECREF(errorHandler);
4871 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004872 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004873
Benjamin Peterson29060642009-01-31 22:14:21 +00004874 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004875 Py_XDECREF(errorHandler);
4876 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004877 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004878 return NULL;
4879}
4880
4881
Alexander Belopolsky40018472011-02-26 01:02:56 +00004882PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004883_PyUnicode_EncodeUTF7(PyObject *str,
4884 int base64SetO,
4885 int base64WhiteSpace,
4886 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004887{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004888 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004889 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004890 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004891 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004892 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004893 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004894 unsigned int base64bits = 0;
4895 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004896 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004897 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004898
Benjamin Petersonbac79492012-01-14 13:34:47 -05004899 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004900 return NULL;
4901 kind = PyUnicode_KIND(str);
4902 data = PyUnicode_DATA(str);
4903 len = PyUnicode_GET_LENGTH(str);
4904
4905 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004907
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004908 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004909 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004910 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004911 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004912 if (v == NULL)
4913 return NULL;
4914
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004915 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004916 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004917 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004918
Antoine Pitrou244651a2009-05-04 18:56:13 +00004919 if (inShift) {
4920 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4921 /* shifting out */
4922 if (base64bits) { /* output remaining bits */
4923 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4924 base64buffer = 0;
4925 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004926 }
4927 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004928 /* Characters not in the BASE64 set implicitly unshift the sequence
4929 so no '-' is required, except if the character is itself a '-' */
4930 if (IS_BASE64(ch) || ch == '-') {
4931 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004932 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004933 *out++ = (char) ch;
4934 }
4935 else {
4936 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004937 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004938 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004939 else { /* not in a shift sequence */
4940 if (ch == '+') {
4941 *out++ = '+';
4942 *out++ = '-';
4943 }
4944 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4945 *out++ = (char) ch;
4946 }
4947 else {
4948 *out++ = '+';
4949 inShift = 1;
4950 goto encode_char;
4951 }
4952 }
4953 continue;
4954encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004955 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004956 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004957
Antoine Pitrou244651a2009-05-04 18:56:13 +00004958 /* code first surrogate */
4959 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004960 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004961 while (base64bits >= 6) {
4962 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4963 base64bits -= 6;
4964 }
4965 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004966 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004967 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004968 base64bits += 16;
4969 base64buffer = (base64buffer << 16) | ch;
4970 while (base64bits >= 6) {
4971 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4972 base64bits -= 6;
4973 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004974 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004975 if (base64bits)
4976 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4977 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004978 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004979 if (_PyBytes_Resize(&v, out - start) < 0)
4980 return NULL;
4981 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004982}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004983PyObject *
4984PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4985 Py_ssize_t size,
4986 int base64SetO,
4987 int base64WhiteSpace,
4988 const char *errors)
4989{
4990 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004991 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004992 if (tmp == NULL)
4993 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004994 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004995 base64WhiteSpace, errors);
4996 Py_DECREF(tmp);
4997 return result;
4998}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004999
Antoine Pitrou244651a2009-05-04 18:56:13 +00005000#undef IS_BASE64
5001#undef FROM_BASE64
5002#undef TO_BASE64
5003#undef DECODE_DIRECT
5004#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005005
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006/* --- UTF-8 Codec -------------------------------------------------------- */
5007
Alexander Belopolsky40018472011-02-26 01:02:56 +00005008PyObject *
5009PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005010 Py_ssize_t size,
5011 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012{
Walter Dörwald69652032004-09-07 20:24:22 +00005013 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5014}
5015
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005016#include "stringlib/asciilib.h"
5017#include "stringlib/codecs.h"
5018#include "stringlib/undef.h"
5019
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005020#include "stringlib/ucs1lib.h"
5021#include "stringlib/codecs.h"
5022#include "stringlib/undef.h"
5023
5024#include "stringlib/ucs2lib.h"
5025#include "stringlib/codecs.h"
5026#include "stringlib/undef.h"
5027
5028#include "stringlib/ucs4lib.h"
5029#include "stringlib/codecs.h"
5030#include "stringlib/undef.h"
5031
Ma Lina0c603c2020-10-18 22:48:38 +08005032/* Mask to quickly check whether a C 'size_t' contains a
Antoine Pitrouab868312009-01-10 15:40:25 +00005033 non-ASCII, UTF8-encoded char. */
Ma Lina0c603c2020-10-18 22:48:38 +08005034#if (SIZEOF_SIZE_T == 8)
5035# define ASCII_CHAR_MASK 0x8080808080808080ULL
5036#elif (SIZEOF_SIZE_T == 4)
5037# define ASCII_CHAR_MASK 0x80808080U
Antoine Pitrouab868312009-01-10 15:40:25 +00005038#else
Ma Lina0c603c2020-10-18 22:48:38 +08005039# error C 'size_t' size should be either 4 or 8!
Antoine Pitrouab868312009-01-10 15:40:25 +00005040#endif
5041
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005042static Py_ssize_t
5043ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005044{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005045 const char *p = start;
Ma Lina0c603c2020-10-18 22:48:38 +08005046 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_SIZE_T);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005047
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005048 /*
5049 * Issue #17237: m68k is a bit different from most architectures in
5050 * that objects do not use "natural alignment" - for example, int and
5051 * long are only aligned at 2-byte boundaries. Therefore the assert()
5052 * won't work; also, tests have shown that skipping the "optimised
5053 * version" will even speed up m68k.
5054 */
5055#if !defined(__m68k__)
Ma Lina0c603c2020-10-18 22:48:38 +08005056#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5057 assert(_Py_IS_ALIGNED(dest, SIZEOF_SIZE_T));
5058 if (_Py_IS_ALIGNED(p, SIZEOF_SIZE_T)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005059 /* Fast path, see in STRINGLIB(utf8_decode) for
5060 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005061 /* Help allocation */
5062 const char *_p = p;
5063 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005064 while (_p < aligned_end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005065 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005066 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005067 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005068 *((size_t *)q) = value;
5069 _p += SIZEOF_SIZE_T;
5070 q += SIZEOF_SIZE_T;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005071 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072 p = _p;
5073 while (p < end) {
5074 if ((unsigned char)*p & 0x80)
5075 break;
5076 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005078 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005080#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005081#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005082 while (p < end) {
5083 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5084 for an explanation. */
Ma Lina0c603c2020-10-18 22:48:38 +08005085 if (_Py_IS_ALIGNED(p, SIZEOF_SIZE_T)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005086 /* Help allocation */
5087 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005088 while (_p < aligned_end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005089 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005090 if (value & ASCII_CHAR_MASK)
5091 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005092 _p += SIZEOF_SIZE_T;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005093 }
5094 p = _p;
5095 if (_p == end)
5096 break;
5097 }
5098 if ((unsigned char)*p & 0x80)
5099 break;
5100 ++p;
5101 }
5102 memcpy(dest, start, p - start);
5103 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104}
Antoine Pitrouab868312009-01-10 15:40:25 +00005105
Victor Stinner709d23d2019-05-02 14:56:30 -04005106static PyObject *
5107unicode_decode_utf8(const char *s, Py_ssize_t size,
5108 _Py_error_handler error_handler, const char *errors,
5109 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005110{
Victor Stinner785938e2011-12-11 20:09:03 +01005111 if (size == 0) {
5112 if (consumed)
5113 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005114 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005115 }
5116
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005117 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5118 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005119 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005120 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005121 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005122 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005123 }
5124
Inada Naoki770847a2019-06-24 12:30:24 +09005125 const char *starts = s;
5126 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005127
Inada Naoki770847a2019-06-24 12:30:24 +09005128 // fast path: try ASCII string.
5129 PyObject *u = PyUnicode_New(size, 127);
5130 if (u == NULL) {
5131 return NULL;
5132 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005133 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005134 if (s == end) {
5135 return u;
5136 }
5137
5138 // Use _PyUnicodeWriter after fast path is failed.
5139 _PyUnicodeWriter writer;
5140 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5141 writer.pos = s - starts;
5142
5143 Py_ssize_t startinpos, endinpos;
5144 const char *errmsg = "";
5145 PyObject *error_handler_obj = NULL;
5146 PyObject *exc = NULL;
5147
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005148 while (s < end) {
5149 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005150 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005151
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005152 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005153 if (PyUnicode_IS_ASCII(writer.buffer))
5154 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005155 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005156 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005157 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005158 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005159 } else {
5160 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005161 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005162 }
5163
5164 switch (ch) {
5165 case 0:
5166 if (s == end || consumed)
5167 goto End;
5168 errmsg = "unexpected end of data";
5169 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005170 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005171 break;
5172 case 1:
5173 errmsg = "invalid start byte";
5174 startinpos = s - starts;
5175 endinpos = startinpos + 1;
5176 break;
5177 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005178 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5179 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5180 {
5181 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005182 goto End;
5183 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005184 /* fall through */
5185 case 3:
5186 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005187 errmsg = "invalid continuation byte";
5188 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005189 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005190 break;
5191 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005192 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005193 goto onError;
5194 continue;
5195 }
5196
Victor Stinner1d65d912015-10-05 13:43:50 +02005197 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005198 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005199
5200 switch (error_handler) {
5201 case _Py_ERROR_IGNORE:
5202 s += (endinpos - startinpos);
5203 break;
5204
5205 case _Py_ERROR_REPLACE:
5206 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5207 goto onError;
5208 s += (endinpos - startinpos);
5209 break;
5210
5211 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005212 {
5213 Py_ssize_t i;
5214
Victor Stinner1d65d912015-10-05 13:43:50 +02005215 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5216 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005217 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005218 ch = (Py_UCS4)(unsigned char)(starts[i]);
5219 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5220 ch + 0xdc00);
5221 writer.pos++;
5222 }
5223 s += (endinpos - startinpos);
5224 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005225 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005226
5227 default:
5228 if (unicode_decode_call_errorhandler_writer(
5229 errors, &error_handler_obj,
5230 "utf-8", errmsg,
5231 &starts, &end, &startinpos, &endinpos, &exc, &s,
5232 &writer))
5233 goto onError;
5234 }
Victor Stinner785938e2011-12-11 20:09:03 +01005235 }
5236
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005237End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005238 if (consumed)
5239 *consumed = s - starts;
5240
Victor Stinner1d65d912015-10-05 13:43:50 +02005241 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005242 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005243 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005244
5245onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005246 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005247 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005248 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005249 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005250}
5251
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005252
Victor Stinner709d23d2019-05-02 14:56:30 -04005253PyObject *
5254PyUnicode_DecodeUTF8Stateful(const char *s,
5255 Py_ssize_t size,
5256 const char *errors,
5257 Py_ssize_t *consumed)
5258{
5259 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5260}
5261
5262
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005263/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5264 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005265
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005266 On success, write a pointer to a newly allocated wide character string into
5267 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5268 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005269
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005270 On memory allocation failure, return -1.
5271
5272 On decoding error (if surrogateescape is zero), return -2. If wlen is
5273 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5274 is not NULL, write the decoding error message into *reason. */
5275int
5276_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005277 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005278{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005279 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005280 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005281 wchar_t *unicode;
5282 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005283
Victor Stinner3d4226a2018-08-29 22:21:32 +02005284 int surrogateescape = 0;
5285 int surrogatepass = 0;
5286 switch (errors)
5287 {
5288 case _Py_ERROR_STRICT:
5289 break;
5290 case _Py_ERROR_SURROGATEESCAPE:
5291 surrogateescape = 1;
5292 break;
5293 case _Py_ERROR_SURROGATEPASS:
5294 surrogatepass = 1;
5295 break;
5296 default:
5297 return -3;
5298 }
5299
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005300 /* Note: size will always be longer than the resulting Unicode
5301 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005302 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005303 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005304 }
5305
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005306 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005307 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005308 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005309 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005310
5311 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005312 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005313 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005314 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005315 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005316#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005317 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005318#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005319 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005320#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005321 if (ch > 0xFF) {
5322#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005323 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005324#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005325 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005326 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005327 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5328 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5329#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005330 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005331 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005332 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005333 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005334 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005335
5336 if (surrogateescape) {
5337 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5338 }
5339 else {
5340 /* Is it a valid three-byte code? */
5341 if (surrogatepass
5342 && (e - s) >= 3
5343 && (s[0] & 0xf0) == 0xe0
5344 && (s[1] & 0xc0) == 0x80
5345 && (s[2] & 0xc0) == 0x80)
5346 {
5347 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5348 s += 3;
5349 unicode[outpos++] = ch;
5350 }
5351 else {
5352 PyMem_RawFree(unicode );
5353 if (reason != NULL) {
5354 switch (ch) {
5355 case 0:
5356 *reason = "unexpected end of data";
5357 break;
5358 case 1:
5359 *reason = "invalid start byte";
5360 break;
5361 /* 2, 3, 4 */
5362 default:
5363 *reason = "invalid continuation byte";
5364 break;
5365 }
5366 }
5367 if (wlen != NULL) {
5368 *wlen = s - orig_s;
5369 }
5370 return -2;
5371 }
5372 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005373 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005374 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005375 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005376 if (wlen) {
5377 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005378 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005379 *wstr = unicode;
5380 return 0;
5381}
5382
Victor Stinner5f9cf232019-03-19 01:46:25 +01005383
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005384wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005385_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5386 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005387{
5388 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005389 int res = _Py_DecodeUTF8Ex(arg, arglen,
5390 &wstr, wlen,
5391 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005392 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005393 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5394 assert(res != -3);
5395 if (wlen) {
5396 *wlen = (size_t)res;
5397 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005398 return NULL;
5399 }
5400 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005401}
5402
Antoine Pitrouab868312009-01-10 15:40:25 +00005403
Victor Stinnere47e6982017-12-21 15:45:16 +01005404/* UTF-8 encoder using the surrogateescape error handler .
5405
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005406 On success, return 0 and write the newly allocated character string (use
5407 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005408
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005409 On encoding failure, return -2 and write the position of the invalid
5410 surrogate character into *error_pos (if error_pos is set) and the decoding
5411 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005412
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005413 On memory allocation failure, return -1. */
5414int
5415_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005416 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005417{
5418 const Py_ssize_t max_char_size = 4;
5419 Py_ssize_t len = wcslen(text);
5420
5421 assert(len >= 0);
5422
Victor Stinner3d4226a2018-08-29 22:21:32 +02005423 int surrogateescape = 0;
5424 int surrogatepass = 0;
5425 switch (errors)
5426 {
5427 case _Py_ERROR_STRICT:
5428 break;
5429 case _Py_ERROR_SURROGATEESCAPE:
5430 surrogateescape = 1;
5431 break;
5432 case _Py_ERROR_SURROGATEPASS:
5433 surrogatepass = 1;
5434 break;
5435 default:
5436 return -3;
5437 }
5438
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005439 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5440 return -1;
5441 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005442 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005443 if (raw_malloc) {
5444 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005445 }
5446 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005447 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005448 }
5449 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005450 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005451 }
5452
5453 char *p = bytes;
5454 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005455 for (i = 0; i < len; ) {
5456 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005457 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005458 i++;
5459#if Py_UNICODE_SIZE == 2
5460 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5461 && i < len
5462 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5463 {
5464 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5465 i++;
5466 }
5467#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005468
5469 if (ch < 0x80) {
5470 /* Encode ASCII */
5471 *p++ = (char) ch;
5472
5473 }
5474 else if (ch < 0x0800) {
5475 /* Encode Latin-1 */
5476 *p++ = (char)(0xc0 | (ch >> 6));
5477 *p++ = (char)(0x80 | (ch & 0x3f));
5478 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005479 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005480 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005481 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005482 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005483 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005484 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005485 if (reason != NULL) {
5486 *reason = "encoding error";
5487 }
5488 if (raw_malloc) {
5489 PyMem_RawFree(bytes);
5490 }
5491 else {
5492 PyMem_Free(bytes);
5493 }
5494 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005495 }
5496 *p++ = (char)(ch & 0xff);
5497 }
5498 else if (ch < 0x10000) {
5499 *p++ = (char)(0xe0 | (ch >> 12));
5500 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5501 *p++ = (char)(0x80 | (ch & 0x3f));
5502 }
5503 else { /* ch >= 0x10000 */
5504 assert(ch <= MAX_UNICODE);
5505 /* Encode UCS4 Unicode ordinals */
5506 *p++ = (char)(0xf0 | (ch >> 18));
5507 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5508 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5509 *p++ = (char)(0x80 | (ch & 0x3f));
5510 }
5511 }
5512 *p++ = '\0';
5513
5514 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005515 char *bytes2;
5516 if (raw_malloc) {
5517 bytes2 = PyMem_RawRealloc(bytes, final_size);
5518 }
5519 else {
5520 bytes2 = PyMem_Realloc(bytes, final_size);
5521 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005522 if (bytes2 == NULL) {
5523 if (error_pos != NULL) {
5524 *error_pos = (size_t)-1;
5525 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005526 if (raw_malloc) {
5527 PyMem_RawFree(bytes);
5528 }
5529 else {
5530 PyMem_Free(bytes);
5531 }
5532 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005533 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005534 *str = bytes2;
5535 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005536}
5537
5538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005539/* Primary internal function which creates utf8 encoded bytes objects.
5540
5541 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005542 and allocate exactly as much space needed at the end. Else allocate the
5543 maximum possible needed (4 result bytes per Unicode character), and return
5544 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005545*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005546static PyObject *
5547unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5548 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005550 if (!PyUnicode_Check(unicode)) {
5551 PyErr_BadArgument();
5552 return NULL;
5553 }
5554
5555 if (PyUnicode_READY(unicode) == -1)
5556 return NULL;
5557
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005558 if (PyUnicode_UTF8(unicode))
5559 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5560 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005561
Inada Naoki02a4d572020-02-27 13:48:59 +09005562 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005563 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005564 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5565
5566 _PyBytesWriter writer;
5567 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005568
Benjamin Petersonead6b532011-12-20 17:23:42 -06005569 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005570 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005571 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005572 case PyUnicode_1BYTE_KIND:
5573 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5574 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005575 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5576 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005577 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005578 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5579 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005580 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005581 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5582 break;
Tim Peters602f7402002-04-27 18:03:26 +00005583 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005584
5585 if (end == NULL) {
5586 _PyBytesWriter_Dealloc(&writer);
5587 return NULL;
5588 }
5589 return _PyBytesWriter_Finish(&writer, end);
5590}
5591
5592static int
5593unicode_fill_utf8(PyObject *unicode)
5594{
5595 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5596 assert(!PyUnicode_IS_ASCII(unicode));
5597
5598 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005599 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005600 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5601
5602 _PyBytesWriter writer;
5603 char *end;
5604
5605 switch (kind) {
5606 default:
5607 Py_UNREACHABLE();
5608 case PyUnicode_1BYTE_KIND:
5609 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5610 _Py_ERROR_STRICT, NULL);
5611 break;
5612 case PyUnicode_2BYTE_KIND:
5613 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5614 _Py_ERROR_STRICT, NULL);
5615 break;
5616 case PyUnicode_4BYTE_KIND:
5617 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5618 _Py_ERROR_STRICT, NULL);
5619 break;
5620 }
5621 if (end == NULL) {
5622 _PyBytesWriter_Dealloc(&writer);
5623 return -1;
5624 }
5625
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005626 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005627 PyBytes_AS_STRING(writer.buffer);
5628 Py_ssize_t len = end - start;
5629
5630 char *cache = PyObject_MALLOC(len + 1);
5631 if (cache == NULL) {
5632 _PyBytesWriter_Dealloc(&writer);
5633 PyErr_NoMemory();
5634 return -1;
5635 }
5636 _PyUnicode_UTF8(unicode) = cache;
5637 _PyUnicode_UTF8_LENGTH(unicode) = len;
5638 memcpy(cache, start, len);
5639 cache[len] = '\0';
5640 _PyBytesWriter_Dealloc(&writer);
5641 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642}
5643
Alexander Belopolsky40018472011-02-26 01:02:56 +00005644PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005645_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5646{
5647 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5648}
5649
5650
5651PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005652PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5653 Py_ssize_t size,
5654 const char *errors)
5655{
5656 PyObject *v, *unicode;
5657
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005658 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005659 if (unicode == NULL)
5660 return NULL;
5661 v = _PyUnicode_AsUTF8String(unicode, errors);
5662 Py_DECREF(unicode);
5663 return v;
5664}
5665
5666PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005667PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005669 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670}
5671
Walter Dörwald41980ca2007-08-16 21:55:45 +00005672/* --- UTF-32 Codec ------------------------------------------------------- */
5673
5674PyObject *
5675PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 Py_ssize_t size,
5677 const char *errors,
5678 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005679{
5680 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5681}
5682
5683PyObject *
5684PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 Py_ssize_t size,
5686 const char *errors,
5687 int *byteorder,
5688 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005689{
5690 const char *starts = s;
5691 Py_ssize_t startinpos;
5692 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005693 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005694 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005695 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005696 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005697 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005698 PyObject *errorHandler = NULL;
5699 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005700
Andy Lestere6be9b52020-02-11 20:28:35 -06005701 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005702 e = q + size;
5703
5704 if (byteorder)
5705 bo = *byteorder;
5706
5707 /* Check for BOM marks (U+FEFF) in the input and adjust current
5708 byte order setting accordingly. In native mode, the leading BOM
5709 mark is skipped, in all other modes, it is copied to the output
5710 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005711 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005712 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005713 if (bom == 0x0000FEFF) {
5714 bo = -1;
5715 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005717 else if (bom == 0xFFFE0000) {
5718 bo = 1;
5719 q += 4;
5720 }
5721 if (byteorder)
5722 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005723 }
5724
Victor Stinnere64322e2012-10-30 23:12:47 +01005725 if (q == e) {
5726 if (consumed)
5727 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005728 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005729 }
5730
Victor Stinnere64322e2012-10-30 23:12:47 +01005731#ifdef WORDS_BIGENDIAN
5732 le = bo < 0;
5733#else
5734 le = bo <= 0;
5735#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005736 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005737
Victor Stinner8f674cc2013-04-17 23:02:17 +02005738 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005739 writer.min_length = (e - q + 3) / 4;
5740 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005741 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005742
Victor Stinnere64322e2012-10-30 23:12:47 +01005743 while (1) {
5744 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005745 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005746
Victor Stinnere64322e2012-10-30 23:12:47 +01005747 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005748 enum PyUnicode_Kind kind = writer.kind;
5749 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005750 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005751 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005752 if (le) {
5753 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005754 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005755 if (ch > maxch)
5756 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005757 if (kind != PyUnicode_1BYTE_KIND &&
5758 Py_UNICODE_IS_SURROGATE(ch))
5759 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005760 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005761 q += 4;
5762 } while (q <= last);
5763 }
5764 else {
5765 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005766 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005767 if (ch > maxch)
5768 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005769 if (kind != PyUnicode_1BYTE_KIND &&
5770 Py_UNICODE_IS_SURROGATE(ch))
5771 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005772 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005773 q += 4;
5774 } while (q <= last);
5775 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005776 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005777 }
5778
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005779 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005780 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005781 startinpos = ((const char *)q) - starts;
5782 endinpos = startinpos + 4;
5783 }
5784 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005785 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005787 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005789 startinpos = ((const char *)q) - starts;
5790 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005792 else {
5793 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005794 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005795 goto onError;
5796 q += 4;
5797 continue;
5798 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005799 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005800 startinpos = ((const char *)q) - starts;
5801 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005803
5804 /* The remaining input chars are ignored if the callback
5805 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005806 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005808 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005810 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005812 }
5813
Walter Dörwald41980ca2007-08-16 21:55:45 +00005814 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005816
Walter Dörwald41980ca2007-08-16 21:55:45 +00005817 Py_XDECREF(errorHandler);
5818 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005819 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005820
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005822 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005823 Py_XDECREF(errorHandler);
5824 Py_XDECREF(exc);
5825 return NULL;
5826}
5827
5828PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005829_PyUnicode_EncodeUTF32(PyObject *str,
5830 const char *errors,
5831 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005832{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005833 enum PyUnicode_Kind kind;
5834 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005835 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005836 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005837 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005838#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005839 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005840#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005841 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005842#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005843 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005844 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005845 PyObject *errorHandler = NULL;
5846 PyObject *exc = NULL;
5847 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005848
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005849 if (!PyUnicode_Check(str)) {
5850 PyErr_BadArgument();
5851 return NULL;
5852 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005853 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005854 return NULL;
5855 kind = PyUnicode_KIND(str);
5856 data = PyUnicode_DATA(str);
5857 len = PyUnicode_GET_LENGTH(str);
5858
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005859 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005860 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005861 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005862 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005863 if (v == NULL)
5864 return NULL;
5865
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005866 /* output buffer is 4-bytes aligned */
5867 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005868 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005869 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005870 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005871 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005872 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005873
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005874 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005875 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005876 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005877 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005878 else
5879 encoding = "utf-32";
5880
5881 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005882 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5883 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005884 }
5885
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005886 pos = 0;
5887 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005888 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005889
5890 if (kind == PyUnicode_2BYTE_KIND) {
5891 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5892 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005893 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005894 else {
5895 assert(kind == PyUnicode_4BYTE_KIND);
5896 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5897 &out, native_ordering);
5898 }
5899 if (pos == len)
5900 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005901
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005902 rep = unicode_encode_call_errorhandler(
5903 errors, &errorHandler,
5904 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005905 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005906 if (!rep)
5907 goto error;
5908
5909 if (PyBytes_Check(rep)) {
5910 repsize = PyBytes_GET_SIZE(rep);
5911 if (repsize & 3) {
5912 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005913 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005914 "surrogates not allowed");
5915 goto error;
5916 }
5917 moreunits = repsize / 4;
5918 }
5919 else {
5920 assert(PyUnicode_Check(rep));
5921 if (PyUnicode_READY(rep) < 0)
5922 goto error;
5923 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5924 if (!PyUnicode_IS_ASCII(rep)) {
5925 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005926 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005927 "surrogates not allowed");
5928 goto error;
5929 }
5930 }
5931
5932 /* four bytes are reserved for each surrogate */
5933 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005934 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005935 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005936 /* integer overflow */
5937 PyErr_NoMemory();
5938 goto error;
5939 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005940 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005941 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005942 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005943 }
5944
5945 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005946 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005947 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005948 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005949 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005950 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5951 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005952 }
5953
5954 Py_CLEAR(rep);
5955 }
5956
5957 /* Cut back to size actually needed. This is necessary for, for example,
5958 encoding of a string containing isolated surrogates and the 'ignore'
5959 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005960 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005961 if (nsize != PyBytes_GET_SIZE(v))
5962 _PyBytes_Resize(&v, nsize);
5963 Py_XDECREF(errorHandler);
5964 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005965 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005966 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005967 error:
5968 Py_XDECREF(rep);
5969 Py_XDECREF(errorHandler);
5970 Py_XDECREF(exc);
5971 Py_XDECREF(v);
5972 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005973}
5974
Alexander Belopolsky40018472011-02-26 01:02:56 +00005975PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005976PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5977 Py_ssize_t size,
5978 const char *errors,
5979 int byteorder)
5980{
5981 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005982 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005983 if (tmp == NULL)
5984 return NULL;
5985 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5986 Py_DECREF(tmp);
5987 return result;
5988}
5989
5990PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005991PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005992{
Victor Stinnerb960b342011-11-20 19:12:52 +01005993 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005994}
5995
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996/* --- UTF-16 Codec ------------------------------------------------------- */
5997
Tim Peters772747b2001-08-09 22:21:55 +00005998PyObject *
5999PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 Py_ssize_t size,
6001 const char *errors,
6002 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003{
Walter Dörwald69652032004-09-07 20:24:22 +00006004 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6005}
6006
6007PyObject *
6008PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 Py_ssize_t size,
6010 const char *errors,
6011 int *byteorder,
6012 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00006013{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006014 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006015 Py_ssize_t startinpos;
6016 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006017 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006018 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00006019 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006020 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00006021 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022 PyObject *errorHandler = NULL;
6023 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006024 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
Andy Lestere6be9b52020-02-11 20:28:35 -06006026 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006027 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028
6029 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00006030 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006032 /* Check for BOM marks (U+FEFF) in the input and adjust current
6033 byte order setting accordingly. In native mode, the leading BOM
6034 mark is skipped, in all other modes, it is copied to the output
6035 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006036 if (bo == 0 && size >= 2) {
6037 const Py_UCS4 bom = (q[1] << 8) | q[0];
6038 if (bom == 0xFEFF) {
6039 q += 2;
6040 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006042 else if (bom == 0xFFFE) {
6043 q += 2;
6044 bo = 1;
6045 }
6046 if (byteorder)
6047 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049
Antoine Pitrou63065d72012-05-15 23:48:04 +02006050 if (q == e) {
6051 if (consumed)
6052 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006053 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006054 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006055
Christian Heimes743e0cd2012-10-17 23:52:17 +02006056#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006057 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006058 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006059#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006060 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006061 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006062#endif
Tim Peters772747b2001-08-09 22:21:55 +00006063
Antoine Pitrou63065d72012-05-15 23:48:04 +02006064 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006065 character count normally. Error handler will take care of
6066 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006067 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006068 writer.min_length = (e - q + 1) / 2;
6069 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006070 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006071
Antoine Pitrou63065d72012-05-15 23:48:04 +02006072 while (1) {
6073 Py_UCS4 ch = 0;
6074 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006075 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006076 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006077 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006078 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006079 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006080 native_ordering);
6081 else
6082 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006083 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006084 native_ordering);
6085 } else if (kind == PyUnicode_2BYTE_KIND) {
6086 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006087 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006088 native_ordering);
6089 } else {
6090 assert(kind == PyUnicode_4BYTE_KIND);
6091 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006092 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006093 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006094 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006095 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006096
Antoine Pitrou63065d72012-05-15 23:48:04 +02006097 switch (ch)
6098 {
6099 case 0:
6100 /* remaining byte at the end? (size should be even) */
6101 if (q == e || consumed)
6102 goto End;
6103 errmsg = "truncated data";
6104 startinpos = ((const char *)q) - starts;
6105 endinpos = ((const char *)e) - starts;
6106 break;
6107 /* The remaining input chars are ignored if the callback
6108 chooses to skip the input */
6109 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006110 q -= 2;
6111 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006112 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006113 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006114 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006115 endinpos = ((const char *)e) - starts;
6116 break;
6117 case 2:
6118 errmsg = "illegal encoding";
6119 startinpos = ((const char *)q) - 2 - starts;
6120 endinpos = startinpos + 2;
6121 break;
6122 case 3:
6123 errmsg = "illegal UTF-16 surrogate";
6124 startinpos = ((const char *)q) - 4 - starts;
6125 endinpos = startinpos + 2;
6126 break;
6127 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006128 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006129 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 continue;
6131 }
6132
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006133 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006134 errors,
6135 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006136 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006137 &starts,
6138 (const char **)&e,
6139 &startinpos,
6140 &endinpos,
6141 &exc,
6142 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006143 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 }
6146
Antoine Pitrou63065d72012-05-15 23:48:04 +02006147End:
Walter Dörwald69652032004-09-07 20:24:22 +00006148 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006151 Py_XDECREF(errorHandler);
6152 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006153 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006156 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006157 Py_XDECREF(errorHandler);
6158 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 return NULL;
6160}
6161
Tim Peters772747b2001-08-09 22:21:55 +00006162PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163_PyUnicode_EncodeUTF16(PyObject *str,
6164 const char *errors,
6165 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006167 enum PyUnicode_Kind kind;
6168 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006170 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006171 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006172 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006173#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006174 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006175#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006176 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006177#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006178 const char *encoding;
6179 Py_ssize_t nsize, pos;
6180 PyObject *errorHandler = NULL;
6181 PyObject *exc = NULL;
6182 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006183
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006184 if (!PyUnicode_Check(str)) {
6185 PyErr_BadArgument();
6186 return NULL;
6187 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006188 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006189 return NULL;
6190 kind = PyUnicode_KIND(str);
6191 data = PyUnicode_DATA(str);
6192 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006193
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006194 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006195 if (kind == PyUnicode_4BYTE_KIND) {
6196 const Py_UCS4 *in = (const Py_UCS4 *)data;
6197 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006198 while (in < end) {
6199 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006200 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006201 }
6202 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006203 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006204 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006206 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006207 nsize = len + pairs + (byteorder == 0);
6208 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006209 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006213 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006214 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006215 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006216 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006217 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006218 }
6219 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006220 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006221 }
Tim Peters772747b2001-08-09 22:21:55 +00006222
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006223 if (kind == PyUnicode_1BYTE_KIND) {
6224 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6225 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006226 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006227
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006228 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006229 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006230 }
6231 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006232 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006233 }
6234 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006235 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006236 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006237
6238 pos = 0;
6239 while (pos < len) {
6240 Py_ssize_t repsize, moreunits;
6241
6242 if (kind == PyUnicode_2BYTE_KIND) {
6243 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6244 &out, native_ordering);
6245 }
6246 else {
6247 assert(kind == PyUnicode_4BYTE_KIND);
6248 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6249 &out, native_ordering);
6250 }
6251 if (pos == len)
6252 break;
6253
6254 rep = unicode_encode_call_errorhandler(
6255 errors, &errorHandler,
6256 encoding, "surrogates not allowed",
6257 str, &exc, pos, pos + 1, &pos);
6258 if (!rep)
6259 goto error;
6260
6261 if (PyBytes_Check(rep)) {
6262 repsize = PyBytes_GET_SIZE(rep);
6263 if (repsize & 1) {
6264 raise_encode_exception(&exc, encoding,
6265 str, pos - 1, pos,
6266 "surrogates not allowed");
6267 goto error;
6268 }
6269 moreunits = repsize / 2;
6270 }
6271 else {
6272 assert(PyUnicode_Check(rep));
6273 if (PyUnicode_READY(rep) < 0)
6274 goto error;
6275 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6276 if (!PyUnicode_IS_ASCII(rep)) {
6277 raise_encode_exception(&exc, encoding,
6278 str, pos - 1, pos,
6279 "surrogates not allowed");
6280 goto error;
6281 }
6282 }
6283
6284 /* two bytes are reserved for each surrogate */
6285 if (moreunits > 1) {
6286 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006287 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006288 /* integer overflow */
6289 PyErr_NoMemory();
6290 goto error;
6291 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006292 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006293 goto error;
6294 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6295 }
6296
6297 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006298 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006299 out += moreunits;
6300 } else /* rep is unicode */ {
6301 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6302 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6303 &out, native_ordering);
6304 }
6305
6306 Py_CLEAR(rep);
6307 }
6308
6309 /* Cut back to size actually needed. This is necessary for, for example,
6310 encoding of a string containing isolated surrogates and the 'ignore' handler
6311 is used. */
6312 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6313 if (nsize != PyBytes_GET_SIZE(v))
6314 _PyBytes_Resize(&v, nsize);
6315 Py_XDECREF(errorHandler);
6316 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006317 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006318 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006319 error:
6320 Py_XDECREF(rep);
6321 Py_XDECREF(errorHandler);
6322 Py_XDECREF(exc);
6323 Py_XDECREF(v);
6324 return NULL;
6325#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326}
6327
Alexander Belopolsky40018472011-02-26 01:02:56 +00006328PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006329PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6330 Py_ssize_t size,
6331 const char *errors,
6332 int byteorder)
6333{
6334 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006335 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006336 if (tmp == NULL)
6337 return NULL;
6338 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6339 Py_DECREF(tmp);
6340 return result;
6341}
6342
6343PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006344PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006346 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347}
6348
6349/* --- Unicode Escape Codec ----------------------------------------------- */
6350
Victor Stinner47e1afd2020-10-26 16:43:47 +01006351static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006352
Alexander Belopolsky40018472011-02-26 01:02:56 +00006353PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006354_PyUnicode_DecodeUnicodeEscape(const char *s,
6355 Py_ssize_t size,
6356 const char *errors,
6357 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006360 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006362 PyObject *errorHandler = NULL;
6363 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006364
Eric V. Smith42454af2016-10-31 09:22:08 -04006365 // so we can remember if we've seen an invalid escape char or not
6366 *first_invalid_escape = NULL;
6367
Victor Stinner62ec3312016-09-06 17:04:34 -07006368 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006369 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006370 }
6371 /* Escaped strings will always be longer than the resulting
6372 Unicode string, so we start with size here and then reduce the
6373 length after conversion to the true value.
6374 (but if the error callback returns a long replacement string
6375 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006376 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006377 writer.min_length = size;
6378 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6379 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006380 }
6381
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382 end = s + size;
6383 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006384 unsigned char c = (unsigned char) *s++;
6385 Py_UCS4 ch;
6386 int count;
6387 Py_ssize_t startinpos;
6388 Py_ssize_t endinpos;
6389 const char *message;
6390
6391#define WRITE_ASCII_CHAR(ch) \
6392 do { \
6393 assert(ch <= 127); \
6394 assert(writer.pos < writer.size); \
6395 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6396 } while(0)
6397
6398#define WRITE_CHAR(ch) \
6399 do { \
6400 if (ch <= writer.maxchar) { \
6401 assert(writer.pos < writer.size); \
6402 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6403 } \
6404 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6405 goto onError; \
6406 } \
6407 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408
6409 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 if (c != '\\') {
6411 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 continue;
6413 }
6414
Victor Stinner62ec3312016-09-06 17:04:34 -07006415 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006417 if (s >= end) {
6418 message = "\\ at end of string";
6419 goto error;
6420 }
6421 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006422
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006424 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 case '\n': continue;
6428 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6429 case '\'': WRITE_ASCII_CHAR('\''); continue;
6430 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6431 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006432 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6434 case 't': WRITE_ASCII_CHAR('\t'); continue;
6435 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6436 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006437 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006438 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006439 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006440 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 case '0': case '1': case '2': case '3':
6444 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006445 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006446 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006447 ch = (ch<<3) + *s++ - '0';
6448 if (s < end && '0' <= *s && *s <= '7') {
6449 ch = (ch<<3) + *s++ - '0';
6450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006452 WRITE_CHAR(ch);
6453 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454
Benjamin Peterson29060642009-01-31 22:14:21 +00006455 /* hex escapes */
6456 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006458 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006459 message = "truncated \\xXX escape";
6460 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006465 message = "truncated \\uXXXX escape";
6466 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006469 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006470 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006471 message = "truncated \\UXXXXXXXX escape";
6472 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006473 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006474 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006475 ch <<= 4;
6476 if (c >= '0' && c <= '9') {
6477 ch += c - '0';
6478 }
6479 else if (c >= 'a' && c <= 'f') {
6480 ch += c - ('a' - 10);
6481 }
6482 else if (c >= 'A' && c <= 'F') {
6483 ch += c - ('A' - 10);
6484 }
6485 else {
6486 break;
6487 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006488 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006489 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006490 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006491 }
6492
6493 /* when we get here, ch is a 32-bit unicode character */
6494 if (ch > MAX_UNICODE) {
6495 message = "illegal Unicode character";
6496 goto error;
6497 }
6498
6499 WRITE_CHAR(ch);
6500 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006501
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006503 case 'N':
Victor Stinner47e1afd2020-10-26 16:43:47 +01006504 if (ucnhash_capi == NULL) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006505 /* load the unicode data module */
Victor Stinner47e1afd2020-10-26 16:43:47 +01006506 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006507 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner47e1afd2020-10-26 16:43:47 +01006508 if (ucnhash_capi == NULL) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006509 PyErr_SetString(
6510 PyExc_UnicodeError,
6511 "\\N escapes not supported (can't load unicodedata module)"
6512 );
6513 goto onError;
6514 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006515 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006516
6517 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006518 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006519 const char *start = ++s;
6520 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006521 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006522 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006523 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006524 namelen = s - start;
6525 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006526 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006527 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006528 ch = 0xffffffff; /* in case 'getcode' messes up */
6529 if (namelen <= INT_MAX &&
Victor Stinner920cb642020-10-26 19:19:36 +01006530 ucnhash_capi->getcode(start, (int)namelen,
Victor Stinner62ec3312016-09-06 17:04:34 -07006531 &ch, 0)) {
6532 assert(ch <= MAX_UNICODE);
6533 WRITE_CHAR(ch);
6534 continue;
6535 }
6536 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006537 }
6538 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006539 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006540
6541 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006542 if (*first_invalid_escape == NULL) {
6543 *first_invalid_escape = s-1; /* Back up one char, since we've
6544 already incremented s. */
6545 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006546 WRITE_ASCII_CHAR('\\');
6547 WRITE_CHAR(c);
6548 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006550
6551 error:
6552 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006553 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006554 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006555 errors, &errorHandler,
6556 "unicodeescape", message,
6557 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006558 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006559 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006560 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006561 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006562
6563#undef WRITE_ASCII_CHAR
6564#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006566
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006567 Py_XDECREF(errorHandler);
6568 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006569 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006570
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006572 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006573 Py_XDECREF(errorHandler);
6574 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 return NULL;
6576}
6577
Eric V. Smith42454af2016-10-31 09:22:08 -04006578PyObject *
6579PyUnicode_DecodeUnicodeEscape(const char *s,
6580 Py_ssize_t size,
6581 const char *errors)
6582{
6583 const char *first_invalid_escape;
6584 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6585 &first_invalid_escape);
6586 if (result == NULL)
6587 return NULL;
6588 if (first_invalid_escape != NULL) {
6589 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6590 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006591 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006592 Py_DECREF(result);
6593 return NULL;
6594 }
6595 }
6596 return result;
6597}
6598
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006599/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600
Alexander Belopolsky40018472011-02-26 01:02:56 +00006601PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006602PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006604 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006605 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006607 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006608 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006609 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610
Ezio Melottie7f90372012-10-05 03:33:31 +03006611 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006612 escape.
6613
Ezio Melottie7f90372012-10-05 03:33:31 +03006614 For UCS1 strings it's '\xxx', 4 bytes per source character.
6615 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6616 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006617 */
6618
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006619 if (!PyUnicode_Check(unicode)) {
6620 PyErr_BadArgument();
6621 return NULL;
6622 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006623 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006624 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006625 }
Victor Stinner358af132015-10-12 22:36:57 +02006626
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006627 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006628 if (len == 0) {
6629 return PyBytes_FromStringAndSize(NULL, 0);
6630 }
6631
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006632 kind = PyUnicode_KIND(unicode);
6633 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006634 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6635 bytes, and 1 byte characters 4. */
6636 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006637 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006638 return PyErr_NoMemory();
6639 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006640 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006641 if (repr == NULL) {
6642 return NULL;
6643 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006644
Victor Stinner62ec3312016-09-06 17:04:34 -07006645 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006646 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006647 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006648
Victor Stinner62ec3312016-09-06 17:04:34 -07006649 /* U+0000-U+00ff range */
6650 if (ch < 0x100) {
6651 if (ch >= ' ' && ch < 127) {
6652 if (ch != '\\') {
6653 /* Copy printable US ASCII as-is */
6654 *p++ = (char) ch;
6655 }
6656 /* Escape backslashes */
6657 else {
6658 *p++ = '\\';
6659 *p++ = '\\';
6660 }
6661 }
Victor Stinner358af132015-10-12 22:36:57 +02006662
Victor Stinner62ec3312016-09-06 17:04:34 -07006663 /* Map special whitespace to '\t', \n', '\r' */
6664 else if (ch == '\t') {
6665 *p++ = '\\';
6666 *p++ = 't';
6667 }
6668 else if (ch == '\n') {
6669 *p++ = '\\';
6670 *p++ = 'n';
6671 }
6672 else if (ch == '\r') {
6673 *p++ = '\\';
6674 *p++ = 'r';
6675 }
6676
6677 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6678 else {
6679 *p++ = '\\';
6680 *p++ = 'x';
6681 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6682 *p++ = Py_hexdigits[ch & 0x000F];
6683 }
Tim Petersced69f82003-09-16 20:30:58 +00006684 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006685 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006686 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 *p++ = '\\';
6688 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006689 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6690 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6691 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6692 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006694 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6695 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006696
Victor Stinner62ec3312016-09-06 17:04:34 -07006697 /* Make sure that the first two digits are zero */
6698 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006699 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006700 *p++ = 'U';
6701 *p++ = '0';
6702 *p++ = '0';
6703 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6704 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6705 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6706 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6707 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6708 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711
Victor Stinner62ec3312016-09-06 17:04:34 -07006712 assert(p - PyBytes_AS_STRING(repr) > 0);
6713 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6714 return NULL;
6715 }
6716 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
Alexander Belopolsky40018472011-02-26 01:02:56 +00006719PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006720PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6721 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006723 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006724 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006725 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006727 }
6728
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006729 result = PyUnicode_AsUnicodeEscapeString(tmp);
6730 Py_DECREF(tmp);
6731 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732}
6733
6734/* --- Raw Unicode Escape Codec ------------------------------------------- */
6735
Alexander Belopolsky40018472011-02-26 01:02:56 +00006736PyObject *
6737PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006738 Py_ssize_t size,
6739 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006741 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006742 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006744 PyObject *errorHandler = NULL;
6745 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006746
Victor Stinner62ec3312016-09-06 17:04:34 -07006747 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006748 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006749 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006750
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 /* Escaped strings will always be longer than the resulting
6752 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006753 length after conversion to the true value. (But decoding error
6754 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006755 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006756 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006757 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6758 goto onError;
6759 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006760
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 end = s + size;
6762 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006763 unsigned char c = (unsigned char) *s++;
6764 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006765 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006766 Py_ssize_t startinpos;
6767 Py_ssize_t endinpos;
6768 const char *message;
6769
6770#define WRITE_CHAR(ch) \
6771 do { \
6772 if (ch <= writer.maxchar) { \
6773 assert(writer.pos < writer.size); \
6774 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6775 } \
6776 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6777 goto onError; \
6778 } \
6779 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006782 if (c != '\\' || s >= end) {
6783 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006785 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006786
Victor Stinner62ec3312016-09-06 17:04:34 -07006787 c = (unsigned char) *s++;
6788 if (c == 'u') {
6789 count = 4;
6790 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006792 else if (c == 'U') {
6793 count = 8;
6794 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006795 }
6796 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006797 assert(writer.pos < writer.size);
6798 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6799 WRITE_CHAR(c);
6800 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006801 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006802 startinpos = s - starts - 2;
6803
6804 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6805 for (ch = 0; count && s < end; ++s, --count) {
6806 c = (unsigned char)*s;
6807 ch <<= 4;
6808 if (c >= '0' && c <= '9') {
6809 ch += c - '0';
6810 }
6811 else if (c >= 'a' && c <= 'f') {
6812 ch += c - ('a' - 10);
6813 }
6814 else if (c >= 'A' && c <= 'F') {
6815 ch += c - ('A' - 10);
6816 }
6817 else {
6818 break;
6819 }
6820 }
6821 if (!count) {
6822 if (ch <= MAX_UNICODE) {
6823 WRITE_CHAR(ch);
6824 continue;
6825 }
6826 message = "\\Uxxxxxxxx out of range";
6827 }
6828
6829 endinpos = s-starts;
6830 writer.min_length = end - s + writer.pos;
6831 if (unicode_decode_call_errorhandler_writer(
6832 errors, &errorHandler,
6833 "rawunicodeescape", message,
6834 &starts, &end, &startinpos, &endinpos, &exc, &s,
6835 &writer)) {
6836 goto onError;
6837 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006838 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006839
6840#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006842 Py_XDECREF(errorHandler);
6843 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006844 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006845
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006847 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006848 Py_XDECREF(errorHandler);
6849 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006851
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852}
6853
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006854
Alexander Belopolsky40018472011-02-26 01:02:56 +00006855PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006856PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857{
Victor Stinner62ec3312016-09-06 17:04:34 -07006858 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006860 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006861 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006862 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006863 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006865 if (!PyUnicode_Check(unicode)) {
6866 PyErr_BadArgument();
6867 return NULL;
6868 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006869 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006870 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006871 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006872 kind = PyUnicode_KIND(unicode);
6873 data = PyUnicode_DATA(unicode);
6874 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006875 if (kind == PyUnicode_1BYTE_KIND) {
6876 return PyBytes_FromStringAndSize(data, len);
6877 }
Victor Stinner0e368262011-11-10 20:12:49 +01006878
Victor Stinner62ec3312016-09-06 17:04:34 -07006879 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6880 bytes, and 1 byte characters 4. */
6881 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006882
Victor Stinner62ec3312016-09-06 17:04:34 -07006883 if (len > PY_SSIZE_T_MAX / expandsize) {
6884 return PyErr_NoMemory();
6885 }
6886 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6887 if (repr == NULL) {
6888 return NULL;
6889 }
6890 if (len == 0) {
6891 return repr;
6892 }
6893
6894 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006895 for (pos = 0; pos < len; pos++) {
6896 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006897
Victor Stinner62ec3312016-09-06 17:04:34 -07006898 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6899 if (ch < 0x100) {
6900 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006901 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006902 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006903 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 *p++ = '\\';
6905 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006906 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6907 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6908 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6909 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006911 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6912 else {
6913 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6914 *p++ = '\\';
6915 *p++ = 'U';
6916 *p++ = '0';
6917 *p++ = '0';
6918 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6919 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6920 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6921 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6922 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6923 *p++ = Py_hexdigits[ch & 15];
6924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006926
Victor Stinner62ec3312016-09-06 17:04:34 -07006927 assert(p > PyBytes_AS_STRING(repr));
6928 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6929 return NULL;
6930 }
6931 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932}
6933
Alexander Belopolsky40018472011-02-26 01:02:56 +00006934PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006935PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6936 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006938 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006939 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006940 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006941 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006942 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6943 Py_DECREF(tmp);
6944 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945}
6946
6947/* --- Latin-1 Codec ------------------------------------------------------ */
6948
Alexander Belopolsky40018472011-02-26 01:02:56 +00006949PyObject *
6950PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006951 Py_ssize_t size,
6952 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006955 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956}
6957
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006958/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006959static void
6960make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006961 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006962 PyObject *unicode,
6963 Py_ssize_t startpos, Py_ssize_t endpos,
6964 const char *reason)
6965{
6966 if (*exceptionObject == NULL) {
6967 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006968 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006969 encoding, unicode, startpos, endpos, reason);
6970 }
6971 else {
6972 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6973 goto onError;
6974 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6975 goto onError;
6976 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6977 goto onError;
6978 return;
6979 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006980 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006981 }
6982}
6983
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006984/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006985static void
6986raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006987 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006988 PyObject *unicode,
6989 Py_ssize_t startpos, Py_ssize_t endpos,
6990 const char *reason)
6991{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006992 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006993 encoding, unicode, startpos, endpos, reason);
6994 if (*exceptionObject != NULL)
6995 PyCodec_StrictErrors(*exceptionObject);
6996}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006997
6998/* error handling callback helper:
6999 build arguments, call the callback and check the arguments,
7000 put the result into newpos and return the replacement string, which
7001 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007002static PyObject *
7003unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007004 PyObject **errorHandler,
7005 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007006 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007007 Py_ssize_t startpos, Py_ssize_t endpos,
7008 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007009{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02007010 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007011 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007012 PyObject *restuple;
7013 PyObject *resunicode;
7014
7015 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007017 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007019 }
7020
Benjamin Petersonbac79492012-01-14 13:34:47 -05007021 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007022 return NULL;
7023 len = PyUnicode_GET_LENGTH(unicode);
7024
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007025 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007026 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007027 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007029
Petr Viktorinffd97532020-02-11 17:46:57 +01007030 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007031 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007033 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007034 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 Py_DECREF(restuple);
7036 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007037 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007038 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 &resunicode, newpos)) {
7040 Py_DECREF(restuple);
7041 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007042 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007043 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7044 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7045 Py_DECREF(restuple);
7046 return NULL;
7047 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007048 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007049 *newpos = len + *newpos;
7050 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007051 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 Py_DECREF(restuple);
7053 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007054 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007055 Py_INCREF(resunicode);
7056 Py_DECREF(restuple);
7057 return resunicode;
7058}
7059
Alexander Belopolsky40018472011-02-26 01:02:56 +00007060static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007061unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007062 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007063 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007064{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007065 /* input state */
7066 Py_ssize_t pos=0, size;
7067 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007068 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007069 /* pointer into the output */
7070 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007071 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7072 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007073 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007074 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007075 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007076 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007077 /* output object */
7078 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007079
Benjamin Petersonbac79492012-01-14 13:34:47 -05007080 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007081 return NULL;
7082 size = PyUnicode_GET_LENGTH(unicode);
7083 kind = PyUnicode_KIND(unicode);
7084 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007085 /* allocate enough for a simple encoding without
7086 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007087 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007088 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007089
7090 _PyBytesWriter_Init(&writer);
7091 str = _PyBytesWriter_Alloc(&writer, size);
7092 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007093 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007094
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007095 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007096 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007097
Benjamin Peterson29060642009-01-31 22:14:21 +00007098 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007099 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007101 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007102 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007103 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007105 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007107 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007108 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007109 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007110
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007111 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007112 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007113
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007114 /* Only overallocate the buffer if it's not the last write */
7115 writer.overallocate = (collend < size);
7116
Benjamin Peterson29060642009-01-31 22:14:21 +00007117 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007118 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007119 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007120
7121 switch (error_handler) {
7122 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007123 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007125
7126 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007127 memset(str, '?', collend - collstart);
7128 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007129 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007130 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007131 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 break;
Victor Stinner50149202015-09-22 00:26:54 +02007133
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007134 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007135 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007136 writer.min_size -= (collend - collstart);
7137 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007138 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007139 if (str == NULL)
7140 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007141 pos = collend;
7142 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007143
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007144 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007145 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007146 writer.min_size -= (collend - collstart);
7147 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007148 unicode, collstart, collend);
7149 if (str == NULL)
7150 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007151 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007152 break;
Victor Stinner50149202015-09-22 00:26:54 +02007153
Victor Stinnerc3713e92015-09-29 12:32:13 +02007154 case _Py_ERROR_SURROGATEESCAPE:
7155 for (i = collstart; i < collend; ++i) {
7156 ch = PyUnicode_READ(kind, data, i);
7157 if (ch < 0xdc80 || 0xdcff < ch) {
7158 /* Not a UTF-8b surrogate */
7159 break;
7160 }
7161 *str++ = (char)(ch - 0xdc00);
7162 ++pos;
7163 }
7164 if (i >= collend)
7165 break;
7166 collstart = pos;
7167 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007168 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007169
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007171 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7172 encoding, reason, unicode, &exc,
7173 collstart, collend, &newpos);
7174 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007175 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007176
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007177 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007178 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007179
Victor Stinner6bd525b2015-10-09 13:10:05 +02007180 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007181 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007182 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007183 PyBytes_AS_STRING(rep),
7184 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007185 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007186 else {
7187 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007188
Victor Stinner6bd525b2015-10-09 13:10:05 +02007189 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007191
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007192 if (limit == 256 ?
7193 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7194 !PyUnicode_IS_ASCII(rep))
7195 {
7196 /* Not all characters are smaller than limit */
7197 raise_encode_exception(&exc, encoding, unicode,
7198 collstart, collend, reason);
7199 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007201 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7202 str = _PyBytesWriter_WriteBytes(&writer, str,
7203 PyUnicode_DATA(rep),
7204 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007206 if (str == NULL)
7207 goto onError;
7208
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007209 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007210 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007211 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007212
7213 /* If overallocation was disabled, ensure that it was the last
7214 write. Otherwise, we missed an optimization */
7215 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007216 }
7217 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007218
Victor Stinner50149202015-09-22 00:26:54 +02007219 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007220 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007221 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007222
7223 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007224 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007225 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007226 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007227 Py_XDECREF(exc);
7228 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007229}
7230
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007231/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007232PyObject *
7233PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007234 Py_ssize_t size,
7235 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007237 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007238 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007239 if (unicode == NULL)
7240 return NULL;
7241 result = unicode_encode_ucs1(unicode, errors, 256);
7242 Py_DECREF(unicode);
7243 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244}
7245
Alexander Belopolsky40018472011-02-26 01:02:56 +00007246PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007247_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248{
7249 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 PyErr_BadArgument();
7251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007253 if (PyUnicode_READY(unicode) == -1)
7254 return NULL;
7255 /* Fast path: if it is a one-byte string, construct
7256 bytes object directly. */
7257 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7258 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7259 PyUnicode_GET_LENGTH(unicode));
7260 /* Non-Latin-1 characters present. Defer to above function to
7261 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007262 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007263}
7264
7265PyObject*
7266PyUnicode_AsLatin1String(PyObject *unicode)
7267{
7268 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269}
7270
7271/* --- 7-bit ASCII Codec -------------------------------------------------- */
7272
Alexander Belopolsky40018472011-02-26 01:02:56 +00007273PyObject *
7274PyUnicode_DecodeASCII(const char *s,
7275 Py_ssize_t size,
7276 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007278 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007279 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007280 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007281 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007282 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007283
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007285 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007286
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007288 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007289 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007290 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007291
Inada Naoki770847a2019-06-24 12:30:24 +09007292 // Shortcut for simple case
7293 PyObject *u = PyUnicode_New(size, 127);
7294 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007295 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007296 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007297 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007298 if (outpos == size) {
7299 return u;
7300 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007301
Inada Naoki770847a2019-06-24 12:30:24 +09007302 _PyUnicodeWriter writer;
7303 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007304 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007305
Inada Naoki770847a2019-06-24 12:30:24 +09007306 s += outpos;
7307 int kind = writer.kind;
7308 void *data = writer.data;
7309 Py_ssize_t startinpos, endinpos;
7310
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007311 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007312 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007314 PyUnicode_WRITE(kind, data, writer.pos, c);
7315 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007317 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007319
7320 /* byte outsize range 0x00..0x7f: call the error handler */
7321
7322 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007323 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007324
7325 switch (error_handler)
7326 {
7327 case _Py_ERROR_REPLACE:
7328 case _Py_ERROR_SURROGATEESCAPE:
7329 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007330 but we may switch to UCS2 at the first write */
7331 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7332 goto onError;
7333 kind = writer.kind;
7334 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007335
7336 if (error_handler == _Py_ERROR_REPLACE)
7337 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7338 else
7339 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7340 writer.pos++;
7341 ++s;
7342 break;
7343
7344 case _Py_ERROR_IGNORE:
7345 ++s;
7346 break;
7347
7348 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 startinpos = s-starts;
7350 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007351 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007352 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007353 "ascii", "ordinal not in range(128)",
7354 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007355 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007357 kind = writer.kind;
7358 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007361 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007362 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007363 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007364
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007366 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007367 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007368 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369 return NULL;
7370}
7371
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007372/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007373PyObject *
7374PyUnicode_EncodeASCII(const Py_UNICODE *p,
7375 Py_ssize_t size,
7376 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007378 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007379 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007380 if (unicode == NULL)
7381 return NULL;
7382 result = unicode_encode_ucs1(unicode, errors, 128);
7383 Py_DECREF(unicode);
7384 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385}
7386
Alexander Belopolsky40018472011-02-26 01:02:56 +00007387PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007388_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389{
7390 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 PyErr_BadArgument();
7392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007394 if (PyUnicode_READY(unicode) == -1)
7395 return NULL;
7396 /* Fast path: if it is an ASCII-only string, construct bytes object
7397 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007398 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007399 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7400 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007401 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007402}
7403
7404PyObject *
7405PyUnicode_AsASCIIString(PyObject *unicode)
7406{
7407 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408}
7409
Steve Dowercc16be82016-09-08 10:35:16 -07007410#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007411
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007412/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007413
/* NEED_RETRY is defined when an int is narrower than a size_t. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Fallback definition for headers that do not provide the flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7427
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007428static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007429code_page_name(UINT code_page, PyObject **obj)
7430{
7431 *obj = NULL;
7432 if (code_page == CP_ACP)
7433 return "mbcs";
7434 if (code_page == CP_UTF7)
7435 return "CP_UTF7";
7436 if (code_page == CP_UTF8)
7437 return "CP_UTF8";
7438
7439 *obj = PyBytes_FromFormat("cp%u", code_page);
7440 if (*obj == NULL)
7441 return NULL;
7442 return PyBytes_AS_STRING(*obj);
7443}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007444
Victor Stinner3a50e702011-10-18 21:21:00 +02007445static DWORD
7446decode_code_page_flags(UINT code_page)
7447{
7448 if (code_page == CP_UTF7) {
7449 /* The CP_UTF7 decoder only supports flags=0 */
7450 return 0;
7451 }
7452 else
7453 return MB_ERR_INVALID_CHARS;
7454}
7455
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007456/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 * Decode a byte string from a Windows code page into unicode object in strict
7458 * mode.
7459 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007460 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7461 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007462 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007463static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007464decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007465 wchar_t **buf,
7466 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 const char *in,
7468 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007469{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007470 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007471 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473
7474 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007476 while ((outsize = MultiByteToWideChar(code_page, flags,
7477 in, insize, NULL, 0)) <= 0)
7478 {
7479 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7480 goto error;
7481 }
7482 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7483 flags = 0;
7484 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007485
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007486 /* Extend a wchar_t* buffer */
7487 Py_ssize_t n = *bufsize; /* Get the current length */
7488 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7489 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007490 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007491 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007492
7493 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7495 if (outsize <= 0)
7496 goto error;
7497 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007498
Victor Stinner3a50e702011-10-18 21:21:00 +02007499error:
7500 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7501 return -2;
7502 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007503 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007504}
7505
Victor Stinner3a50e702011-10-18 21:21:00 +02007506/*
7507 * Decode a byte string from a code page into unicode object with an error
7508 * handler.
7509 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007510 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 * UnicodeDecodeError exception and returns -1 on error.
7512 */
7513static int
7514decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007515 wchar_t **buf,
7516 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007517 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007518 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007519{
7520 const char *startin = in;
7521 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007522 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 /* Ideally, we should get reason from FormatMessage. This is the Windows
7524 2000 English version of the message. */
7525 const char *reason = "No mapping for the Unicode character exists "
7526 "in the target code page.";
7527 /* each step cannot decode more than 1 character, but a character can be
7528 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007529 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007530 int insize;
7531 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 PyObject *errorHandler = NULL;
7533 PyObject *exc = NULL;
7534 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007535 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 DWORD err;
7537 int ret = -1;
7538
7539 assert(size > 0);
7540
7541 encoding = code_page_name(code_page, &encoding_obj);
7542 if (encoding == NULL)
7543 return -1;
7544
Victor Stinner7d00cc12014-03-17 23:08:06 +01007545 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007546 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7547 UnicodeDecodeError. */
7548 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7549 if (exc != NULL) {
7550 PyCodec_StrictErrors(exc);
7551 Py_CLEAR(exc);
7552 }
7553 goto error;
7554 }
7555
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007556 /* Extend a wchar_t* buffer */
7557 Py_ssize_t n = *bufsize; /* Get the current length */
7558 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7559 PyErr_NoMemory();
7560 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007561 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007562 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7563 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007564 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007565 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007566
7567 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007568 while (in < endin)
7569 {
7570 /* Decode a character */
7571 insize = 1;
7572 do
7573 {
7574 outsize = MultiByteToWideChar(code_page, flags,
7575 in, insize,
7576 buffer, Py_ARRAY_LENGTH(buffer));
7577 if (outsize > 0)
7578 break;
7579 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007580 if (err == ERROR_INVALID_FLAGS && flags) {
7581 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7582 flags = 0;
7583 continue;
7584 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007585 if (err != ERROR_NO_UNICODE_TRANSLATION
7586 && err != ERROR_INSUFFICIENT_BUFFER)
7587 {
7588 PyErr_SetFromWindowsErr(0);
7589 goto error;
7590 }
7591 insize++;
7592 }
7593 /* 4=maximum length of a UTF-8 sequence */
7594 while (insize <= 4 && (in + insize) <= endin);
7595
7596 if (outsize <= 0) {
7597 Py_ssize_t startinpos, endinpos, outpos;
7598
Victor Stinner7d00cc12014-03-17 23:08:06 +01007599 /* last character in partial decode? */
7600 if (in + insize >= endin && !final)
7601 break;
7602
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 startinpos = in - startin;
7604 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007605 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007606 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 errors, &errorHandler,
7608 encoding, reason,
7609 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007610 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 {
7612 goto error;
7613 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007614 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007615 }
7616 else {
7617 in += insize;
7618 memcpy(out, buffer, outsize * sizeof(wchar_t));
7619 out += outsize;
7620 }
7621 }
7622
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007623 /* Shrink the buffer */
7624 assert(out - *buf <= *bufsize);
7625 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007626 /* (in - startin) <= size and size is an int */
7627 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007628
7629error:
7630 Py_XDECREF(encoding_obj);
7631 Py_XDECREF(errorHandler);
7632 Py_XDECREF(exc);
7633 return ret;
7634}
7635
Victor Stinner3a50e702011-10-18 21:21:00 +02007636static PyObject *
7637decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007638 const char *s, Py_ssize_t size,
7639 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007640{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007641 wchar_t *buf = NULL;
7642 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007643 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007644
Victor Stinner3a50e702011-10-18 21:21:00 +02007645 if (code_page < 0) {
7646 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7647 return NULL;
7648 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007649 if (size < 0) {
7650 PyErr_BadInternalCall();
7651 return NULL;
7652 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007653
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007654 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007656
Victor Stinner76a31a62011-11-04 00:05:13 +01007657 do
7658 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007659#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007660 if (size > DECODING_CHUNK_SIZE) {
7661 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007662 final = 0;
7663 done = 0;
7664 }
7665 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007666#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007667 {
7668 chunk_size = (int)size;
7669 final = (consumed == NULL);
7670 done = 1;
7671 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007672
Victor Stinner76a31a62011-11-04 00:05:13 +01007673 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007674 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007675 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007676 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007677 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007678
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007679 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007680 s, chunk_size);
7681 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007682 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007683 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007684 errors, final);
7685 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007686
7687 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007688 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007689 return NULL;
7690 }
7691
7692 if (consumed)
7693 *consumed += converted;
7694
7695 s += converted;
7696 size -= converted;
7697 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007698
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007699 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7700 PyMem_Free(buf);
7701 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007702}
7703
Alexander Belopolsky40018472011-02-26 01:02:56 +00007704PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007705PyUnicode_DecodeCodePageStateful(int code_page,
7706 const char *s,
7707 Py_ssize_t size,
7708 const char *errors,
7709 Py_ssize_t *consumed)
7710{
7711 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7712}
7713
7714PyObject *
7715PyUnicode_DecodeMBCSStateful(const char *s,
7716 Py_ssize_t size,
7717 const char *errors,
7718 Py_ssize_t *consumed)
7719{
7720 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7721}
7722
7723PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007724PyUnicode_DecodeMBCS(const char *s,
7725 Py_ssize_t size,
7726 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007727{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007728 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7729}
7730
Victor Stinner3a50e702011-10-18 21:21:00 +02007731static DWORD
7732encode_code_page_flags(UINT code_page, const char *errors)
7733{
7734 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007735 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007736 }
7737 else if (code_page == CP_UTF7) {
7738 /* CP_UTF7 only supports flags=0 */
7739 return 0;
7740 }
7741 else {
7742 if (errors != NULL && strcmp(errors, "replace") == 0)
7743 return 0;
7744 else
7745 return WC_NO_BEST_FIT_CHARS;
7746 }
7747}
7748
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007749/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007750 * Encode a Unicode string to a Windows code page into a byte string in strict
7751 * mode.
7752 *
7753 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007754 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007755 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007756static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007757encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007758 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007759 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007760{
Victor Stinner554f3f02010-06-16 23:33:54 +00007761 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007762 BOOL *pusedDefaultChar = &usedDefaultChar;
7763 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007764 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007765 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007766 const DWORD flags = encode_code_page_flags(code_page, NULL);
7767 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007768 /* Create a substring so that we can get the UTF-16 representation
7769 of just the slice under consideration. */
7770 PyObject *substring;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007771 int ret = -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007772
Martin v. Löwis3d325192011-11-04 18:23:06 +01007773 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007774
Victor Stinner3a50e702011-10-18 21:21:00 +02007775 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007776 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007777 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007778 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007779
Victor Stinner2fc507f2011-11-04 20:06:39 +01007780 substring = PyUnicode_Substring(unicode, offset, offset+len);
7781 if (substring == NULL)
7782 return -1;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007783#if USE_UNICODE_WCHAR_CACHE
7784_Py_COMP_DIAG_PUSH
7785_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner2fc507f2011-11-04 20:06:39 +01007786 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7787 if (p == NULL) {
7788 Py_DECREF(substring);
7789 return -1;
7790 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007791_Py_COMP_DIAG_POP
7792#else /* USE_UNICODE_WCHAR_CACHE */
7793 p = PyUnicode_AsWideCharString(substring, &size);
7794 Py_CLEAR(substring);
7795 if (p == NULL) {
7796 return -1;
7797 }
7798#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinner9f067f42013-06-05 00:21:31 +02007799 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007800
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007801 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007802 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007803 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007804 NULL, 0,
7805 NULL, pusedDefaultChar);
7806 if (outsize <= 0)
7807 goto error;
7808 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007809 if (pusedDefaultChar && *pusedDefaultChar) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007810 ret = -2;
7811 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007812 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007813
Victor Stinner3a50e702011-10-18 21:21:00 +02007814 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007816 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007817 if (*outbytes == NULL) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007818 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007819 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007820 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007821 }
7822 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007824 const Py_ssize_t n = PyBytes_Size(*outbytes);
7825 if (outsize > PY_SSIZE_T_MAX - n) {
7826 PyErr_NoMemory();
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007827 goto done;
Victor Stinner3a50e702011-10-18 21:21:00 +02007828 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007829 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007830 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007831 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007832 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007833 }
7834
7835 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007836 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007837 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007838 out, outsize,
7839 NULL, pusedDefaultChar);
7840 if (outsize <= 0)
7841 goto error;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007842 if (pusedDefaultChar && *pusedDefaultChar) {
7843 ret = -2;
7844 goto done;
7845 }
7846 ret = 0;
7847
7848done:
7849#if USE_UNICODE_WCHAR_CACHE
7850 Py_DECREF(substring);
7851#else /* USE_UNICODE_WCHAR_CACHE */
7852 PyMem_Free(p);
7853#endif /* USE_UNICODE_WCHAR_CACHE */
7854 return ret;
Victor Stinner554f3f02010-06-16 23:33:54 +00007855
Victor Stinner3a50e702011-10-18 21:21:00 +02007856error:
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007857 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7858 ret = -2;
7859 goto done;
7860 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007861 PyErr_SetFromWindowsErr(0);
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007862 goto done;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007863}
7864
Victor Stinner3a50e702011-10-18 21:21:00 +02007865/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007866 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007867 * error handler.
7868 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007869 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007870 * -1 on other error.
7871 */
7872static int
7873encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007874 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007875 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007876{
Victor Stinner3a50e702011-10-18 21:21:00 +02007877 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007878 Py_ssize_t pos = unicode_offset;
7879 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007880 /* Ideally, we should get reason from FormatMessage. This is the Windows
7881 2000 English version of the message. */
7882 const char *reason = "invalid character";
7883 /* 4=maximum length of a UTF-8 sequence */
7884 char buffer[4];
7885 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7886 Py_ssize_t outsize;
7887 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007888 PyObject *errorHandler = NULL;
7889 PyObject *exc = NULL;
7890 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007891 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007892 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007893 PyObject *rep;
7894 int ret = -1;
7895
7896 assert(insize > 0);
7897
7898 encoding = code_page_name(code_page, &encoding_obj);
7899 if (encoding == NULL)
7900 return -1;
7901
7902 if (errors == NULL || strcmp(errors, "strict") == 0) {
7903 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7904 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007905 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007906 if (exc != NULL) {
7907 PyCodec_StrictErrors(exc);
7908 Py_DECREF(exc);
7909 }
7910 Py_XDECREF(encoding_obj);
7911 return -1;
7912 }
7913
7914 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7915 pusedDefaultChar = &usedDefaultChar;
7916 else
7917 pusedDefaultChar = NULL;
7918
7919 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7920 PyErr_NoMemory();
7921 goto error;
7922 }
7923 outsize = insize * Py_ARRAY_LENGTH(buffer);
7924
7925 if (*outbytes == NULL) {
7926 /* Create string object */
7927 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7928 if (*outbytes == NULL)
7929 goto error;
7930 out = PyBytes_AS_STRING(*outbytes);
7931 }
7932 else {
7933 /* Extend string object */
7934 Py_ssize_t n = PyBytes_Size(*outbytes);
7935 if (n > PY_SSIZE_T_MAX - outsize) {
7936 PyErr_NoMemory();
7937 goto error;
7938 }
7939 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7940 goto error;
7941 out = PyBytes_AS_STRING(*outbytes) + n;
7942 }
7943
7944 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007945 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007946 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007947 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7948 wchar_t chars[2];
7949 int charsize;
7950 if (ch < 0x10000) {
7951 chars[0] = (wchar_t)ch;
7952 charsize = 1;
7953 }
7954 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007955 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7956 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007957 charsize = 2;
7958 }
7959
Victor Stinner3a50e702011-10-18 21:21:00 +02007960 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007961 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007962 buffer, Py_ARRAY_LENGTH(buffer),
7963 NULL, pusedDefaultChar);
7964 if (outsize > 0) {
7965 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7966 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007967 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007968 memcpy(out, buffer, outsize);
7969 out += outsize;
7970 continue;
7971 }
7972 }
7973 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7974 PyErr_SetFromWindowsErr(0);
7975 goto error;
7976 }
7977
Victor Stinner3a50e702011-10-18 21:21:00 +02007978 rep = unicode_encode_call_errorhandler(
7979 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007980 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007981 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007982 if (rep == NULL)
7983 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007984 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007985
7986 if (PyBytes_Check(rep)) {
7987 outsize = PyBytes_GET_SIZE(rep);
7988 if (outsize != 1) {
7989 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7990 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7991 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7992 Py_DECREF(rep);
7993 goto error;
7994 }
7995 out = PyBytes_AS_STRING(*outbytes) + offset;
7996 }
7997 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7998 out += outsize;
7999 }
8000 else {
8001 Py_ssize_t i;
8002 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008003 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02008004
Benjamin Petersonbac79492012-01-14 13:34:47 -05008005 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02008006 Py_DECREF(rep);
8007 goto error;
8008 }
8009
8010 outsize = PyUnicode_GET_LENGTH(rep);
8011 if (outsize != 1) {
8012 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8013 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8014 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8015 Py_DECREF(rep);
8016 goto error;
8017 }
8018 out = PyBytes_AS_STRING(*outbytes) + offset;
8019 }
8020 kind = PyUnicode_KIND(rep);
8021 data = PyUnicode_DATA(rep);
8022 for (i=0; i < outsize; i++) {
8023 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8024 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008025 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008026 encoding, unicode,
8027 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02008028 "unable to encode error handler result to ASCII");
8029 Py_DECREF(rep);
8030 goto error;
8031 }
8032 *out = (unsigned char)ch;
8033 out++;
8034 }
8035 }
8036 Py_DECREF(rep);
8037 }
8038 /* write a NUL byte */
8039 *out = 0;
8040 outsize = out - PyBytes_AS_STRING(*outbytes);
8041 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8042 if (_PyBytes_Resize(outbytes, outsize) < 0)
8043 goto error;
8044 ret = 0;
8045
8046error:
8047 Py_XDECREF(encoding_obj);
8048 Py_XDECREF(errorHandler);
8049 Py_XDECREF(exc);
8050 return ret;
8051}
8052
Victor Stinner3a50e702011-10-18 21:21:00 +02008053static PyObject *
8054encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008055 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008056 const char *errors)
8057{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008058 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008059 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008060 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008061 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008062
Victor Stinner29dacf22015-01-26 16:41:32 +01008063 if (!PyUnicode_Check(unicode)) {
8064 PyErr_BadArgument();
8065 return NULL;
8066 }
8067
Benjamin Petersonbac79492012-01-14 13:34:47 -05008068 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008069 return NULL;
8070 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008071
Victor Stinner3a50e702011-10-18 21:21:00 +02008072 if (code_page < 0) {
8073 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8074 return NULL;
8075 }
8076
Martin v. Löwis3d325192011-11-04 18:23:06 +01008077 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008078 return PyBytes_FromStringAndSize(NULL, 0);
8079
Victor Stinner7581cef2011-11-03 22:32:33 +01008080 offset = 0;
8081 do
8082 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008083#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008084 if (len > DECODING_CHUNK_SIZE) {
8085 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008086 done = 0;
8087 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008088 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008089#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008090 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008091 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008092 done = 1;
8093 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008094
Victor Stinner76a31a62011-11-04 00:05:13 +01008095 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008096 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008097 errors);
8098 if (ret == -2)
8099 ret = encode_code_page_errors(code_page, &outbytes,
8100 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008101 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008102 if (ret < 0) {
8103 Py_XDECREF(outbytes);
8104 return NULL;
8105 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008106
Victor Stinner7581cef2011-11-03 22:32:33 +01008107 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008108 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008109 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008110
Victor Stinner3a50e702011-10-18 21:21:00 +02008111 return outbytes;
8112}
8113
8114PyObject *
8115PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8116 Py_ssize_t size,
8117 const char *errors)
8118{
Victor Stinner7581cef2011-11-03 22:32:33 +01008119 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008120 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008121 if (unicode == NULL)
8122 return NULL;
8123 res = encode_code_page(CP_ACP, unicode, errors);
8124 Py_DECREF(unicode);
8125 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008126}
8127
8128PyObject *
8129PyUnicode_EncodeCodePage(int code_page,
8130 PyObject *unicode,
8131 const char *errors)
8132{
Victor Stinner7581cef2011-11-03 22:32:33 +01008133 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008134}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008135
Alexander Belopolsky40018472011-02-26 01:02:56 +00008136PyObject *
8137PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008138{
Victor Stinner7581cef2011-11-03 22:32:33 +01008139 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008140}
8141
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008142#undef NEED_RETRY
8143
Steve Dowercc16be82016-09-08 10:35:16 -07008144#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008145
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146/* --- Character Mapping Codec -------------------------------------------- */
8147
Victor Stinnerfb161b12013-04-18 01:44:27 +02008148static int
8149charmap_decode_string(const char *s,
8150 Py_ssize_t size,
8151 PyObject *mapping,
8152 const char *errors,
8153 _PyUnicodeWriter *writer)
8154{
8155 const char *starts = s;
8156 const char *e;
8157 Py_ssize_t startinpos, endinpos;
8158 PyObject *errorHandler = NULL, *exc = NULL;
8159 Py_ssize_t maplen;
8160 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008161 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008162 Py_UCS4 x;
8163 unsigned char ch;
8164
8165 if (PyUnicode_READY(mapping) == -1)
8166 return -1;
8167
8168 maplen = PyUnicode_GET_LENGTH(mapping);
8169 mapdata = PyUnicode_DATA(mapping);
8170 mapkind = PyUnicode_KIND(mapping);
8171
8172 e = s + size;
8173
8174 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8175 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8176 * is disabled in encoding aliases, latin1 is preferred because
8177 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008178 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008179 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8180 Py_UCS4 maxchar = writer->maxchar;
8181
8182 assert (writer->kind == PyUnicode_1BYTE_KIND);
8183 while (s < e) {
8184 ch = *s;
8185 x = mapdata_ucs1[ch];
8186 if (x > maxchar) {
8187 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8188 goto onError;
8189 maxchar = writer->maxchar;
8190 outdata = (Py_UCS1 *)writer->data;
8191 }
8192 outdata[writer->pos] = x;
8193 writer->pos++;
8194 ++s;
8195 }
8196 return 0;
8197 }
8198
8199 while (s < e) {
8200 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8201 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008202 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008203 if (outkind == PyUnicode_1BYTE_KIND) {
8204 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8205 Py_UCS4 maxchar = writer->maxchar;
8206 while (s < e) {
8207 ch = *s;
8208 x = mapdata_ucs2[ch];
8209 if (x > maxchar)
8210 goto Error;
8211 outdata[writer->pos] = x;
8212 writer->pos++;
8213 ++s;
8214 }
8215 break;
8216 }
8217 else if (outkind == PyUnicode_2BYTE_KIND) {
8218 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8219 while (s < e) {
8220 ch = *s;
8221 x = mapdata_ucs2[ch];
8222 if (x == 0xFFFE)
8223 goto Error;
8224 outdata[writer->pos] = x;
8225 writer->pos++;
8226 ++s;
8227 }
8228 break;
8229 }
8230 }
8231 ch = *s;
8232
8233 if (ch < maplen)
8234 x = PyUnicode_READ(mapkind, mapdata, ch);
8235 else
8236 x = 0xfffe; /* invalid value */
8237Error:
8238 if (x == 0xfffe)
8239 {
8240 /* undefined mapping */
8241 startinpos = s-starts;
8242 endinpos = startinpos+1;
8243 if (unicode_decode_call_errorhandler_writer(
8244 errors, &errorHandler,
8245 "charmap", "character maps to <undefined>",
8246 &starts, &e, &startinpos, &endinpos, &exc, &s,
8247 writer)) {
8248 goto onError;
8249 }
8250 continue;
8251 }
8252
8253 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8254 goto onError;
8255 ++s;
8256 }
8257 Py_XDECREF(errorHandler);
8258 Py_XDECREF(exc);
8259 return 0;
8260
8261onError:
8262 Py_XDECREF(errorHandler);
8263 Py_XDECREF(exc);
8264 return -1;
8265}
8266
8267static int
8268charmap_decode_mapping(const char *s,
8269 Py_ssize_t size,
8270 PyObject *mapping,
8271 const char *errors,
8272 _PyUnicodeWriter *writer)
8273{
8274 const char *starts = s;
8275 const char *e;
8276 Py_ssize_t startinpos, endinpos;
8277 PyObject *errorHandler = NULL, *exc = NULL;
8278 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008279 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008280
8281 e = s + size;
8282
8283 while (s < e) {
8284 ch = *s;
8285
8286 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8287 key = PyLong_FromLong((long)ch);
8288 if (key == NULL)
8289 goto onError;
8290
8291 item = PyObject_GetItem(mapping, key);
8292 Py_DECREF(key);
8293 if (item == NULL) {
8294 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8295 /* No mapping found means: mapping is undefined. */
8296 PyErr_Clear();
8297 goto Undefined;
8298 } else
8299 goto onError;
8300 }
8301
8302 /* Apply mapping */
8303 if (item == Py_None)
8304 goto Undefined;
8305 if (PyLong_Check(item)) {
8306 long value = PyLong_AS_LONG(item);
8307 if (value == 0xFFFE)
8308 goto Undefined;
8309 if (value < 0 || value > MAX_UNICODE) {
8310 PyErr_Format(PyExc_TypeError,
Max Bernstein36353882020-10-17 13:38:21 -07008311 "character mapping must be in range(0x%x)",
Victor Stinnerfb161b12013-04-18 01:44:27 +02008312 (unsigned long)MAX_UNICODE + 1);
8313 goto onError;
8314 }
8315
8316 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8317 goto onError;
8318 }
8319 else if (PyUnicode_Check(item)) {
8320 if (PyUnicode_READY(item) == -1)
8321 goto onError;
8322 if (PyUnicode_GET_LENGTH(item) == 1) {
8323 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8324 if (value == 0xFFFE)
8325 goto Undefined;
8326 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8327 goto onError;
8328 }
8329 else {
8330 writer->overallocate = 1;
8331 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8332 goto onError;
8333 }
8334 }
8335 else {
8336 /* wrong return value */
8337 PyErr_SetString(PyExc_TypeError,
8338 "character mapping must return integer, None or str");
8339 goto onError;
8340 }
8341 Py_CLEAR(item);
8342 ++s;
8343 continue;
8344
8345Undefined:
8346 /* undefined mapping */
8347 Py_CLEAR(item);
8348 startinpos = s-starts;
8349 endinpos = startinpos+1;
8350 if (unicode_decode_call_errorhandler_writer(
8351 errors, &errorHandler,
8352 "charmap", "character maps to <undefined>",
8353 &starts, &e, &startinpos, &endinpos, &exc, &s,
8354 writer)) {
8355 goto onError;
8356 }
8357 }
8358 Py_XDECREF(errorHandler);
8359 Py_XDECREF(exc);
8360 return 0;
8361
8362onError:
8363 Py_XDECREF(item);
8364 Py_XDECREF(errorHandler);
8365 Py_XDECREF(exc);
8366 return -1;
8367}
8368
Alexander Belopolsky40018472011-02-26 01:02:56 +00008369PyObject *
8370PyUnicode_DecodeCharmap(const char *s,
8371 Py_ssize_t size,
8372 PyObject *mapping,
8373 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008375 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008376
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377 /* Default to Latin-1 */
8378 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008382 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008383 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008384 writer.min_length = size;
8385 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008387
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008388 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008389 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8390 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008391 }
8392 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008393 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8394 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008396 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008397
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008399 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 return NULL;
8401}
8402
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008403/* Charmap encoding: the lookup table */
8404
Alexander Belopolsky40018472011-02-26 01:02:56 +00008405struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 PyObject_HEAD
8407 unsigned char level1[32];
8408 int count2, count3;
8409 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008410};
8411
8412static PyObject*
8413encoding_map_size(PyObject *obj, PyObject* args)
8414{
8415 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008416 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008418}
8419
8420static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008421 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 PyDoc_STR("Return the size (in bytes) of this object") },
8423 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008424};
8425
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008426static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008427 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 "EncodingMap", /*tp_name*/
8429 sizeof(struct encoding_map), /*tp_basicsize*/
8430 0, /*tp_itemsize*/
8431 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008432 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008433 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 0, /*tp_getattr*/
8435 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008436 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 0, /*tp_repr*/
8438 0, /*tp_as_number*/
8439 0, /*tp_as_sequence*/
8440 0, /*tp_as_mapping*/
8441 0, /*tp_hash*/
8442 0, /*tp_call*/
8443 0, /*tp_str*/
8444 0, /*tp_getattro*/
8445 0, /*tp_setattro*/
8446 0, /*tp_as_buffer*/
8447 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8448 0, /*tp_doc*/
8449 0, /*tp_traverse*/
8450 0, /*tp_clear*/
8451 0, /*tp_richcompare*/
8452 0, /*tp_weaklistoffset*/
8453 0, /*tp_iter*/
8454 0, /*tp_iternext*/
8455 encoding_map_methods, /*tp_methods*/
8456 0, /*tp_members*/
8457 0, /*tp_getset*/
8458 0, /*tp_base*/
8459 0, /*tp_dict*/
8460 0, /*tp_descr_get*/
8461 0, /*tp_descr_set*/
8462 0, /*tp_dictoffset*/
8463 0, /*tp_init*/
8464 0, /*tp_alloc*/
8465 0, /*tp_new*/
8466 0, /*tp_free*/
8467 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008468};
8469
8470PyObject*
8471PyUnicode_BuildEncodingMap(PyObject* string)
8472{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008473 PyObject *result;
8474 struct encoding_map *mresult;
8475 int i;
8476 int need_dict = 0;
8477 unsigned char level1[32];
8478 unsigned char level2[512];
8479 unsigned char *mlevel1, *mlevel2, *mlevel3;
8480 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008482 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008483 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008485
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008486 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008487 PyErr_BadArgument();
8488 return NULL;
8489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 kind = PyUnicode_KIND(string);
8491 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008492 length = PyUnicode_GET_LENGTH(string);
8493 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008494 memset(level1, 0xFF, sizeof level1);
8495 memset(level2, 0xFF, sizeof level2);
8496
8497 /* If there isn't a one-to-one mapping of NULL to \0,
8498 or if there are non-BMP characters, we need to use
8499 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008500 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008501 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008502 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008503 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 ch = PyUnicode_READ(kind, data, i);
8505 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008506 need_dict = 1;
8507 break;
8508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008510 /* unmapped character */
8511 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 l1 = ch >> 11;
8513 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008514 if (level1[l1] == 0xFF)
8515 level1[l1] = count2++;
8516 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008517 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008518 }
8519
8520 if (count2 >= 0xFF || count3 >= 0xFF)
8521 need_dict = 1;
8522
8523 if (need_dict) {
8524 PyObject *result = PyDict_New();
8525 PyObject *key, *value;
8526 if (!result)
8527 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008528 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008530 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008531 if (!key || !value)
8532 goto failed1;
8533 if (PyDict_SetItem(result, key, value) == -1)
8534 goto failed1;
8535 Py_DECREF(key);
8536 Py_DECREF(value);
8537 }
8538 return result;
8539 failed1:
8540 Py_XDECREF(key);
8541 Py_XDECREF(value);
8542 Py_DECREF(result);
8543 return NULL;
8544 }
8545
8546 /* Create a three-level trie */
8547 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8548 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008549 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008550 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008551 }
8552
8553 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008554 mresult = (struct encoding_map*)result;
8555 mresult->count2 = count2;
8556 mresult->count3 = count3;
8557 mlevel1 = mresult->level1;
8558 mlevel2 = mresult->level23;
8559 mlevel3 = mresult->level23 + 16*count2;
8560 memcpy(mlevel1, level1, 32);
8561 memset(mlevel2, 0xFF, 16*count2);
8562 memset(mlevel3, 0, 128*count3);
8563 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008564 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008565 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008566 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8567 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008568 /* unmapped character */
8569 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008570 o1 = ch>>11;
8571 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008572 i2 = 16*mlevel1[o1] + o2;
8573 if (mlevel2[i2] == 0xFF)
8574 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008575 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008576 i3 = 128*mlevel2[i2] + o3;
8577 mlevel3[i3] = i;
8578 }
8579 return result;
8580}
8581
8582static int
Victor Stinner22168992011-11-20 17:09:18 +01008583encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008584{
8585 struct encoding_map *map = (struct encoding_map*)mapping;
8586 int l1 = c>>11;
8587 int l2 = (c>>7) & 0xF;
8588 int l3 = c & 0x7F;
8589 int i;
8590
Victor Stinner22168992011-11-20 17:09:18 +01008591 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008593 if (c == 0)
8594 return 0;
8595 /* level 1*/
8596 i = map->level1[l1];
8597 if (i == 0xFF) {
8598 return -1;
8599 }
8600 /* level 2*/
8601 i = map->level23[16*i+l2];
8602 if (i == 0xFF) {
8603 return -1;
8604 }
8605 /* level 3 */
8606 i = map->level23[16*map->count2 + 128*i + l3];
8607 if (i == 0) {
8608 return -1;
8609 }
8610 return i;
8611}
8612
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008613/* Lookup the character ch in the mapping. If the character
8614 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008615 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008616static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008617charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618{
Christian Heimes217cfd12007-12-02 14:31:20 +00008619 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 PyObject *x;
8621
8622 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008623 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008624 x = PyObject_GetItem(mapping, w);
8625 Py_DECREF(w);
8626 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8628 /* No mapping found means: mapping is undefined. */
8629 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008630 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 } else
8632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008634 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008636 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 long value = PyLong_AS_LONG(x);
8638 if (value < 0 || value > 255) {
8639 PyErr_SetString(PyExc_TypeError,
8640 "character mapping must be in range(256)");
8641 Py_DECREF(x);
8642 return NULL;
8643 }
8644 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008646 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 /* wrong return value */
8650 PyErr_Format(PyExc_TypeError,
8651 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008652 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 Py_DECREF(x);
8654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 }
8656}
8657
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008658static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008659charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008660{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008661 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8662 /* exponentially overallocate to minimize reallocations */
8663 if (requiredsize < 2*outsize)
8664 requiredsize = 2*outsize;
8665 if (_PyBytes_Resize(outobj, requiredsize))
8666 return -1;
8667 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008668}
8669
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008674 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 space is available. Return a new reference to the object that
8676 was put in the output buffer, or Py_None, if the mapping was undefined
8677 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008678 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008679static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008680charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008681 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008683 PyObject *rep;
8684 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008685 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686
Andy Lesterdffe4c02020-03-04 07:15:20 -06008687 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008688 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008690 if (res == -1)
8691 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 if (outsize<requiredsize)
8693 if (charmapencode_resize(outobj, outpos, requiredsize))
8694 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008695 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 outstart[(*outpos)++] = (char)res;
8697 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008698 }
8699
8700 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008703 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 Py_DECREF(rep);
8705 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008706 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008707 if (PyLong_Check(rep)) {
8708 Py_ssize_t requiredsize = *outpos+1;
8709 if (outsize<requiredsize)
8710 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8711 Py_DECREF(rep);
8712 return enc_EXCEPTION;
8713 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008714 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008716 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 else {
8718 const char *repchars = PyBytes_AS_STRING(rep);
8719 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8720 Py_ssize_t requiredsize = *outpos+repsize;
8721 if (outsize<requiredsize)
8722 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8723 Py_DECREF(rep);
8724 return enc_EXCEPTION;
8725 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008726 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 memcpy(outstart + *outpos, repchars, repsize);
8728 *outpos += repsize;
8729 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008731 Py_DECREF(rep);
8732 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733}
8734
8735/* handle an error in PyUnicode_EncodeCharmap
8736 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008737static int
8738charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008739 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008740 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008741 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008742 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008743{
8744 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008745 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008746 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008747 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008748 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008749 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008750 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008751 Py_ssize_t collstartpos = *inpos;
8752 Py_ssize_t collendpos = *inpos+1;
8753 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008754 const char *encoding = "charmap";
8755 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008756 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008757 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008758 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008759
Benjamin Petersonbac79492012-01-14 13:34:47 -05008760 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008761 return -1;
8762 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763 /* find all unencodable characters */
8764 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008765 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008766 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008767 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008768 val = encoding_map_lookup(ch, mapping);
8769 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 break;
8771 ++collendpos;
8772 continue;
8773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008774
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008775 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8776 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 if (rep==NULL)
8778 return -1;
8779 else if (rep!=Py_None) {
8780 Py_DECREF(rep);
8781 break;
8782 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008783 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785 }
8786 /* cache callback name lookup
8787 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008788 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008789 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008790
8791 switch (*error_handler) {
8792 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008793 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008794 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008795
8796 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008797 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 x = charmapencode_output('?', mapping, res, respos);
8799 if (x==enc_EXCEPTION) {
8800 return -1;
8801 }
8802 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008803 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 return -1;
8805 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008806 }
8807 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008808 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008809 *inpos = collendpos;
8810 break;
Victor Stinner50149202015-09-22 00:26:54 +02008811
8812 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008813 /* generate replacement (temporarily (mis)uses p) */
8814 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 char buffer[2+29+1+1];
8816 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008817 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 for (cp = buffer; *cp; ++cp) {
8819 x = charmapencode_output(*cp, mapping, res, respos);
8820 if (x==enc_EXCEPTION)
8821 return -1;
8822 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008823 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 return -1;
8825 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008826 }
8827 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008828 *inpos = collendpos;
8829 break;
Victor Stinner50149202015-09-22 00:26:54 +02008830
Benjamin Peterson14339b62009-01-31 16:36:08 +00008831 default:
Victor Stinner50149202015-09-22 00:26:54 +02008832 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008833 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008835 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008837 if (PyBytes_Check(repunicode)) {
8838 /* Directly copy bytes result to output. */
8839 Py_ssize_t outsize = PyBytes_Size(*res);
8840 Py_ssize_t requiredsize;
8841 repsize = PyBytes_Size(repunicode);
8842 requiredsize = *respos + repsize;
8843 if (requiredsize > outsize)
8844 /* Make room for all additional bytes. */
8845 if (charmapencode_resize(res, respos, requiredsize)) {
8846 Py_DECREF(repunicode);
8847 return -1;
8848 }
8849 memcpy(PyBytes_AsString(*res) + *respos,
8850 PyBytes_AsString(repunicode), repsize);
8851 *respos += repsize;
8852 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008853 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008854 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008855 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008856 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008857 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008858 Py_DECREF(repunicode);
8859 return -1;
8860 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008861 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008862 data = PyUnicode_DATA(repunicode);
8863 kind = PyUnicode_KIND(repunicode);
8864 for (index = 0; index < repsize; index++) {
8865 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8866 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008867 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008868 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 return -1;
8870 }
8871 else if (x==enc_FAILED) {
8872 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008873 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 return -1;
8875 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008876 }
8877 *inpos = newpos;
8878 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008879 }
8880 return 0;
8881}
8882
Alexander Belopolsky40018472011-02-26 01:02:56 +00008883PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008884_PyUnicode_EncodeCharmap(PyObject *unicode,
8885 PyObject *mapping,
8886 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008888 /* output object */
8889 PyObject *res = NULL;
8890 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008891 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008892 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008893 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008894 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008895 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008896 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008897 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008898 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008899 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008900
Benjamin Petersonbac79492012-01-14 13:34:47 -05008901 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008902 return NULL;
8903 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008904 data = PyUnicode_DATA(unicode);
8905 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008906
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907 /* Default to Latin-1 */
8908 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008909 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008911 /* allocate enough for a simple encoding without
8912 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008913 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008914 if (res == NULL)
8915 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008916 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008917 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008919 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008920 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008921 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008922 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 if (x==enc_EXCEPTION) /* error */
8924 goto onError;
8925 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008926 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008928 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 &res, &respos)) {
8930 goto onError;
8931 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008932 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 else
8934 /* done with this character => adjust input position */
8935 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008938 /* Resize if we allocated too much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008939 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008940 if (_PyBytes_Resize(&res, respos) < 0)
8941 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008942
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008943 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008944 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008945 return res;
8946
Benjamin Peterson29060642009-01-31 22:14:21 +00008947 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008948 Py_XDECREF(res);
8949 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008950 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 return NULL;
8952}
8953
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008954/* Deprecated */
8955PyObject *
8956PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8957 Py_ssize_t size,
8958 PyObject *mapping,
8959 const char *errors)
8960{
8961 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008962 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008963 if (unicode == NULL)
8964 return NULL;
8965 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8966 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008967 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008968}
8969
Alexander Belopolsky40018472011-02-26 01:02:56 +00008970PyObject *
8971PyUnicode_AsCharmapString(PyObject *unicode,
8972 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973{
8974 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 PyErr_BadArgument();
8976 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008978 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979}
8980
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008981/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008982static void
8983make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008985 Py_ssize_t startpos, Py_ssize_t endpos,
8986 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008988 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 *exceptionObject = _PyUnicodeTranslateError_Create(
8990 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991 }
8992 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008993 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8994 goto onError;
8995 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8996 goto onError;
8997 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8998 goto onError;
8999 return;
9000 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02009001 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002 }
9003}
9004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009005/* error handling callback helper:
9006 build arguments, call the callback and check the arguments,
9007 put the result into newpos and return the replacement string, which
9008 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009009static PyObject *
9010unicode_translate_call_errorhandler(const char *errors,
9011 PyObject **errorHandler,
9012 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009014 Py_ssize_t startpos, Py_ssize_t endpos,
9015 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009016{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009017 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009018
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009019 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009020 PyObject *restuple;
9021 PyObject *resunicode;
9022
9023 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009024 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009025 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009026 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009027 }
9028
9029 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009031 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009033
Petr Viktorinffd97532020-02-11 17:46:57 +01009034 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009035 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009037 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009038 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00009039 Py_DECREF(restuple);
9040 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009041 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009042 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009043 &resunicode, &i_newpos)) {
9044 Py_DECREF(restuple);
9045 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009046 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009047 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009049 else
9050 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009052 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009053 Py_DECREF(restuple);
9054 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009055 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009056 Py_INCREF(resunicode);
9057 Py_DECREF(restuple);
9058 return resunicode;
9059}
9060
9061/* Lookup the character ch in the mapping and put the result in result,
9062 which must be decrefed by the caller.
9063 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009064static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009065charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009066{
Christian Heimes217cfd12007-12-02 14:31:20 +00009067 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009068 PyObject *x;
9069
9070 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009071 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009072 x = PyObject_GetItem(mapping, w);
9073 Py_DECREF(w);
9074 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9076 /* No mapping found means: use 1:1 mapping. */
9077 PyErr_Clear();
9078 *result = NULL;
9079 return 0;
9080 } else
9081 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009082 }
9083 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009084 *result = x;
9085 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009086 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009087 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009089 if (value < 0 || value > MAX_UNICODE) {
9090 PyErr_Format(PyExc_ValueError,
9091 "character mapping must be in range(0x%x)",
9092 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009093 Py_DECREF(x);
9094 return -1;
9095 }
9096 *result = x;
9097 return 0;
9098 }
9099 else if (PyUnicode_Check(x)) {
9100 *result = x;
9101 return 0;
9102 }
9103 else {
9104 /* wrong return value */
9105 PyErr_SetString(PyExc_TypeError,
9106 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009107 Py_DECREF(x);
9108 return -1;
9109 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009110}
Victor Stinner1194ea02014-04-04 19:37:40 +02009111
9112/* lookup the character, write the result into the writer.
9113 Return 1 if the result was written into the writer, return 0 if the mapping
9114 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009115static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009116charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9117 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009118{
Victor Stinner1194ea02014-04-04 19:37:40 +02009119 PyObject *item;
9120
9121 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009123
9124 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009126 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009129 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009130 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009131
9132 if (item == Py_None) {
9133 Py_DECREF(item);
9134 return 0;
9135 }
9136
9137 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009138 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9139 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9140 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009141 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9142 Py_DECREF(item);
9143 return -1;
9144 }
9145 Py_DECREF(item);
9146 return 1;
9147 }
9148
9149 if (!PyUnicode_Check(item)) {
9150 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009152 }
9153
9154 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9155 Py_DECREF(item);
9156 return -1;
9157 }
9158
9159 Py_DECREF(item);
9160 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009161}
9162
Victor Stinner89a76ab2014-04-05 11:44:04 +02009163static int
9164unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9165 Py_UCS1 *translate)
9166{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009167 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009168 int ret = 0;
9169
Victor Stinner89a76ab2014-04-05 11:44:04 +02009170 if (charmaptranslate_lookup(ch, mapping, &item)) {
9171 return -1;
9172 }
9173
9174 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009175 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009176 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009177 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009178 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009179 /* not found => default to 1:1 mapping */
9180 translate[ch] = ch;
9181 return 1;
9182 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009183 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009184 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009185 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9186 used it */
9187 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009188 /* invalid character or character outside ASCII:
9189 skip the fast translate */
9190 goto exit;
9191 }
9192 translate[ch] = (Py_UCS1)replace;
9193 }
9194 else if (PyUnicode_Check(item)) {
9195 Py_UCS4 replace;
9196
9197 if (PyUnicode_READY(item) == -1) {
9198 Py_DECREF(item);
9199 return -1;
9200 }
9201 if (PyUnicode_GET_LENGTH(item) != 1)
9202 goto exit;
9203
9204 replace = PyUnicode_READ_CHAR(item, 0);
9205 if (replace > 127)
9206 goto exit;
9207 translate[ch] = (Py_UCS1)replace;
9208 }
9209 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009210 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009211 goto exit;
9212 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009213 ret = 1;
9214
Benjamin Peterson1365de72014-04-07 20:15:41 -04009215 exit:
9216 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009217 return ret;
9218}
9219
9220/* Fast path for ascii => ascii translation. Return 1 if the whole string
9221 was translated into writer, return 0 if the input string was partially
9222 translated into writer, raise an exception and return -1 on error. */
9223static int
9224unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009225 _PyUnicodeWriter *writer, int ignore,
9226 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009227{
Victor Stinner872b2912014-04-05 14:27:07 +02009228 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009229 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009230 const Py_UCS1 *in, *end;
9231 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009232 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009233
Victor Stinner89a76ab2014-04-05 11:44:04 +02009234 len = PyUnicode_GET_LENGTH(input);
9235
Victor Stinner872b2912014-04-05 14:27:07 +02009236 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009237
9238 in = PyUnicode_1BYTE_DATA(input);
9239 end = in + len;
9240
9241 assert(PyUnicode_IS_ASCII(writer->buffer));
9242 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9243 out = PyUnicode_1BYTE_DATA(writer->buffer);
9244
Victor Stinner872b2912014-04-05 14:27:07 +02009245 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009246 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009247 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009248 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009249 int translate = unicode_fast_translate_lookup(mapping, ch,
9250 ascii_table);
9251 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009252 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009253 if (translate == 0)
9254 goto exit;
9255 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009256 }
Victor Stinner872b2912014-04-05 14:27:07 +02009257 if (ch2 == 0xfe) {
9258 if (ignore)
9259 continue;
9260 goto exit;
9261 }
9262 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009263 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009264 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009265 }
Victor Stinner872b2912014-04-05 14:27:07 +02009266 res = 1;
9267
9268exit:
9269 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009270 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009271 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009272}
9273
Victor Stinner3222da22015-10-01 22:07:32 +02009274static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275_PyUnicode_TranslateCharmap(PyObject *input,
9276 PyObject *mapping,
9277 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009280 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 Py_ssize_t size, i;
9282 int kind;
9283 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009284 _PyUnicodeWriter writer;
9285 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009286 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009287 PyObject *errorHandler = NULL;
9288 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009289 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009290 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009291
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009293 PyErr_BadArgument();
9294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009295 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 if (PyUnicode_READY(input) == -1)
9298 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009299 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300 kind = PyUnicode_KIND(input);
9301 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009303 if (size == 0)
9304 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009306 /* allocate enough for a simple 1:1 translation without
9307 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009308 _PyUnicodeWriter_Init(&writer);
9309 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009310 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009311
Victor Stinner872b2912014-04-05 14:27:07 +02009312 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9313
Victor Stinner33798672016-03-01 21:59:58 +01009314 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009315 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009316 if (PyUnicode_IS_ASCII(input)) {
9317 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9318 if (res < 0) {
9319 _PyUnicodeWriter_Dealloc(&writer);
9320 return NULL;
9321 }
9322 if (res == 1)
9323 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009324 }
Victor Stinner33798672016-03-01 21:59:58 +01009325 else {
9326 i = 0;
9327 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009330 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009331 int translate;
9332 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9333 Py_ssize_t newpos;
9334 /* startpos for collecting untranslatable chars */
9335 Py_ssize_t collstart;
9336 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009337 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009338
Victor Stinner1194ea02014-04-04 19:37:40 +02009339 ch = PyUnicode_READ(kind, data, i);
9340 translate = charmaptranslate_output(ch, mapping, &writer);
9341 if (translate < 0)
9342 goto onError;
9343
9344 if (translate != 0) {
9345 /* it worked => adjust input pointer */
9346 ++i;
9347 continue;
9348 }
9349
9350 /* untranslatable character */
9351 collstart = i;
9352 collend = i+1;
9353
9354 /* find all untranslatable characters */
9355 while (collend < size) {
9356 PyObject *x;
9357 ch = PyUnicode_READ(kind, data, collend);
9358 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009359 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009360 Py_XDECREF(x);
9361 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009362 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009363 ++collend;
9364 }
9365
9366 if (ignore) {
9367 i = collend;
9368 }
9369 else {
9370 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9371 reason, input, &exc,
9372 collstart, collend, &newpos);
9373 if (repunicode == NULL)
9374 goto onError;
9375 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009376 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009377 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009378 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009379 Py_DECREF(repunicode);
9380 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009381 }
9382 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009383 Py_XDECREF(exc);
9384 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009385 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009388 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009389 Py_XDECREF(exc);
9390 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391 return NULL;
9392}
9393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394/* Deprecated. Use PyUnicode_Translate instead. */
9395PyObject *
9396PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9397 Py_ssize_t size,
9398 PyObject *mapping,
9399 const char *errors)
9400{
Christian Heimes5f520f42012-09-11 14:03:25 +02009401 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009402 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403 if (!unicode)
9404 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009405 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9406 Py_DECREF(unicode);
9407 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408}
9409
Alexander Belopolsky40018472011-02-26 01:02:56 +00009410PyObject *
9411PyUnicode_Translate(PyObject *str,
9412 PyObject *mapping,
9413 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009414{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009415 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009416 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009417 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418}
Tim Petersced69f82003-09-16 20:30:58 +00009419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420PyObject *
9421_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9422{
9423 if (!PyUnicode_Check(unicode)) {
9424 PyErr_BadInternalCall();
9425 return NULL;
9426 }
9427 if (PyUnicode_READY(unicode) == -1)
9428 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009429 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 /* If the string is already ASCII, just return the same string */
9431 Py_INCREF(unicode);
9432 return unicode;
9433 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009434
9435 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9436 PyObject *result = PyUnicode_New(len, 127);
9437 if (result == NULL) {
9438 return NULL;
9439 }
9440
9441 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9442 int kind = PyUnicode_KIND(unicode);
9443 const void *data = PyUnicode_DATA(unicode);
9444 Py_ssize_t i;
9445 for (i = 0; i < len; ++i) {
9446 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9447 if (ch < 127) {
9448 out[i] = ch;
9449 }
9450 else if (Py_UNICODE_ISSPACE(ch)) {
9451 out[i] = ' ';
9452 }
9453 else {
9454 int decimal = Py_UNICODE_TODECIMAL(ch);
9455 if (decimal < 0) {
9456 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009457 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009458 _PyUnicode_LENGTH(result) = i + 1;
9459 break;
9460 }
9461 out[i] = '0' + decimal;
9462 }
9463 }
9464
INADA Naoki16dfca42018-07-14 12:06:43 +09009465 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009466 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467}
9468
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009469PyObject *
9470PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9471 Py_ssize_t length)
9472{
Victor Stinnerf0124502011-11-21 23:12:56 +01009473 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009474 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009475 Py_UCS4 maxchar;
9476 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009477 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009478
Victor Stinner99d7ad02012-02-22 13:37:39 +01009479 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009480 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009481 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009482 if (ch > 127) {
9483 int decimal = Py_UNICODE_TODECIMAL(ch);
9484 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009485 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009486 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009487 }
9488 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009489
9490 /* Copy to a new string */
9491 decimal = PyUnicode_New(length, maxchar);
9492 if (decimal == NULL)
9493 return decimal;
9494 kind = PyUnicode_KIND(decimal);
9495 data = PyUnicode_DATA(decimal);
9496 /* Iterate over code points */
9497 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009498 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009499 if (ch > 127) {
9500 int decimal = Py_UNICODE_TODECIMAL(ch);
9501 if (decimal >= 0)
9502 ch = '0' + decimal;
9503 }
9504 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009506 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009507}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009508/* --- Decimal Encoder ---------------------------------------------------- */
9509
Alexander Belopolsky40018472011-02-26 01:02:56 +00009510int
9511PyUnicode_EncodeDecimal(Py_UNICODE *s,
9512 Py_ssize_t length,
9513 char *output,
9514 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009515{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009516 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009517 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009518 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009519 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009520
9521 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 PyErr_BadArgument();
9523 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009524 }
9525
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009526 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009527 if (unicode == NULL)
9528 return -1;
9529
Victor Stinner42bf7752011-11-21 22:52:58 +01009530 kind = PyUnicode_KIND(unicode);
9531 data = PyUnicode_DATA(unicode);
9532
Victor Stinnerb84d7232011-11-22 01:50:07 +01009533 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009534 PyObject *exc;
9535 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009536 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009537 Py_ssize_t startpos;
9538
9539 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009540
Benjamin Peterson29060642009-01-31 22:14:21 +00009541 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009542 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009543 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009545 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 decimal = Py_UNICODE_TODECIMAL(ch);
9547 if (decimal >= 0) {
9548 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009549 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 continue;
9551 }
9552 if (0 < ch && ch < 256) {
9553 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009554 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 continue;
9556 }
Victor Stinner6345be92011-11-25 20:09:01 +01009557
Victor Stinner42bf7752011-11-21 22:52:58 +01009558 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009559 exc = NULL;
9560 raise_encode_exception(&exc, "decimal", unicode,
9561 startpos, startpos+1,
9562 "invalid decimal Unicode string");
9563 Py_XDECREF(exc);
9564 Py_DECREF(unicode);
9565 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009566 }
9567 /* 0-terminate the output string */
9568 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009569 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009570 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009571}
9572
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573/* --- Helpers ------------------------------------------------------------ */
9574
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len` when larger; a negative `end` or `start` is
   taken relative to `len` and clamped to 0 if still negative.  A `start`
   greater than `len` is left untouched.

   `start` and `end` must be modifiable lvalues.  The body is wrapped in
   do { } while (0) so the macro expands to a single statement and can be
   used safely in an unbraced if/else; invoke it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009591any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009593 Py_ssize_t end,
9594 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009596 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009597 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 Py_ssize_t len1, len2, result;
9599
9600 kind1 = PyUnicode_KIND(s1);
9601 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009602 if (kind1 < kind2)
9603 return -1;
9604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 len1 = PyUnicode_GET_LENGTH(s1);
9606 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009607 ADJUST_INDICES(start, end, len1);
9608 if (end - start < len2)
9609 return -1;
9610
9611 buf1 = PyUnicode_DATA(s1);
9612 buf2 = PyUnicode_DATA(s2);
9613 if (len2 == 1) {
9614 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9615 result = findchar((const char *)buf1 + kind1*start,
9616 kind1, end - start, ch, direction);
9617 if (result == -1)
9618 return -1;
9619 else
9620 return start + result;
9621 }
9622
9623 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009624 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009625 if (!buf2)
9626 return -2;
9627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628
Victor Stinner794d5672011-10-10 03:21:36 +02009629 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009630 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009631 case PyUnicode_1BYTE_KIND:
9632 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9633 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9634 else
9635 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9636 break;
9637 case PyUnicode_2BYTE_KIND:
9638 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9639 break;
9640 case PyUnicode_4BYTE_KIND:
9641 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9642 break;
9643 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009644 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009645 }
9646 }
9647 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009648 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009649 case PyUnicode_1BYTE_KIND:
9650 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9651 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9652 else
9653 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9654 break;
9655 case PyUnicode_2BYTE_KIND:
9656 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9657 break;
9658 case PyUnicode_4BYTE_KIND:
9659 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9660 break;
9661 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009662 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009664 }
9665
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009666 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009667 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009668 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669
9670 return result;
9671}
9672
Victor Stinner59423e32018-11-26 13:40:01 +01009673/* _PyUnicode_InsertThousandsGrouping() helper functions */
9674#include "stringlib/localeutil.h"
9675
9676/**
9677 * InsertThousandsGrouping:
9678 * @writer: Unicode writer.
9679 * @n_buffer: Number of characters in @buffer.
9680 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9681 * @d_pos: Start of digits string.
9682 * @n_digits: The number of digits in the string, in which we want
9683 * to put the grouping chars.
9684 * @min_width: The minimum width of the digits in the output string.
9685 * Output will be zero-padded on the left to fill.
9686 * @grouping: see definition in localeconv().
9687 * @thousands_sep: see definition in localeconv().
9688 *
9689 * There are 2 modes: counting and filling. If @writer is NULL,
9690 * we are in counting mode, else filling mode.
9691 * If counting, the required buffer size is returned.
9692 * If filling, we know the buffer will be large enough, so we don't
9693 * need to pass in the buffer size.
9694 * Inserts thousand grouping characters (as defined by grouping and
9695 * thousands_sep) into @writer.
9696 *
9697 * Return value: -1 on error, number of characters otherwise.
9698 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009700_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009701 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009702 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009703 PyObject *digits,
9704 Py_ssize_t d_pos,
9705 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009706 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009707 const char *grouping,
9708 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009709 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710{
Xtreak3f7983a2019-01-07 20:39:14 +05309711 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009712 if (writer) {
9713 assert(digits != NULL);
9714 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009715 }
9716 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009717 assert(digits == NULL);
9718 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009719 }
Victor Stinner59423e32018-11-26 13:40:01 +01009720 assert(0 <= d_pos);
9721 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009722 assert(grouping != NULL);
9723
9724 if (digits != NULL) {
9725 if (PyUnicode_READY(digits) == -1) {
9726 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009727 }
Victor Stinner59423e32018-11-26 13:40:01 +01009728 }
9729 if (PyUnicode_READY(thousands_sep) == -1) {
9730 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009731 }
9732
Victor Stinner59423e32018-11-26 13:40:01 +01009733 Py_ssize_t count = 0;
9734 Py_ssize_t n_zeros;
9735 int loop_broken = 0;
9736 int use_separator = 0; /* First time through, don't append the
9737 separator. They only go between
9738 groups. */
9739 Py_ssize_t buffer_pos;
9740 Py_ssize_t digits_pos;
9741 Py_ssize_t len;
9742 Py_ssize_t n_chars;
9743 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9744 be looked at */
9745 /* A generator that returns all of the grouping widths, until it
9746 returns 0. */
9747 GroupGenerator groupgen;
9748 GroupGenerator_init(&groupgen, grouping);
9749 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9750
9751 /* if digits are not grouped, thousands separator
9752 should be an empty string */
9753 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9754
9755 digits_pos = d_pos + n_digits;
9756 if (writer) {
9757 buffer_pos = writer->pos + n_buffer;
9758 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9759 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 }
Victor Stinner59423e32018-11-26 13:40:01 +01009761 else {
9762 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009763 }
Victor Stinner59423e32018-11-26 13:40:01 +01009764
9765 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009766 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009767 }
Victor Stinner59423e32018-11-26 13:40:01 +01009768
9769 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9770 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9771 n_zeros = Py_MAX(0, len - remaining);
9772 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9773
9774 /* Use n_zero zero's and n_chars chars */
9775
9776 /* Count only, don't do anything. */
9777 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9778
9779 /* Copy into the writer. */
9780 InsertThousandsGrouping_fill(writer, &buffer_pos,
9781 digits, &digits_pos,
9782 n_chars, n_zeros,
9783 use_separator ? thousands_sep : NULL,
9784 thousands_sep_len, maxchar);
9785
9786 /* Use a separator next time. */
9787 use_separator = 1;
9788
9789 remaining -= n_chars;
9790 min_width -= len;
9791
9792 if (remaining <= 0 && min_width <= 0) {
9793 loop_broken = 1;
9794 break;
9795 }
9796 min_width -= thousands_sep_len;
9797 }
9798 if (!loop_broken) {
9799 /* We left the loop without using a break statement. */
9800
9801 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9802 n_zeros = Py_MAX(0, len - remaining);
9803 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9804
9805 /* Use n_zero zero's and n_chars chars */
9806 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9807
9808 /* Copy into the writer. */
9809 InsertThousandsGrouping_fill(writer, &buffer_pos,
9810 digits, &digits_pos,
9811 n_chars, n_zeros,
9812 use_separator ? thousands_sep : NULL,
9813 thousands_sep_len, maxchar);
9814 }
9815 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816}
9817
9818
Alexander Belopolsky40018472011-02-26 01:02:56 +00009819Py_ssize_t
9820PyUnicode_Count(PyObject *str,
9821 PyObject *substr,
9822 Py_ssize_t start,
9823 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009825 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009826 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009827 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009829
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009830 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009831 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009832
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009833 kind1 = PyUnicode_KIND(str);
9834 kind2 = PyUnicode_KIND(substr);
9835 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009836 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009837
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009838 len1 = PyUnicode_GET_LENGTH(str);
9839 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009841 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009842 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009843
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009844 buf1 = PyUnicode_DATA(str);
9845 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009846 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009847 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009848 if (!buf2)
9849 goto onError;
9850 }
9851
9852 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009854 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009855 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009856 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009857 buf2, len2, PY_SSIZE_T_MAX
9858 );
9859 else
9860 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009861 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009862 buf2, len2, PY_SSIZE_T_MAX
9863 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 break;
9865 case PyUnicode_2BYTE_KIND:
9866 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009867 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 buf2, len2, PY_SSIZE_T_MAX
9869 );
9870 break;
9871 case PyUnicode_4BYTE_KIND:
9872 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009873 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009874 buf2, len2, PY_SSIZE_T_MAX
9875 );
9876 break;
9877 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009878 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009879 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009880
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009881 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009882 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009883 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009887 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9888 if (kind2 != kind1)
9889 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009890 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891}
9892
Alexander Belopolsky40018472011-02-26 01:02:56 +00009893Py_ssize_t
9894PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009895 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009896 Py_ssize_t start,
9897 Py_ssize_t end,
9898 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009900 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009901 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009902
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009903 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904}
9905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906Py_ssize_t
9907PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9908 Py_ssize_t start, Py_ssize_t end,
9909 int direction)
9910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009912 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 if (PyUnicode_READY(str) == -1)
9914 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009915 len = PyUnicode_GET_LENGTH(str);
9916 ADJUST_INDICES(start, end, len);
9917 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009918 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009920 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9921 kind, end-start, ch, direction);
9922 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009924 else
9925 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926}
9927
Alexander Belopolsky40018472011-02-26 01:02:56 +00009928static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009929tailmatch(PyObject *self,
9930 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009931 Py_ssize_t start,
9932 Py_ssize_t end,
9933 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 int kind_self;
9936 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009937 const void *data_self;
9938 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 Py_ssize_t offset;
9940 Py_ssize_t i;
9941 Py_ssize_t end_sub;
9942
9943 if (PyUnicode_READY(self) == -1 ||
9944 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009945 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9948 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009950 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009952 if (PyUnicode_GET_LENGTH(substring) == 0)
9953 return 1;
9954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 kind_self = PyUnicode_KIND(self);
9956 data_self = PyUnicode_DATA(self);
9957 kind_sub = PyUnicode_KIND(substring);
9958 data_sub = PyUnicode_DATA(substring);
9959 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9960
9961 if (direction > 0)
9962 offset = end;
9963 else
9964 offset = start;
9965
9966 if (PyUnicode_READ(kind_self, data_self, offset) ==
9967 PyUnicode_READ(kind_sub, data_sub, 0) &&
9968 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9969 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9970 /* If both are of the same kind, memcmp is sufficient */
9971 if (kind_self == kind_sub) {
9972 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009973 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 data_sub,
9975 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009976 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009978 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 else {
9980 /* We do not need to compare 0 and len(substring)-1 because
9981 the if statement above ensured already that they are equal
9982 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 for (i = 1; i < end_sub; ++i) {
9984 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9985 PyUnicode_READ(kind_sub, data_sub, i))
9986 return 0;
9987 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009988 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990 }
9991
9992 return 0;
9993}
9994
Alexander Belopolsky40018472011-02-26 01:02:56 +00009995Py_ssize_t
9996PyUnicode_Tailmatch(PyObject *str,
9997 PyObject *substr,
9998 Py_ssize_t start,
9999 Py_ssize_t end,
10000 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010002 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 return -1;
Tim Petersced69f82003-09-16 20:30:58 +000010004
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010005 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006}
10007
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010008static PyObject *
10009ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010011 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010012 const char *data = PyUnicode_DATA(self);
10013 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010014 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +000010015
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010016 res = PyUnicode_New(len, 127);
10017 if (res == NULL)
10018 return NULL;
10019 resdata = PyUnicode_DATA(res);
10020 if (lower)
10021 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010023 _Py_bytes_upper(resdata, data, len);
10024 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025}
10026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010028handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010030 Py_ssize_t j;
10031 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010010032 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010033 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +000010034
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010035 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10036
10037 where ! is a negation and \p{xxx} is a character with property xxx.
10038 */
10039 for (j = i - 1; j >= 0; j--) {
10040 c = PyUnicode_READ(kind, data, j);
10041 if (!_PyUnicode_IsCaseIgnorable(c))
10042 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010044 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10045 if (final_sigma) {
10046 for (j = i + 1; j < length; j++) {
10047 c = PyUnicode_READ(kind, data, j);
10048 if (!_PyUnicode_IsCaseIgnorable(c))
10049 break;
10050 }
10051 final_sigma = j == length || !_PyUnicode_IsCased(c);
10052 }
10053 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010054}
10055
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010056static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010057lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010058 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010059{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010060 /* Obscure special case. */
10061 if (c == 0x3A3) {
10062 mapped[0] = handle_capital_sigma(kind, data, length, i);
10063 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010064 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010065 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010066}
10067
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010068static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010069do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010070{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010071 Py_ssize_t i, k = 0;
10072 int n_res, j;
10073 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010074
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010075 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010076 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010077 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010078 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010079 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010080 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010081 for (i = 1; i < length; i++) {
10082 c = PyUnicode_READ(kind, data, i);
10083 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10084 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010085 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010086 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010087 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010088 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010089 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010090}
10091
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010092static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010093do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010094 Py_ssize_t i, k = 0;
10095
10096 for (i = 0; i < length; i++) {
10097 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10098 int n_res, j;
10099 if (Py_UNICODE_ISUPPER(c)) {
10100 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10101 }
10102 else if (Py_UNICODE_ISLOWER(c)) {
10103 n_res = _PyUnicode_ToUpperFull(c, mapped);
10104 }
10105 else {
10106 n_res = 1;
10107 mapped[0] = c;
10108 }
10109 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010110 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010111 res[k++] = mapped[j];
10112 }
10113 }
10114 return k;
10115}
10116
10117static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010118do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010119 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010120{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010121 Py_ssize_t i, k = 0;
10122
10123 for (i = 0; i < length; i++) {
10124 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10125 int n_res, j;
10126 if (lower)
10127 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10128 else
10129 n_res = _PyUnicode_ToUpperFull(c, mapped);
10130 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010131 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010132 res[k++] = mapped[j];
10133 }
10134 }
10135 return k;
10136}
10137
10138static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010139do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010140{
10141 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10142}
10143
10144static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010145do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010146{
10147 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10148}
10149
Benjamin Petersone51757f2012-01-12 21:10:29 -050010150static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010151do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010152{
10153 Py_ssize_t i, k = 0;
10154
10155 for (i = 0; i < length; i++) {
10156 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10157 Py_UCS4 mapped[3];
10158 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10159 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010160 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010161 res[k++] = mapped[j];
10162 }
10163 }
10164 return k;
10165}
10166
10167static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010168do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010169{
10170 Py_ssize_t i, k = 0;
10171 int previous_is_cased;
10172
10173 previous_is_cased = 0;
10174 for (i = 0; i < length; i++) {
10175 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10176 Py_UCS4 mapped[3];
10177 int n_res, j;
10178
10179 if (previous_is_cased)
10180 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10181 else
10182 n_res = _PyUnicode_ToTitleFull(c, mapped);
10183
10184 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010185 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010186 res[k++] = mapped[j];
10187 }
10188
10189 previous_is_cased = _PyUnicode_IsCased(c);
10190 }
10191 return k;
10192}
10193
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010194static PyObject *
10195case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010196 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010197{
10198 PyObject *res = NULL;
10199 Py_ssize_t length, newlength = 0;
10200 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010201 const void *data;
10202 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010203 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10204
Benjamin Petersoneea48462012-01-16 14:28:50 -050010205 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010206
10207 kind = PyUnicode_KIND(self);
10208 data = PyUnicode_DATA(self);
10209 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010210 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010211 PyErr_SetString(PyExc_OverflowError, "string is too long");
10212 return NULL;
10213 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010214 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010215 if (tmp == NULL)
10216 return PyErr_NoMemory();
10217 newlength = perform(kind, data, length, tmp, &maxchar);
10218 res = PyUnicode_New(newlength, maxchar);
10219 if (res == NULL)
10220 goto leave;
10221 tmpend = tmp + newlength;
10222 outdata = PyUnicode_DATA(res);
10223 outkind = PyUnicode_KIND(res);
10224 switch (outkind) {
10225 case PyUnicode_1BYTE_KIND:
10226 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10227 break;
10228 case PyUnicode_2BYTE_KIND:
10229 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10230 break;
10231 case PyUnicode_4BYTE_KIND:
10232 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10233 break;
10234 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010235 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010236 }
10237 leave:
10238 PyMem_FREE(tmp);
10239 return res;
10240}
10241
Tim Peters8ce9f162004-08-27 01:49:32 +000010242PyObject *
10243PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010245 PyObject *res;
10246 PyObject *fseq;
10247 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010248 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010250 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010251 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010252 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010253 }
10254
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010255 /* NOTE: the following code can't call back into Python code,
10256 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010257 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010258
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010259 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010260 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010261 res = _PyUnicode_JoinArray(separator, items, seqlen);
10262 Py_DECREF(fseq);
10263 return res;
10264}
10265
10266PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010267_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010268{
10269 PyObject *res = NULL; /* the result */
10270 PyObject *sep = NULL;
10271 Py_ssize_t seplen;
10272 PyObject *item;
10273 Py_ssize_t sz, i, res_offset;
10274 Py_UCS4 maxchar;
10275 Py_UCS4 item_maxchar;
10276 int use_memcpy;
10277 unsigned char *res_data = NULL, *sep_data = NULL;
10278 PyObject *last_obj;
10279 unsigned int kind = 0;
10280
Tim Peters05eba1f2004-08-27 21:32:02 +000010281 /* If empty sequence, return u"". */
10282 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010283 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010284 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010285
Tim Peters05eba1f2004-08-27 21:32:02 +000010286 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010287 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010288 if (seqlen == 1) {
10289 if (PyUnicode_CheckExact(items[0])) {
10290 res = items[0];
10291 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010292 return res;
10293 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010294 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010295 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010296 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010297 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010298 /* Set up sep and seplen */
10299 if (separator == NULL) {
10300 /* fall back to a blank space separator */
10301 sep = PyUnicode_FromOrdinal(' ');
10302 if (!sep)
10303 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010304 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010305 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010306 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010307 else {
10308 if (!PyUnicode_Check(separator)) {
10309 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010310 "separator: expected str instance,"
10311 " %.80s found",
10312 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010313 goto onError;
10314 }
10315 if (PyUnicode_READY(separator))
10316 goto onError;
10317 sep = separator;
10318 seplen = PyUnicode_GET_LENGTH(separator);
10319 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10320 /* inc refcount to keep this code path symmetric with the
10321 above case of a blank separator */
10322 Py_INCREF(sep);
10323 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010324 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010325 }
10326
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010327 /* There are at least two things to join, or else we have a subclass
10328 * of str in the sequence.
10329 * Do a pre-pass to figure out the total amount of space we'll
10330 * need (sz), and see whether all argument are strings.
10331 */
10332 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010333#ifdef Py_DEBUG
10334 use_memcpy = 0;
10335#else
10336 use_memcpy = 1;
10337#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010338 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010339 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010340 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010341 if (!PyUnicode_Check(item)) {
10342 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010343 "sequence item %zd: expected str instance,"
10344 " %.80s found",
10345 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010346 goto onError;
10347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 if (PyUnicode_READY(item) == -1)
10349 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010350 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010352 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010353 if (i != 0) {
10354 add_sz += seplen;
10355 }
10356 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010357 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010358 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010359 goto onError;
10360 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010361 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010362 if (use_memcpy && last_obj != NULL) {
10363 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10364 use_memcpy = 0;
10365 }
10366 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010367 }
Tim Petersced69f82003-09-16 20:30:58 +000010368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010370 if (res == NULL)
10371 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010372
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010373 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010374#ifdef Py_DEBUG
10375 use_memcpy = 0;
10376#else
10377 if (use_memcpy) {
10378 res_data = PyUnicode_1BYTE_DATA(res);
10379 kind = PyUnicode_KIND(res);
10380 if (seplen != 0)
10381 sep_data = PyUnicode_1BYTE_DATA(sep);
10382 }
10383#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010384 if (use_memcpy) {
10385 for (i = 0; i < seqlen; ++i) {
10386 Py_ssize_t itemlen;
10387 item = items[i];
10388
10389 /* Copy item, and maybe the separator. */
10390 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010391 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010392 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010393 kind * seplen);
10394 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010395 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010396
10397 itemlen = PyUnicode_GET_LENGTH(item);
10398 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010399 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010400 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010401 kind * itemlen);
10402 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010403 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010404 }
10405 assert(res_data == PyUnicode_1BYTE_DATA(res)
10406 + kind * PyUnicode_GET_LENGTH(res));
10407 }
10408 else {
10409 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10410 Py_ssize_t itemlen;
10411 item = items[i];
10412
10413 /* Copy item, and maybe the separator. */
10414 if (i && seplen != 0) {
10415 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10416 res_offset += seplen;
10417 }
10418
10419 itemlen = PyUnicode_GET_LENGTH(item);
10420 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010421 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010422 res_offset += itemlen;
10423 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010424 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010425 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010426 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010429 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431
Benjamin Peterson29060642009-01-31 22:14:21 +000010432 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010434 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435 return NULL;
10436}
10437
Victor Stinnerd3f08822012-05-29 12:57:52 +020010438void
10439_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10440 Py_UCS4 fill_char)
10441{
10442 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010443 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010444 assert(PyUnicode_IS_READY(unicode));
10445 assert(unicode_modifiable(unicode));
10446 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10447 assert(start >= 0);
10448 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010449 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010450}
10451
Victor Stinner3fe55312012-01-04 00:33:50 +010010452Py_ssize_t
10453PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10454 Py_UCS4 fill_char)
10455{
10456 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010457
10458 if (!PyUnicode_Check(unicode)) {
10459 PyErr_BadInternalCall();
10460 return -1;
10461 }
10462 if (PyUnicode_READY(unicode) == -1)
10463 return -1;
10464 if (unicode_check_modifiable(unicode))
10465 return -1;
10466
Victor Stinnerd3f08822012-05-29 12:57:52 +020010467 if (start < 0) {
10468 PyErr_SetString(PyExc_IndexError, "string index out of range");
10469 return -1;
10470 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010471 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10472 PyErr_SetString(PyExc_ValueError,
10473 "fill character is bigger than "
10474 "the string maximum character");
10475 return -1;
10476 }
10477
10478 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10479 length = Py_MIN(maxlen, length);
10480 if (length <= 0)
10481 return 0;
10482
Victor Stinnerd3f08822012-05-29 12:57:52 +020010483 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010484 return length;
10485}
10486
Victor Stinner9310abb2011-10-05 00:59:23 +020010487static PyObject *
10488pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010489 Py_ssize_t left,
10490 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 PyObject *u;
10494 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010495 int kind;
10496 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497
10498 if (left < 0)
10499 left = 0;
10500 if (right < 0)
10501 right = 0;
10502
Victor Stinnerc4b49542011-12-11 22:44:26 +010010503 if (left == 0 && right == 0)
10504 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10507 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010508 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10509 return NULL;
10510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010512 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010514 if (!u)
10515 return NULL;
10516
10517 kind = PyUnicode_KIND(u);
10518 data = PyUnicode_DATA(u);
10519 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010520 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010521 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010522 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010523 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010524 assert(_PyUnicode_CheckConsistency(u, 1));
10525 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526}
10527
Alexander Belopolsky40018472011-02-26 01:02:56 +000010528PyObject *
10529PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010533 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010534 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535
Benjamin Petersonead6b532011-12-20 17:23:42 -060010536 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010538 if (PyUnicode_IS_ASCII(string))
10539 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010540 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010541 PyUnicode_GET_LENGTH(string), keepends);
10542 else
10543 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010544 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010545 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 break;
10547 case PyUnicode_2BYTE_KIND:
10548 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010549 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 PyUnicode_GET_LENGTH(string), keepends);
10551 break;
10552 case PyUnicode_4BYTE_KIND:
10553 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010554 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 PyUnicode_GET_LENGTH(string), keepends);
10556 break;
10557 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010558 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010560 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561}
10562
Alexander Belopolsky40018472011-02-26 01:02:56 +000010563static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010564split(PyObject *self,
10565 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010566 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010568 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010569 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 Py_ssize_t len1, len2;
10571 PyObject* out;
10572
Guido van Rossumd57fd912000-03-10 22:53:23 +000010573 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010574 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 if (PyUnicode_READY(self) == -1)
10577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010580 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010582 if (PyUnicode_IS_ASCII(self))
10583 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010584 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010585 PyUnicode_GET_LENGTH(self), maxcount
10586 );
10587 else
10588 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010589 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010590 PyUnicode_GET_LENGTH(self), maxcount
10591 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 case PyUnicode_2BYTE_KIND:
10593 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010594 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 PyUnicode_GET_LENGTH(self), maxcount
10596 );
10597 case PyUnicode_4BYTE_KIND:
10598 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010599 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 PyUnicode_GET_LENGTH(self), maxcount
10601 );
10602 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010603 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 }
10605
10606 if (PyUnicode_READY(substring) == -1)
10607 return NULL;
10608
10609 kind1 = PyUnicode_KIND(self);
10610 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 len1 = PyUnicode_GET_LENGTH(self);
10612 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010613 if (kind1 < kind2 || len1 < len2) {
10614 out = PyList_New(1);
10615 if (out == NULL)
10616 return NULL;
10617 Py_INCREF(self);
10618 PyList_SET_ITEM(out, 0, self);
10619 return out;
10620 }
10621 buf1 = PyUnicode_DATA(self);
10622 buf2 = PyUnicode_DATA(substring);
10623 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010624 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010625 if (!buf2)
10626 return NULL;
10627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010629 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010631 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10632 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010633 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010634 else
10635 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010636 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 break;
10638 case PyUnicode_2BYTE_KIND:
10639 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010640 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 break;
10642 case PyUnicode_4BYTE_KIND:
10643 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010644 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 break;
10646 default:
10647 out = NULL;
10648 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010649 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010650 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010651 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010653}
10654
Alexander Belopolsky40018472011-02-26 01:02:56 +000010655static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010656rsplit(PyObject *self,
10657 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010658 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010659{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010660 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010661 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 Py_ssize_t len1, len2;
10663 PyObject* out;
10664
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010665 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010666 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 if (PyUnicode_READY(self) == -1)
10669 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010672 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010674 if (PyUnicode_IS_ASCII(self))
10675 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010676 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010677 PyUnicode_GET_LENGTH(self), maxcount
10678 );
10679 else
10680 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010681 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010682 PyUnicode_GET_LENGTH(self), maxcount
10683 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 case PyUnicode_2BYTE_KIND:
10685 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010686 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 PyUnicode_GET_LENGTH(self), maxcount
10688 );
10689 case PyUnicode_4BYTE_KIND:
10690 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010691 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 PyUnicode_GET_LENGTH(self), maxcount
10693 );
10694 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010695 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 }
10697
10698 if (PyUnicode_READY(substring) == -1)
10699 return NULL;
10700
10701 kind1 = PyUnicode_KIND(self);
10702 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 len1 = PyUnicode_GET_LENGTH(self);
10704 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010705 if (kind1 < kind2 || len1 < len2) {
10706 out = PyList_New(1);
10707 if (out == NULL)
10708 return NULL;
10709 Py_INCREF(self);
10710 PyList_SET_ITEM(out, 0, self);
10711 return out;
10712 }
10713 buf1 = PyUnicode_DATA(self);
10714 buf2 = PyUnicode_DATA(substring);
10715 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010716 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010717 if (!buf2)
10718 return NULL;
10719 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010721 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010723 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10724 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010725 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010726 else
10727 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010728 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 break;
10730 case PyUnicode_2BYTE_KIND:
10731 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010732 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 break;
10734 case PyUnicode_4BYTE_KIND:
10735 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010736 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 break;
10738 default:
10739 out = NULL;
10740 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010741 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010742 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010743 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 return out;
10745}
10746
10747static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010748anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10749 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010751 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010753 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10754 return asciilib_find(buf1, len1, buf2, len2, offset);
10755 else
10756 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 case PyUnicode_2BYTE_KIND:
10758 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10759 case PyUnicode_4BYTE_KIND:
10760 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10761 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010762 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763}
10764
10765static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010766anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10767 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010769 switch (kind) {
10770 case PyUnicode_1BYTE_KIND:
10771 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10772 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10773 else
10774 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10775 case PyUnicode_2BYTE_KIND:
10776 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10777 case PyUnicode_4BYTE_KIND:
10778 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10779 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010780 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010781}
10782
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010783static void
10784replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10785 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10786{
10787 int kind = PyUnicode_KIND(u);
10788 void *data = PyUnicode_DATA(u);
10789 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10790 if (kind == PyUnicode_1BYTE_KIND) {
10791 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10792 (Py_UCS1 *)data + len,
10793 u1, u2, maxcount);
10794 }
10795 else if (kind == PyUnicode_2BYTE_KIND) {
10796 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10797 (Py_UCS2 *)data + len,
10798 u1, u2, maxcount);
10799 }
10800 else {
10801 assert(kind == PyUnicode_4BYTE_KIND);
10802 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10803 (Py_UCS4 *)data + len,
10804 u1, u2, maxcount);
10805 }
10806}
10807
Alexander Belopolsky40018472011-02-26 01:02:56 +000010808static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809replace(PyObject *self, PyObject *str1,
10810 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010813 const char *sbuf = PyUnicode_DATA(self);
10814 const void *buf1 = PyUnicode_DATA(str1);
10815 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 int srelease = 0, release1 = 0, release2 = 0;
10817 int skind = PyUnicode_KIND(self);
10818 int kind1 = PyUnicode_KIND(str1);
10819 int kind2 = PyUnicode_KIND(str2);
10820 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10821 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10822 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010823 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010824 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010826 if (slen < len1)
10827 goto nothing;
10828
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010830 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010831 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010832 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833
Victor Stinner59de0ee2011-10-07 10:01:28 +020010834 if (str1 == str2)
10835 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836
Victor Stinner49a0a212011-10-12 23:46:10 +020010837 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010838 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10839 if (maxchar < maxchar_str1)
10840 /* substring too wide to be present */
10841 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010842 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10843 /* Replacing str1 with str2 may cause a maxchar reduction in the
10844 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010845 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010846 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010849 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010851 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010852 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010853 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010854 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010855 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010856
Victor Stinner69ed0f42013-04-09 21:48:24 +020010857 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010858 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010859 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010860 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010861 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010863 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010865
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010866 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10867 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010868 }
10869 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 int rkind = skind;
10871 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010872 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 if (kind1 < rkind) {
10875 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010876 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 if (!buf1) goto error;
10878 release1 = 1;
10879 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010880 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010881 if (i < 0)
10882 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 if (rkind > kind2) {
10884 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010885 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 if (!buf2) goto error;
10887 release2 = 1;
10888 }
10889 else if (rkind < kind2) {
10890 /* widen self and buf1 */
10891 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010892 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010893 assert(buf1 != PyUnicode_DATA(str1));
10894 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010895 buf1 = PyUnicode_DATA(str1);
10896 release1 = 0;
10897 }
10898 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010899 if (!sbuf) goto error;
10900 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010901 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902 if (!buf1) goto error;
10903 release1 = 1;
10904 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010905 u = PyUnicode_New(slen, maxchar);
10906 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010908 assert(PyUnicode_KIND(u) == rkind);
10909 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010910
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010911 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010912 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010913 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010915 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010917
10918 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010919 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010920 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010921 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010922 if (i == -1)
10923 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010924 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010926 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010928 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010930 }
10931 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010933 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934 int rkind = skind;
10935 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010938 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010939 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010940 if (!buf1) goto error;
10941 release1 = 1;
10942 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010943 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010944 if (n == 0)
10945 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010947 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010948 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 if (!buf2) goto error;
10950 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010953 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010955 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 if (!sbuf) goto error;
10957 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010958 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010959 assert(buf1 != PyUnicode_DATA(str1));
10960 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010961 buf1 = PyUnicode_DATA(str1);
10962 release1 = 0;
10963 }
10964 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 if (!buf1) goto error;
10966 release1 = 1;
10967 }
10968 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10969 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010970 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 PyErr_SetString(PyExc_OverflowError,
10972 "replace string is too long");
10973 goto error;
10974 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010975 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010976 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020010977 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020010978 goto done;
10979 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010980 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 PyErr_SetString(PyExc_OverflowError,
10982 "replace string is too long");
10983 goto error;
10984 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010985 u = PyUnicode_New(new_size, maxchar);
10986 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010988 assert(PyUnicode_KIND(u) == rkind);
10989 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990 ires = i = 0;
10991 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010992 while (n-- > 0) {
10993 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010994 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010995 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010996 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010997 if (j == -1)
10998 break;
10999 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011000 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011001 memcpy(res + rkind * ires,
11002 sbuf + rkind * i,
11003 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011005 }
11006 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011008 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011010 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011016 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011017 memcpy(res + rkind * ires,
11018 sbuf + rkind * i,
11019 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020011020 }
11021 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011022 /* interleave */
11023 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011024 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011026 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011028 if (--n <= 0)
11029 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011030 memcpy(res + rkind * ires,
11031 sbuf + rkind * i,
11032 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 ires++;
11034 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011035 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011036 memcpy(res + rkind * ires,
11037 sbuf + rkind * i,
11038 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011039 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011040 }
11041
11042 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011043 unicode_adjust_maxchar(&u);
11044 if (u == NULL)
11045 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011047
11048 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011049 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11050 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11051 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011053 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011055 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011056 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011057 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011058 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011059 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011060
Benjamin Peterson29060642009-01-31 22:14:21 +000011061 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011062 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011063 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11064 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11065 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011067 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011069 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011071 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011072 return unicode_result_unchanged(self);
11073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011075 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11076 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11077 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11078 if (srelease)
11079 PyMem_FREE((void *)sbuf);
11080 if (release1)
11081 PyMem_FREE((void *)buf1);
11082 if (release2)
11083 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085}
11086
11087/* --- Unicode Object Methods --------------------------------------------- */
11088
INADA Naoki3ae20562017-01-16 20:41:20 +090011089/*[clinic input]
11090str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091
INADA Naoki3ae20562017-01-16 20:41:20 +090011092Return a version of the string where each word is titlecased.
11093
11094More specifically, words start with uppercased characters and all remaining
11095cased characters have lower case.
11096[clinic start generated code]*/
11097
11098static PyObject *
11099unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011100/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011102 if (PyUnicode_READY(self) == -1)
11103 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011104 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011105}
11106
INADA Naoki3ae20562017-01-16 20:41:20 +090011107/*[clinic input]
11108str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109
INADA Naoki3ae20562017-01-16 20:41:20 +090011110Return a capitalized version of the string.
11111
11112More specifically, make the first character have upper case and the rest lower
11113case.
11114[clinic start generated code]*/
11115
11116static PyObject *
11117unicode_capitalize_impl(PyObject *self)
11118/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011120 if (PyUnicode_READY(self) == -1)
11121 return NULL;
11122 if (PyUnicode_GET_LENGTH(self) == 0)
11123 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011124 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125}
11126
INADA Naoki3ae20562017-01-16 20:41:20 +090011127/*[clinic input]
11128str.casefold as unicode_casefold
11129
11130Return a version of the string suitable for caseless comparisons.
11131[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011132
11133static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011134unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011135/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011136{
11137 if (PyUnicode_READY(self) == -1)
11138 return NULL;
11139 if (PyUnicode_IS_ASCII(self))
11140 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011141 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011142}
11143
11144
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011145/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011146
11147static int
11148convert_uc(PyObject *obj, void *addr)
11149{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011151
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011152 if (!PyUnicode_Check(obj)) {
11153 PyErr_Format(PyExc_TypeError,
11154 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011155 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011156 return 0;
11157 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011158 if (PyUnicode_READY(obj) < 0)
11159 return 0;
11160 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011161 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011162 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011163 return 0;
11164 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011165 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011166 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011167}
11168
INADA Naoki3ae20562017-01-16 20:41:20 +090011169/*[clinic input]
11170str.center as unicode_center
11171
11172 width: Py_ssize_t
11173 fillchar: Py_UCS4 = ' '
11174 /
11175
11176Return a centered string of length width.
11177
11178Padding is done using the specified fill character (default is a space).
11179[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
11181static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011182unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11183/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011185 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186
Benjamin Petersonbac79492012-01-14 13:34:47 -050011187 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188 return NULL;
11189
Victor Stinnerc4b49542011-12-11 22:44:26 +010011190 if (PyUnicode_GET_LENGTH(self) >= width)
11191 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192
Victor Stinnerc4b49542011-12-11 22:44:26 +010011193 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194 left = marg / 2 + (marg & width & 1);
11195
Victor Stinner9310abb2011-10-05 00:59:23 +020011196 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197}
11198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199/* This function assumes that str1 and str2 are readied by the caller. */
11200
Marc-André Lemburge5034372000-08-08 08:04:29 +000011201static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011202unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011203{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011204#define COMPARE(TYPE1, TYPE2) \
11205 do { \
11206 TYPE1* p1 = (TYPE1 *)data1; \
11207 TYPE2* p2 = (TYPE2 *)data2; \
11208 TYPE1* end = p1 + len; \
11209 Py_UCS4 c1, c2; \
11210 for (; p1 != end; p1++, p2++) { \
11211 c1 = *p1; \
11212 c2 = *p2; \
11213 if (c1 != c2) \
11214 return (c1 < c2) ? -1 : 1; \
11215 } \
11216 } \
11217 while (0)
11218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011220 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011221 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011223 kind1 = PyUnicode_KIND(str1);
11224 kind2 = PyUnicode_KIND(str2);
11225 data1 = PyUnicode_DATA(str1);
11226 data2 = PyUnicode_DATA(str2);
11227 len1 = PyUnicode_GET_LENGTH(str1);
11228 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011229 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011230
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011231 switch(kind1) {
11232 case PyUnicode_1BYTE_KIND:
11233 {
11234 switch(kind2) {
11235 case PyUnicode_1BYTE_KIND:
11236 {
11237 int cmp = memcmp(data1, data2, len);
11238 /* normalize result of memcmp() into the range [-1; 1] */
11239 if (cmp < 0)
11240 return -1;
11241 if (cmp > 0)
11242 return 1;
11243 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011244 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011245 case PyUnicode_2BYTE_KIND:
11246 COMPARE(Py_UCS1, Py_UCS2);
11247 break;
11248 case PyUnicode_4BYTE_KIND:
11249 COMPARE(Py_UCS1, Py_UCS4);
11250 break;
11251 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011252 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011253 }
11254 break;
11255 }
11256 case PyUnicode_2BYTE_KIND:
11257 {
11258 switch(kind2) {
11259 case PyUnicode_1BYTE_KIND:
11260 COMPARE(Py_UCS2, Py_UCS1);
11261 break;
11262 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011263 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011264 COMPARE(Py_UCS2, Py_UCS2);
11265 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011266 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011267 case PyUnicode_4BYTE_KIND:
11268 COMPARE(Py_UCS2, Py_UCS4);
11269 break;
11270 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011271 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011272 }
11273 break;
11274 }
11275 case PyUnicode_4BYTE_KIND:
11276 {
11277 switch(kind2) {
11278 case PyUnicode_1BYTE_KIND:
11279 COMPARE(Py_UCS4, Py_UCS1);
11280 break;
11281 case PyUnicode_2BYTE_KIND:
11282 COMPARE(Py_UCS4, Py_UCS2);
11283 break;
11284 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011285 {
11286#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11287 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11288 /* normalize result of wmemcmp() into the range [-1; 1] */
11289 if (cmp < 0)
11290 return -1;
11291 if (cmp > 0)
11292 return 1;
11293#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011294 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011295#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011296 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011297 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011298 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011299 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011300 }
11301 break;
11302 }
11303 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011304 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011305 }
11306
Victor Stinner770e19e2012-10-04 22:59:45 +020011307 if (len1 == len2)
11308 return 0;
11309 if (len1 < len2)
11310 return -1;
11311 else
11312 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011313
11314#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011315}
11316
Benjamin Peterson621b4302016-09-09 13:54:34 -070011317static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011318unicode_compare_eq(PyObject *str1, PyObject *str2)
11319{
11320 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011321 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011322 Py_ssize_t len;
11323 int cmp;
11324
Victor Stinnere5567ad2012-10-23 02:48:49 +020011325 len = PyUnicode_GET_LENGTH(str1);
11326 if (PyUnicode_GET_LENGTH(str2) != len)
11327 return 0;
11328 kind = PyUnicode_KIND(str1);
11329 if (PyUnicode_KIND(str2) != kind)
11330 return 0;
11331 data1 = PyUnicode_DATA(str1);
11332 data2 = PyUnicode_DATA(str2);
11333
11334 cmp = memcmp(data1, data2, len * kind);
11335 return (cmp == 0);
11336}
11337
11338
Alexander Belopolsky40018472011-02-26 01:02:56 +000011339int
11340PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11343 if (PyUnicode_READY(left) == -1 ||
11344 PyUnicode_READY(right) == -1)
11345 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011346
11347 /* a string is equal to itself */
11348 if (left == right)
11349 return 0;
11350
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011351 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011353 PyErr_Format(PyExc_TypeError,
11354 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011355 Py_TYPE(left)->tp_name,
11356 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357 return -1;
11358}
11359
Martin v. Löwis5b222132007-06-10 09:51:05 +000011360int
11361PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 Py_ssize_t i;
11364 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011366 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367
Victor Stinner910337b2011-10-03 03:20:16 +020011368 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011369 if (!PyUnicode_IS_READY(uni)) {
11370 const wchar_t *ws = _PyUnicode_WSTR(uni);
11371 /* Compare Unicode string and source character set string */
11372 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11373 if (chr != ustr[i])
11374 return (chr < ustr[i]) ? -1 : 1;
11375 }
11376 /* This check keeps Python strings that end in '\0' from comparing equal
11377 to C strings identical up to that point. */
11378 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11379 return 1; /* uni is longer */
11380 if (ustr[i])
11381 return -1; /* str is longer */
11382 return 0;
11383 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011385 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011386 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011387 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011388 size_t len, len2 = strlen(str);
11389 int cmp;
11390
11391 len = Py_MIN(len1, len2);
11392 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011393 if (cmp != 0) {
11394 if (cmp < 0)
11395 return -1;
11396 else
11397 return 1;
11398 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011399 if (len1 > len2)
11400 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011401 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011402 return -1; /* str is longer */
11403 return 0;
11404 }
11405 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011406 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011407 /* Compare Unicode string and source character set string */
11408 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011409 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011410 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11411 /* This check keeps Python strings that end in '\0' from comparing equal
11412 to C strings identical up to that point. */
11413 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11414 return 1; /* uni is longer */
11415 if (str[i])
11416 return -1; /* str is longer */
11417 return 0;
11418 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011419}
11420
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011421static int
11422non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11423{
11424 size_t i, len;
11425 const wchar_t *p;
11426 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11427 if (strlen(str) != len)
11428 return 0;
11429 p = _PyUnicode_WSTR(unicode);
11430 assert(p);
11431 for (i = 0; i < len; i++) {
11432 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011433 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011434 return 0;
11435 }
11436 return 1;
11437}
11438
11439int
11440_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11441{
11442 size_t len;
11443 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011444 assert(str);
11445#ifndef NDEBUG
11446 for (const char *p = str; *p; p++) {
11447 assert((unsigned char)*p < 128);
11448 }
11449#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011450 if (PyUnicode_READY(unicode) == -1) {
11451 /* Memory error or bad data */
11452 PyErr_Clear();
11453 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11454 }
11455 if (!PyUnicode_IS_ASCII(unicode))
11456 return 0;
11457 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11458 return strlen(str) == len &&
11459 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11460}
11461
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011462int
11463_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11464{
11465 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011466
11467 assert(_PyUnicode_CHECK(left));
11468 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011469#ifndef NDEBUG
11470 for (const char *p = right->string; *p; p++) {
11471 assert((unsigned char)*p < 128);
11472 }
11473#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011474
11475 if (PyUnicode_READY(left) == -1) {
11476 /* memory error or bad data */
11477 PyErr_Clear();
11478 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11479 }
11480
11481 if (!PyUnicode_IS_ASCII(left))
11482 return 0;
11483
11484 right_uni = _PyUnicode_FromId(right); /* borrowed */
11485 if (right_uni == NULL) {
11486 /* memory error or bad data */
11487 PyErr_Clear();
11488 return _PyUnicode_EqualToASCIIString(left, right->string);
11489 }
11490
11491 if (left == right_uni)
11492 return 1;
11493
11494 if (PyUnicode_CHECK_INTERNED(left))
11495 return 0;
11496
Victor Stinner607b1022020-05-05 18:50:30 +020011497#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011498 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011499 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011500 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11501 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011502#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011503
11504 return unicode_compare_eq(left, right_uni);
11505}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011506
Alexander Belopolsky40018472011-02-26 01:02:56 +000011507PyObject *
11508PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011509{
11510 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011511
Victor Stinnere5567ad2012-10-23 02:48:49 +020011512 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11513 Py_RETURN_NOTIMPLEMENTED;
11514
11515 if (PyUnicode_READY(left) == -1 ||
11516 PyUnicode_READY(right) == -1)
11517 return NULL;
11518
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011519 if (left == right) {
11520 switch (op) {
11521 case Py_EQ:
11522 case Py_LE:
11523 case Py_GE:
11524 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011525 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011526 case Py_NE:
11527 case Py_LT:
11528 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011529 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011530 default:
11531 PyErr_BadArgument();
11532 return NULL;
11533 }
11534 }
11535 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011536 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011537 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011538 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011539 }
11540 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011541 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011542 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011543 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011544}
11545
Alexander Belopolsky40018472011-02-26 01:02:56 +000011546int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011547_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11548{
11549 return unicode_eq(aa, bb);
11550}
11551
11552int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011553PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011554{
Victor Stinner77282cb2013-04-14 19:22:47 +020011555 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011556 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011558 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011559
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011560 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011562 "'in <string>' requires string as left operand, not %.100s",
11563 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011564 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011565 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011566 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011567 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011568 if (ensure_unicode(str) < 0)
11569 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011572 kind2 = PyUnicode_KIND(substr);
11573 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011574 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011576 len2 = PyUnicode_GET_LENGTH(substr);
11577 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011578 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011579 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011580 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011581 if (len2 == 1) {
11582 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11583 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011584 return result;
11585 }
11586 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011587 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011588 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011589 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591
Victor Stinner77282cb2013-04-14 19:22:47 +020011592 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011593 case PyUnicode_1BYTE_KIND:
11594 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11595 break;
11596 case PyUnicode_2BYTE_KIND:
11597 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11598 break;
11599 case PyUnicode_4BYTE_KIND:
11600 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11601 break;
11602 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011603 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011605
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011606 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011607 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011608 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609
Guido van Rossum403d68b2000-03-13 15:55:09 +000011610 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011611}
11612
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613/* Concat to string or Unicode object giving a new Unicode object. */
11614
Alexander Belopolsky40018472011-02-26 01:02:56 +000011615PyObject *
11616PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011618 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011619 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011620 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011622 if (ensure_unicode(left) < 0)
11623 return NULL;
11624
11625 if (!PyUnicode_Check(right)) {
11626 PyErr_Format(PyExc_TypeError,
11627 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011628 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011629 return NULL;
11630 }
11631 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633
11634 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011635 PyObject *empty = unicode_get_empty(); // Borrowed reference
11636 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011637 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011638 }
11639 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011640 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011643 left_len = PyUnicode_GET_LENGTH(left);
11644 right_len = PyUnicode_GET_LENGTH(right);
11645 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011646 PyErr_SetString(PyExc_OverflowError,
11647 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011648 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011649 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011650 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011651
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011652 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11653 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011654 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011657 result = PyUnicode_New(new_len, maxchar);
11658 if (result == NULL)
11659 return NULL;
11660 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11661 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11662 assert(_PyUnicode_CheckConsistency(result, 1));
11663 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664}
11665
Walter Dörwald1ab83302007-05-18 17:15:44 +000011666void
Victor Stinner23e56682011-10-03 03:54:37 +020011667PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011668{
Victor Stinner23e56682011-10-03 03:54:37 +020011669 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011670 Py_UCS4 maxchar, maxchar2;
11671 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011672
11673 if (p_left == NULL) {
11674 if (!PyErr_Occurred())
11675 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011676 return;
11677 }
Victor Stinner23e56682011-10-03 03:54:37 +020011678 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011679 if (right == NULL || left == NULL
11680 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011681 if (!PyErr_Occurred())
11682 PyErr_BadInternalCall();
11683 goto error;
11684 }
11685
Benjamin Petersonbac79492012-01-14 13:34:47 -050011686 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011687 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011688 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011689 goto error;
11690
Victor Stinner488fa492011-12-12 00:01:39 +010011691 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011692 PyObject *empty = unicode_get_empty(); // Borrowed reference
11693 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011694 Py_DECREF(left);
11695 Py_INCREF(right);
11696 *p_left = right;
11697 return;
11698 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011699 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011700 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011701 }
Victor Stinner488fa492011-12-12 00:01:39 +010011702
11703 left_len = PyUnicode_GET_LENGTH(left);
11704 right_len = PyUnicode_GET_LENGTH(right);
11705 if (left_len > PY_SSIZE_T_MAX - right_len) {
11706 PyErr_SetString(PyExc_OverflowError,
11707 "strings are too large to concat");
11708 goto error;
11709 }
11710 new_len = left_len + right_len;
11711
11712 if (unicode_modifiable(left)
11713 && PyUnicode_CheckExact(right)
11714 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011715 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11716 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011717 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011718 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011719 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11720 {
11721 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011722 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011723 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011724
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011725 /* copy 'right' into the newly allocated area of 'left' */
11726 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011727 }
Victor Stinner488fa492011-12-12 00:01:39 +010011728 else {
11729 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11730 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011731 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011732
Victor Stinner488fa492011-12-12 00:01:39 +010011733 /* Concat the two Unicode strings */
11734 res = PyUnicode_New(new_len, maxchar);
11735 if (res == NULL)
11736 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011737 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11738 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011739 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011740 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011741 }
11742 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011743 return;
11744
11745error:
Victor Stinner488fa492011-12-12 00:01:39 +010011746 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011747}
11748
11749void
11750PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11751{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011752 PyUnicode_Append(pleft, right);
11753 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011754}
11755
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011756/*
11757Wraps stringlib_parse_args_finds() and additionally ensures that the
11758first argument is a unicode object.
11759*/
11760
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011761static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011762parse_args_finds_unicode(const char * function_name, PyObject *args,
11763 PyObject **substring,
11764 Py_ssize_t *start, Py_ssize_t *end)
11765{
11766 if(stringlib_parse_args_finds(function_name, args, substring,
11767 start, end)) {
11768 if (ensure_unicode(*substring) < 0)
11769 return 0;
11770 return 1;
11771 }
11772 return 0;
11773}
11774
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011775PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011776 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011778Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011779string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011780interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781
11782static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011783unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011785 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011786 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011787 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011789 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011790 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011793 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011794 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 kind1 = PyUnicode_KIND(self);
11797 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011798 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011799 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 len1 = PyUnicode_GET_LENGTH(self);
11802 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011804 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011805 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011806
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011807 buf1 = PyUnicode_DATA(self);
11808 buf2 = PyUnicode_DATA(substring);
11809 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011810 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011811 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011812 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011813 }
11814 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 case PyUnicode_1BYTE_KIND:
11816 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011817 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 buf2, len2, PY_SSIZE_T_MAX
11819 );
11820 break;
11821 case PyUnicode_2BYTE_KIND:
11822 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011823 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 buf2, len2, PY_SSIZE_T_MAX
11825 );
11826 break;
11827 case PyUnicode_4BYTE_KIND:
11828 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011829 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 buf2, len2, PY_SSIZE_T_MAX
11831 );
11832 break;
11833 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011834 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 }
11836
11837 result = PyLong_FromSsize_t(iresult);
11838
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011839 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011840 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011841 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 return result;
11844}
11845
INADA Naoki3ae20562017-01-16 20:41:20 +090011846/*[clinic input]
11847str.encode as unicode_encode
11848
11849 encoding: str(c_default="NULL") = 'utf-8'
11850 The encoding in which to encode the string.
11851 errors: str(c_default="NULL") = 'strict'
11852 The error handling scheme to use for encoding errors.
11853 The default is 'strict' meaning that encoding errors raise a
11854 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11855 'xmlcharrefreplace' as well as any other name registered with
11856 codecs.register_error that can handle UnicodeEncodeErrors.
11857
11858Encode the string using the codec registered for encoding.
11859[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860
11861static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011862unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011863/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011865 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011866}
11867
INADA Naoki3ae20562017-01-16 20:41:20 +090011868/*[clinic input]
11869str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870
INADA Naoki3ae20562017-01-16 20:41:20 +090011871 tabsize: int = 8
11872
11873Return a copy where all tab characters are expanded using spaces.
11874
11875If tabsize is not given, a tab size of 8 characters is assumed.
11876[clinic start generated code]*/
11877
11878static PyObject *
11879unicode_expandtabs_impl(PyObject *self, int tabsize)
11880/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011882 Py_ssize_t i, j, line_pos, src_len, incr;
11883 Py_UCS4 ch;
11884 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011885 const void *src_data;
11886 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011887 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011888 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889
Antoine Pitrou22425222011-10-04 19:10:51 +020011890 if (PyUnicode_READY(self) == -1)
11891 return NULL;
11892
Thomas Wouters7e474022000-07-16 12:04:32 +000011893 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011894 src_len = PyUnicode_GET_LENGTH(self);
11895 i = j = line_pos = 0;
11896 kind = PyUnicode_KIND(self);
11897 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011898 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011899 for (; i < src_len; i++) {
11900 ch = PyUnicode_READ(kind, src_data, i);
11901 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011902 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011903 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011904 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011905 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011906 goto overflow;
11907 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011909 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011910 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011912 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011913 goto overflow;
11914 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011916 if (ch == '\n' || ch == '\r')
11917 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011919 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011920 if (!found)
11921 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011922
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011924 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 if (!u)
11926 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011927 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928
Antoine Pitroue71d5742011-10-04 15:55:09 +020011929 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930
Antoine Pitroue71d5742011-10-04 15:55:09 +020011931 for (; i < src_len; i++) {
11932 ch = PyUnicode_READ(kind, src_data, i);
11933 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011935 incr = tabsize - (line_pos % tabsize);
11936 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011937 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011938 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011939 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011940 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011942 line_pos++;
11943 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011944 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011945 if (ch == '\n' || ch == '\r')
11946 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011948 }
11949 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011950 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011951
Antoine Pitroue71d5742011-10-04 15:55:09 +020011952 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011953 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955}
11956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011957PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011958 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959\n\
11960Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011961such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962arguments start and end are interpreted as in slice notation.\n\
11963\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011964Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965
11966static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011969 /* initialize variables to prevent gcc warning */
11970 PyObject *substring = NULL;
11971 Py_ssize_t start = 0;
11972 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011973 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011975 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011978 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011981 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 if (result == -2)
11984 return NULL;
11985
Christian Heimes217cfd12007-12-02 14:31:20 +000011986 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987}
11988
11989static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011990unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011992 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011993 enum PyUnicode_Kind kind;
11994 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011995
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011996 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011997 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011999 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012000 if (PyUnicode_READY(self) == -1) {
12001 return NULL;
12002 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012003 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12004 PyErr_SetString(PyExc_IndexError, "string index out of range");
12005 return NULL;
12006 }
12007 kind = PyUnicode_KIND(self);
12008 data = PyUnicode_DATA(self);
12009 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010012010 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011}
12012
Guido van Rossumc2504932007-09-18 19:42:40 +000012013/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010012014 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000012015static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012016unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080012018 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000012019
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012020#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050012021 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012022#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 if (_PyUnicode_HASH(self) != -1)
12024 return _PyUnicode_HASH(self);
12025 if (PyUnicode_READY(self) == -1)
12026 return -1;
animalizea1d14252019-01-02 20:16:06 +080012027
Christian Heimes985ecdc2013-11-20 11:46:18 +010012028 x = _Py_HashBytes(PyUnicode_DATA(self),
12029 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000012031 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032}
12033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012034PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012035 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036\n\
oldkaa0735f2018-02-02 16:52:55 +080012037Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012038such that sub is contained within S[start:end]. Optional\n\
12039arguments start and end are interpreted as in slice notation.\n\
12040\n\
12041Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042
12043static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012046 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012047 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012048 PyObject *substring = NULL;
12049 Py_ssize_t start = 0;
12050 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012052 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012055 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012058 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 if (result == -2)
12061 return NULL;
12062
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063 if (result < 0) {
12064 PyErr_SetString(PyExc_ValueError, "substring not found");
12065 return NULL;
12066 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012067
Christian Heimes217cfd12007-12-02 14:31:20 +000012068 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069}
12070
INADA Naoki3ae20562017-01-16 20:41:20 +090012071/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012072str.isascii as unicode_isascii
12073
12074Return True if all characters in the string are ASCII, False otherwise.
12075
12076ASCII characters have code points in the range U+0000-U+007F.
12077Empty string is ASCII too.
12078[clinic start generated code]*/
12079
12080static PyObject *
12081unicode_isascii_impl(PyObject *self)
12082/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12083{
12084 if (PyUnicode_READY(self) == -1) {
12085 return NULL;
12086 }
12087 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12088}
12089
12090/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012091str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092
INADA Naoki3ae20562017-01-16 20:41:20 +090012093Return True if the string is a lowercase string, False otherwise.
12094
12095A string is lowercase if all cased characters in the string are lowercase and
12096there is at least one cased character in the string.
12097[clinic start generated code]*/
12098
12099static PyObject *
12100unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012101/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 Py_ssize_t i, length;
12104 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012105 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106 int cased;
12107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 if (PyUnicode_READY(self) == -1)
12109 return NULL;
12110 length = PyUnicode_GET_LENGTH(self);
12111 kind = PyUnicode_KIND(self);
12112 data = PyUnicode_DATA(self);
12113
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 if (length == 1)
12116 return PyBool_FromLong(
12117 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012119 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012121 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012122
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 for (i = 0; i < length; i++) {
12125 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012126
Benjamin Peterson29060642009-01-31 22:14:21 +000012127 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012128 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012129 else if (!cased && Py_UNICODE_ISLOWER(ch))
12130 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012132 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133}
12134
INADA Naoki3ae20562017-01-16 20:41:20 +090012135/*[clinic input]
12136str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137
INADA Naoki3ae20562017-01-16 20:41:20 +090012138Return True if the string is an uppercase string, False otherwise.
12139
12140A string is uppercase if all cased characters in the string are uppercase and
12141there is at least one cased character in the string.
12142[clinic start generated code]*/
12143
12144static PyObject *
12145unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012146/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 Py_ssize_t i, length;
12149 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012150 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151 int cased;
12152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 if (PyUnicode_READY(self) == -1)
12154 return NULL;
12155 length = PyUnicode_GET_LENGTH(self);
12156 kind = PyUnicode_KIND(self);
12157 data = PyUnicode_DATA(self);
12158
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 if (length == 1)
12161 return PyBool_FromLong(
12162 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012164 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012166 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012167
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012169 for (i = 0; i < length; i++) {
12170 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012171
Benjamin Peterson29060642009-01-31 22:14:21 +000012172 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012173 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012174 else if (!cased && Py_UNICODE_ISUPPER(ch))
12175 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012177 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178}
12179
INADA Naoki3ae20562017-01-16 20:41:20 +090012180/*[clinic input]
12181str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182
INADA Naoki3ae20562017-01-16 20:41:20 +090012183Return True if the string is a title-cased string, False otherwise.
12184
12185In a title-cased string, upper- and title-case characters may only
12186follow uncased characters and lowercase characters only cased ones.
12187[clinic start generated code]*/
12188
12189static PyObject *
12190unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012191/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 Py_ssize_t i, length;
12194 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012195 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196 int cased, previous_is_cased;
12197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 if (PyUnicode_READY(self) == -1)
12199 return NULL;
12200 length = PyUnicode_GET_LENGTH(self);
12201 kind = PyUnicode_KIND(self);
12202 data = PyUnicode_DATA(self);
12203
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 if (length == 1) {
12206 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12207 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12208 (Py_UNICODE_ISUPPER(ch) != 0));
12209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012211 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012213 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012214
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215 cased = 0;
12216 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 for (i = 0; i < length; i++) {
12218 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012219
Benjamin Peterson29060642009-01-31 22:14:21 +000012220 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12221 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012222 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012223 previous_is_cased = 1;
12224 cased = 1;
12225 }
12226 else if (Py_UNICODE_ISLOWER(ch)) {
12227 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012228 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012229 previous_is_cased = 1;
12230 cased = 1;
12231 }
12232 else
12233 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012235 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236}
12237
INADA Naoki3ae20562017-01-16 20:41:20 +090012238/*[clinic input]
12239str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240
INADA Naoki3ae20562017-01-16 20:41:20 +090012241Return True if the string is a whitespace string, False otherwise.
12242
12243A string is whitespace if all characters in the string are whitespace and there
12244is at least one character in the string.
12245[clinic start generated code]*/
12246
12247static PyObject *
12248unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012249/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251 Py_ssize_t i, length;
12252 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012253 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254
12255 if (PyUnicode_READY(self) == -1)
12256 return NULL;
12257 length = PyUnicode_GET_LENGTH(self);
12258 kind = PyUnicode_KIND(self);
12259 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262 if (length == 1)
12263 return PyBool_FromLong(
12264 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012266 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012268 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 for (i = 0; i < length; i++) {
12271 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012272 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012273 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012275 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276}
12277
INADA Naoki3ae20562017-01-16 20:41:20 +090012278/*[clinic input]
12279str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012280
INADA Naoki3ae20562017-01-16 20:41:20 +090012281Return True if the string is an alphabetic string, False otherwise.
12282
12283A string is alphabetic if all characters in the string are alphabetic and there
12284is at least one character in the string.
12285[clinic start generated code]*/
12286
12287static PyObject *
12288unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012289/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012290{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 Py_ssize_t i, length;
12292 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012293 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294
12295 if (PyUnicode_READY(self) == -1)
12296 return NULL;
12297 length = PyUnicode_GET_LENGTH(self);
12298 kind = PyUnicode_KIND(self);
12299 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012300
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012301 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 if (length == 1)
12303 return PyBool_FromLong(
12304 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012305
12306 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012308 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 for (i = 0; i < length; i++) {
12311 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012312 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012313 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012314 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012315}
12316
INADA Naoki3ae20562017-01-16 20:41:20 +090012317/*[clinic input]
12318str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012319
INADA Naoki3ae20562017-01-16 20:41:20 +090012320Return True if the string is an alpha-numeric string, False otherwise.
12321
12322A string is alpha-numeric if all characters in the string are alpha-numeric and
12323there is at least one character in the string.
12324[clinic start generated code]*/
12325
12326static PyObject *
12327unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012328/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012329{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012331 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 Py_ssize_t len, i;
12333
12334 if (PyUnicode_READY(self) == -1)
12335 return NULL;
12336
12337 kind = PyUnicode_KIND(self);
12338 data = PyUnicode_DATA(self);
12339 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012340
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012341 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 if (len == 1) {
12343 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12344 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12345 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012346
12347 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012349 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 for (i = 0; i < len; i++) {
12352 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012353 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012354 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012355 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012356 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012357}
12358
INADA Naoki3ae20562017-01-16 20:41:20 +090012359/*[clinic input]
12360str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361
INADA Naoki3ae20562017-01-16 20:41:20 +090012362Return True if the string is a decimal string, False otherwise.
12363
12364A string is a decimal string if all characters in the string are decimal and
12365there is at least one character in the string.
12366[clinic start generated code]*/
12367
12368static PyObject *
12369unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012370/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 Py_ssize_t i, length;
12373 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012374 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375
12376 if (PyUnicode_READY(self) == -1)
12377 return NULL;
12378 length = PyUnicode_GET_LENGTH(self);
12379 kind = PyUnicode_KIND(self);
12380 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012383 if (length == 1)
12384 return PyBool_FromLong(
12385 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012387 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012389 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 for (i = 0; i < length; i++) {
12392 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012393 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012395 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396}
12397
INADA Naoki3ae20562017-01-16 20:41:20 +090012398/*[clinic input]
12399str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400
INADA Naoki3ae20562017-01-16 20:41:20 +090012401Return True if the string is a digit string, False otherwise.
12402
12403A string is a digit string if all characters in the string are digits and there
12404is at least one character in the string.
12405[clinic start generated code]*/
12406
12407static PyObject *
12408unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012409/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 Py_ssize_t i, length;
12412 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012413 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414
12415 if (PyUnicode_READY(self) == -1)
12416 return NULL;
12417 length = PyUnicode_GET_LENGTH(self);
12418 kind = PyUnicode_KIND(self);
12419 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 if (length == 1) {
12423 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12424 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012426
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012427 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012429 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 for (i = 0; i < length; i++) {
12432 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012433 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012435 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436}
12437
INADA Naoki3ae20562017-01-16 20:41:20 +090012438/*[clinic input]
12439str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440
INADA Naoki3ae20562017-01-16 20:41:20 +090012441Return True if the string is a numeric string, False otherwise.
12442
12443A string is numeric if all characters in the string are numeric and there is at
12444least one character in the string.
12445[clinic start generated code]*/
12446
12447static PyObject *
12448unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012449/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 Py_ssize_t i, length;
12452 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012453 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454
12455 if (PyUnicode_READY(self) == -1)
12456 return NULL;
12457 length = PyUnicode_GET_LENGTH(self);
12458 kind = PyUnicode_KIND(self);
12459 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012460
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 if (length == 1)
12463 return PyBool_FromLong(
12464 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012466 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012468 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 for (i = 0; i < length; i++) {
12471 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012472 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012474 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475}
12476
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012477Py_ssize_t
12478_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012479{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012481 if (PyUnicode_READY(self) == -1)
12482 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012483
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012484 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012485 if (len == 0) {
12486 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 }
12489
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012490 int kind = PyUnicode_KIND(self);
12491 const void *data = PyUnicode_DATA(self);
12492 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012493 /* PEP 3131 says that the first character must be in
12494 XID_Start and subsequent characters in XID_Continue,
12495 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012496 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012497 letters, digits, underscore). However, given the current
12498 definition of XID_Start and XID_Continue, it is sufficient
12499 to check just for these, except that _ must be allowed
12500 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012501 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012502 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012503 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012504
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012505 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012506 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012507 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012508 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012509 }
12510 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012511 return i;
12512}
12513
12514int
12515PyUnicode_IsIdentifier(PyObject *self)
12516{
12517 if (PyUnicode_IS_READY(self)) {
12518 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12519 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12520 /* an empty string is not a valid identifier */
12521 return len && i == len;
12522 }
12523 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012524_Py_COMP_DIAG_PUSH
12525_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012526 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012527 if (len == 0) {
12528 /* an empty string is not a valid identifier */
12529 return 0;
12530 }
12531
12532 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012533 Py_UCS4 ch = wstr[i++];
12534#if SIZEOF_WCHAR_T == 2
12535 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12536 && i < len
12537 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12538 {
12539 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12540 i++;
12541 }
12542#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012543 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12544 return 0;
12545 }
12546
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012547 while (i < len) {
12548 ch = wstr[i++];
12549#if SIZEOF_WCHAR_T == 2
12550 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12551 && i < len
12552 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12553 {
12554 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12555 i++;
12556 }
12557#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012558 if (!_PyUnicode_IsXidContinue(ch)) {
12559 return 0;
12560 }
12561 }
12562 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012563_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012564 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012565}
12566
INADA Naoki3ae20562017-01-16 20:41:20 +090012567/*[clinic input]
12568str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012569
INADA Naoki3ae20562017-01-16 20:41:20 +090012570Return True if the string is a valid Python identifier, False otherwise.
12571
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012572Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012573such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012574[clinic start generated code]*/
12575
12576static PyObject *
12577unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012578/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012579{
12580 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12581}
12582
INADA Naoki3ae20562017-01-16 20:41:20 +090012583/*[clinic input]
12584str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012585
INADA Naoki3ae20562017-01-16 20:41:20 +090012586Return True if the string is printable, False otherwise.
12587
12588A string is printable if all of its characters are considered printable in
12589repr() or if it is empty.
12590[clinic start generated code]*/
12591
12592static PyObject *
12593unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012594/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012595{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596 Py_ssize_t i, length;
12597 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012598 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599
12600 if (PyUnicode_READY(self) == -1)
12601 return NULL;
12602 length = PyUnicode_GET_LENGTH(self);
12603 kind = PyUnicode_KIND(self);
12604 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012605
12606 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012607 if (length == 1)
12608 return PyBool_FromLong(
12609 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 for (i = 0; i < length; i++) {
12612 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012613 Py_RETURN_FALSE;
12614 }
12615 }
12616 Py_RETURN_TRUE;
12617}
12618
INADA Naoki3ae20562017-01-16 20:41:20 +090012619/*[clinic input]
12620str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621
INADA Naoki3ae20562017-01-16 20:41:20 +090012622 iterable: object
12623 /
12624
12625Concatenate any number of strings.
12626
Martin Panter91a88662017-01-24 00:30:06 +000012627The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012628The result is returned as a new string.
12629
12630Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12631[clinic start generated code]*/
12632
12633static PyObject *
12634unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012635/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636{
INADA Naoki3ae20562017-01-16 20:41:20 +090012637 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638}
12639
Martin v. Löwis18e16552006-02-15 17:27:45 +000012640static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012641unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 if (PyUnicode_READY(self) == -1)
12644 return -1;
12645 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646}
12647
INADA Naoki3ae20562017-01-16 20:41:20 +090012648/*[clinic input]
12649str.ljust as unicode_ljust
12650
12651 width: Py_ssize_t
12652 fillchar: Py_UCS4 = ' '
12653 /
12654
12655Return a left-justified string of length width.
12656
12657Padding is done using the specified fill character (default is a space).
12658[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659
12660static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012661unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12662/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012664 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012665 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666
Victor Stinnerc4b49542011-12-11 22:44:26 +010012667 if (PyUnicode_GET_LENGTH(self) >= width)
12668 return unicode_result_unchanged(self);
12669
12670 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671}
12672
INADA Naoki3ae20562017-01-16 20:41:20 +090012673/*[clinic input]
12674str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675
INADA Naoki3ae20562017-01-16 20:41:20 +090012676Return a copy of the string converted to lowercase.
12677[clinic start generated code]*/
12678
12679static PyObject *
12680unicode_lower_impl(PyObject *self)
12681/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012683 if (PyUnicode_READY(self) == -1)
12684 return NULL;
12685 if (PyUnicode_IS_ASCII(self))
12686 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012687 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688}
12689
/* Selector values for the strip helpers. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each selector above, used in error messages. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012698
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012699/* externally visible for str.strip(unicode) */
12700PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012701_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012702{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012703 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 int kind;
12705 Py_ssize_t i, j, len;
12706 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012707 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12710 return NULL;
12711
12712 kind = PyUnicode_KIND(self);
12713 data = PyUnicode_DATA(self);
12714 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012715 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12717 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012718 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012719
Benjamin Peterson14339b62009-01-31 16:36:08 +000012720 i = 0;
12721 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012722 while (i < len) {
12723 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12724 if (!BLOOM(sepmask, ch))
12725 break;
12726 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12727 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 i++;
12729 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012730 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012731
Benjamin Peterson14339b62009-01-31 16:36:08 +000012732 j = len;
12733 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012734 j--;
12735 while (j >= i) {
12736 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12737 if (!BLOOM(sepmask, ch))
12738 break;
12739 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12740 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012741 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012742 }
12743
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012745 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012746
Victor Stinner7931d9a2011-11-04 00:22:48 +010012747 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012748}
12749
12750PyObject*
12751PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12752{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012753 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012755 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756
Victor Stinnerde636f32011-10-01 03:55:54 +020012757 if (PyUnicode_READY(self) == -1)
12758 return NULL;
12759
Victor Stinner684d5fd2012-05-03 02:32:34 +020012760 length = PyUnicode_GET_LENGTH(self);
12761 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012762
Victor Stinner684d5fd2012-05-03 02:32:34 +020012763 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012764 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765
Victor Stinnerde636f32011-10-01 03:55:54 +020012766 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012767 PyErr_SetString(PyExc_IndexError, "string index out of range");
12768 return NULL;
12769 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012770 if (start >= length || end < start)
12771 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012772
Victor Stinner684d5fd2012-05-03 02:32:34 +020012773 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012774 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012775 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012776 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012777 }
12778 else {
12779 kind = PyUnicode_KIND(self);
12780 data = PyUnicode_1BYTE_DATA(self);
12781 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012782 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012783 length);
12784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786
12787static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012788do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790 Py_ssize_t len, i, j;
12791
12792 if (PyUnicode_READY(self) == -1)
12793 return NULL;
12794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012796
Victor Stinnercc7af722013-04-09 22:39:24 +020012797 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012798 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012799
12800 i = 0;
12801 if (striptype != RIGHTSTRIP) {
12802 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012803 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012804 if (!_Py_ascii_whitespace[ch])
12805 break;
12806 i++;
12807 }
12808 }
12809
12810 j = len;
12811 if (striptype != LEFTSTRIP) {
12812 j--;
12813 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012814 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012815 if (!_Py_ascii_whitespace[ch])
12816 break;
12817 j--;
12818 }
12819 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012820 }
12821 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012822 else {
12823 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012824 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012825
Victor Stinnercc7af722013-04-09 22:39:24 +020012826 i = 0;
12827 if (striptype != RIGHTSTRIP) {
12828 while (i < len) {
12829 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12830 if (!Py_UNICODE_ISSPACE(ch))
12831 break;
12832 i++;
12833 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012834 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012835
12836 j = len;
12837 if (striptype != LEFTSTRIP) {
12838 j--;
12839 while (j >= i) {
12840 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12841 if (!Py_UNICODE_ISSPACE(ch))
12842 break;
12843 j--;
12844 }
12845 j++;
12846 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012847 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012848
Victor Stinner7931d9a2011-11-04 00:22:48 +010012849 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850}
12851
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012852
12853static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012854do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012855{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012856 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012857 if (PyUnicode_Check(sep))
12858 return _PyUnicode_XStrip(self, striptype, sep);
12859 else {
12860 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012861 "%s arg must be None or str",
12862 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012863 return NULL;
12864 }
12865 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012866
Benjamin Peterson14339b62009-01-31 16:36:08 +000012867 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012868}
12869
12870
INADA Naoki3ae20562017-01-16 20:41:20 +090012871/*[clinic input]
12872str.strip as unicode_strip
12873
12874 chars: object = None
12875 /
12876
Zachary Ware09895c22019-10-09 16:09:00 -050012877Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012878
12879If chars is given and not None, remove characters in chars instead.
12880[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012881
12882static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012883unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012884/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012885{
INADA Naoki3ae20562017-01-16 20:41:20 +090012886 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012887}
12888
12889
INADA Naoki3ae20562017-01-16 20:41:20 +090012890/*[clinic input]
12891str.lstrip as unicode_lstrip
12892
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012893 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012894 /
12895
12896Return a copy of the string with leading whitespace removed.
12897
12898If chars is given and not None, remove characters in chars instead.
12899[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012900
12901static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012902unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012903/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012904{
INADA Naoki3ae20562017-01-16 20:41:20 +090012905 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012906}
12907
12908
INADA Naoki3ae20562017-01-16 20:41:20 +090012909/*[clinic input]
12910str.rstrip as unicode_rstrip
12911
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012912 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012913 /
12914
12915Return a copy of the string with trailing whitespace removed.
12916
12917If chars is given and not None, remove characters in chars instead.
12918[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012919
12920static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012921unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012922/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012923{
INADA Naoki3ae20562017-01-16 20:41:20 +090012924 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012925}
12926
12927
Guido van Rossumd57fd912000-03-10 22:53:23 +000012928static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012929unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012931 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933
Serhiy Storchaka05997252013-01-26 12:14:02 +020012934 if (len < 1)
12935 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012936
Victor Stinnerc4b49542011-12-11 22:44:26 +010012937 /* no repeat, return original string */
12938 if (len == 1)
12939 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012940
Benjamin Petersonbac79492012-01-14 13:34:47 -050012941 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 return NULL;
12943
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012944 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012945 PyErr_SetString(PyExc_OverflowError,
12946 "repeated string is too long");
12947 return NULL;
12948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012950
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012951 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952 if (!u)
12953 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012954 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012957 int kind = PyUnicode_KIND(str);
12958 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012959 if (kind == PyUnicode_1BYTE_KIND) {
12960 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012961 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012962 }
12963 else if (kind == PyUnicode_2BYTE_KIND) {
12964 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012965 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012966 ucs2[n] = fill_char;
12967 } else {
12968 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12969 assert(kind == PyUnicode_4BYTE_KIND);
12970 for (n = 0; n < len; ++n)
12971 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 }
12974 else {
12975 /* number of characters copied this far */
12976 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012977 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012978 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012979 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012981 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012983 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012984 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986 }
12987
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012988 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012989 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012990}
12991
Alexander Belopolsky40018472011-02-26 01:02:56 +000012992PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012993PyUnicode_Replace(PyObject *str,
12994 PyObject *substr,
12995 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012996 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012997{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012998 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12999 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013000 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013001 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013002}
13003
INADA Naoki3ae20562017-01-16 20:41:20 +090013004/*[clinic input]
13005str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006
INADA Naoki3ae20562017-01-16 20:41:20 +090013007 old: unicode
13008 new: unicode
13009 count: Py_ssize_t = -1
13010 Maximum number of occurrences to replace.
13011 -1 (the default value) means replace all occurrences.
13012 /
13013
13014Return a copy with all occurrences of substring old replaced by new.
13015
13016If the optional argument count is given, only the first count occurrences are
13017replaced.
13018[clinic start generated code]*/
13019
13020static PyObject *
13021unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13022 Py_ssize_t count)
13023/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013024{
Benjamin Peterson22a29702012-01-02 09:00:30 -060013025 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013026 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090013027 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013028}
13029
sweeneydea81849b2020-04-22 17:05:48 -040013030/*[clinic input]
13031str.removeprefix as unicode_removeprefix
13032
13033 prefix: unicode
13034 /
13035
13036Return a str with the given prefix string removed if present.
13037
13038If the string starts with the prefix string, return string[len(prefix):].
13039Otherwise, return a copy of the original string.
13040[clinic start generated code]*/
13041
13042static PyObject *
13043unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13044/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13045{
13046 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13047 if (match == -1) {
13048 return NULL;
13049 }
13050 if (match) {
13051 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13052 PyUnicode_GET_LENGTH(self));
13053 }
13054 return unicode_result_unchanged(self);
13055}
13056
13057/*[clinic input]
13058str.removesuffix as unicode_removesuffix
13059
13060 suffix: unicode
13061 /
13062
13063Return a str with the given suffix string removed if present.
13064
13065If the string ends with the suffix string and that suffix is not empty,
13066return string[:-len(suffix)]. Otherwise, return a copy of the original
13067string.
13068[clinic start generated code]*/
13069
13070static PyObject *
13071unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13072/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13073{
13074 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13075 if (match == -1) {
13076 return NULL;
13077 }
13078 if (match) {
13079 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13080 - PyUnicode_GET_LENGTH(suffix));
13081 }
13082 return unicode_result_unchanged(self);
13083}
13084
Alexander Belopolsky40018472011-02-26 01:02:56 +000013085static PyObject *
13086unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013088 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013089 Py_ssize_t isize;
13090 Py_ssize_t osize, squote, dquote, i, o;
13091 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013092 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013093 const void *idata;
13094 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013096 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013097 return NULL;
13098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099 isize = PyUnicode_GET_LENGTH(unicode);
13100 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013102 /* Compute length of output, quote characters, and
13103 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013104 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013105 max = 127;
13106 squote = dquote = 0;
13107 ikind = PyUnicode_KIND(unicode);
13108 for (i = 0; i < isize; i++) {
13109 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013110 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013111 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013112 case '\'': squote++; break;
13113 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013114 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013115 incr = 2;
13116 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013117 default:
13118 /* Fast-path ASCII */
13119 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013120 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013121 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013122 ;
13123 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013124 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013125 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013126 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013128 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013130 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013132 if (osize > PY_SSIZE_T_MAX - incr) {
13133 PyErr_SetString(PyExc_OverflowError,
13134 "string is too long to generate repr");
13135 return NULL;
13136 }
13137 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 }
13139
13140 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013141 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013143 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013144 if (dquote)
13145 /* Both squote and dquote present. Use squote,
13146 and escape them */
13147 osize += squote;
13148 else
13149 quote = '"';
13150 }
Victor Stinner55c08782013-04-14 18:45:39 +020013151 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152
13153 repr = PyUnicode_New(osize, max);
13154 if (repr == NULL)
13155 return NULL;
13156 okind = PyUnicode_KIND(repr);
13157 odata = PyUnicode_DATA(repr);
13158
13159 PyUnicode_WRITE(okind, odata, 0, quote);
13160 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013161 if (unchanged) {
13162 _PyUnicode_FastCopyCharacters(repr, 1,
13163 unicode, 0,
13164 isize);
13165 }
13166 else {
13167 for (i = 0, o = 1; i < isize; i++) {
13168 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013169
Victor Stinner55c08782013-04-14 18:45:39 +020013170 /* Escape quotes and backslashes */
13171 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013172 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013173 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013174 continue;
13175 }
13176
13177 /* Map special whitespace to '\t', \n', '\r' */
13178 if (ch == '\t') {
13179 PyUnicode_WRITE(okind, odata, o++, '\\');
13180 PyUnicode_WRITE(okind, odata, o++, 't');
13181 }
13182 else if (ch == '\n') {
13183 PyUnicode_WRITE(okind, odata, o++, '\\');
13184 PyUnicode_WRITE(okind, odata, o++, 'n');
13185 }
13186 else if (ch == '\r') {
13187 PyUnicode_WRITE(okind, odata, o++, '\\');
13188 PyUnicode_WRITE(okind, odata, o++, 'r');
13189 }
13190
13191 /* Map non-printable US ASCII to '\xhh' */
13192 else if (ch < ' ' || ch == 0x7F) {
13193 PyUnicode_WRITE(okind, odata, o++, '\\');
13194 PyUnicode_WRITE(okind, odata, o++, 'x');
13195 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13196 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13197 }
13198
13199 /* Copy ASCII characters as-is */
13200 else if (ch < 0x7F) {
13201 PyUnicode_WRITE(okind, odata, o++, ch);
13202 }
13203
13204 /* Non-ASCII characters */
13205 else {
13206 /* Map Unicode whitespace and control characters
13207 (categories Z* and C* except ASCII space)
13208 */
13209 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13210 PyUnicode_WRITE(okind, odata, o++, '\\');
13211 /* Map 8-bit characters to '\xhh' */
13212 if (ch <= 0xff) {
13213 PyUnicode_WRITE(okind, odata, o++, 'x');
13214 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13215 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13216 }
13217 /* Map 16-bit characters to '\uxxxx' */
13218 else if (ch <= 0xffff) {
13219 PyUnicode_WRITE(okind, odata, o++, 'u');
13220 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13221 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13222 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13223 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13224 }
13225 /* Map 21-bit characters to '\U00xxxxxx' */
13226 else {
13227 PyUnicode_WRITE(okind, odata, o++, 'U');
13228 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13229 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13230 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13231 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13232 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13233 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13234 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13235 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13236 }
13237 }
13238 /* Copy characters as-is */
13239 else {
13240 PyUnicode_WRITE(okind, odata, o++, ch);
13241 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013242 }
13243 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013244 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013246 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013247 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013248}
13249
/* Docstring text for S.rfind(); presumably attached to unicode_rfind() in
   the method table, which is not visible from here. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013250PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013251 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252\n\
13253Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013254such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255arguments start and end are interpreted as in slice notation.\n\
13256\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013257Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013258
13259static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013260unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013261{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013262 /* initialize variables to prevent gcc warning */
13263 PyObject *substring = NULL;
13264 Py_ssize_t start = 0;
13265 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013266 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013268 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013269 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013271 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013272 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013273
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013274 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013276 if (result == -2)
13277 return NULL;
13278
Christian Heimes217cfd12007-12-02 14:31:20 +000013279 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280}
13281
/* Docstring text for S.rindex(); presumably attached to unicode_rindex() in
   the method table, which is not visible from here. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013282PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013283 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013285Return the highest index in S where substring sub is found,\n\
13286such that sub is contained within S[start:end]. Optional\n\
13287arguments start and end are interpreted as in slice notation.\n\
13288\n\
13289Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013290
13291static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013292unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013293{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013294 /* initialize variables to prevent gcc warning */
13295 PyObject *substring = NULL;
13296 Py_ssize_t start = 0;
13297 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013298 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013299
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013300 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013301 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013302
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013303 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013304 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013305
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013306 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013308 if (result == -2)
13309 return NULL;
13310
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311 if (result < 0) {
13312 PyErr_SetString(PyExc_ValueError, "substring not found");
13313 return NULL;
13314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013315
Christian Heimes217cfd12007-12-02 14:31:20 +000013316 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013317}
13318
INADA Naoki3ae20562017-01-16 20:41:20 +090013319/*[clinic input]
13320str.rjust as unicode_rjust
13321
13322 width: Py_ssize_t
13323 fillchar: Py_UCS4 = ' '
13324 /
13325
13326Return a right-justified string of length width.
13327
13328Padding is done using the specified fill character (default is a space).
13329[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013330
13331static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013332unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13333/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013334{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013335 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013336 return NULL;
13337
Victor Stinnerc4b49542011-12-11 22:44:26 +010013338 if (PyUnicode_GET_LENGTH(self) >= width)
13339 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013340
Victor Stinnerc4b49542011-12-11 22:44:26 +010013341 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013342}
13343
Alexander Belopolsky40018472011-02-26 01:02:56 +000013344PyObject *
13345PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013346{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013347 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013349
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013350 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013351}
13352
INADA Naoki3ae20562017-01-16 20:41:20 +090013353/*[clinic input]
13354str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013355
INADA Naoki3ae20562017-01-16 20:41:20 +090013356 sep: object = None
13357 The delimiter according which to split the string.
13358 None (the default value) means split according to any whitespace,
13359 and discard empty strings from the result.
13360 maxsplit: Py_ssize_t = -1
13361 Maximum number of splits to do.
13362 -1 (the default value) means no limit.
13363
13364Return a list of the words in the string, using sep as the delimiter string.
13365[clinic start generated code]*/
13366
13367static PyObject *
13368unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13369/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013370{
INADA Naoki3ae20562017-01-16 20:41:20 +090013371 if (sep == Py_None)
13372 return split(self, NULL, maxsplit);
13373 if (PyUnicode_Check(sep))
13374 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013375
Victor Stinner998b8062018-09-12 00:23:25 +020013376 PyErr_Format(PyExc_TypeError,
13377 "must be str or None, not %.100s",
13378 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013380}
13381
Thomas Wouters477c8d52006-05-27 19:21:47 +000013382PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013383PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013384{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013385 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013386 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013387 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013388 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013389
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013390 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013392
Victor Stinner14f8f022011-10-05 20:58:25 +020013393 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013394 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013395 len1 = PyUnicode_GET_LENGTH(str_obj);
13396 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013397 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013398 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013399 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013400 }
13401 buf1 = PyUnicode_DATA(str_obj);
13402 buf2 = PyUnicode_DATA(sep_obj);
13403 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013404 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013405 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013406 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013408
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013409 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013410 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013411 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13412 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13413 else
13414 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013415 break;
13416 case PyUnicode_2BYTE_KIND:
13417 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13418 break;
13419 case PyUnicode_4BYTE_KIND:
13420 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13421 break;
13422 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013423 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013424 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013425
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013426 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013427 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013428 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013429
13430 return out;
13431}
13432
13433
13434PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013435PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013436{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013437 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013438 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013439 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013440 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013441
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013442 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013444
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013445 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013446 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013447 len1 = PyUnicode_GET_LENGTH(str_obj);
13448 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013449 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013450 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013451 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013452 }
13453 buf1 = PyUnicode_DATA(str_obj);
13454 buf2 = PyUnicode_DATA(sep_obj);
13455 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013456 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013457 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013458 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013460
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013461 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013462 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013463 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13464 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13465 else
13466 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013467 break;
13468 case PyUnicode_2BYTE_KIND:
13469 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13470 break;
13471 case PyUnicode_4BYTE_KIND:
13472 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13473 break;
13474 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013475 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013476 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013477
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013478 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013479 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013480 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013481
13482 return out;
13483}
13484
INADA Naoki3ae20562017-01-16 20:41:20 +090013485/*[clinic input]
13486str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013487
INADA Naoki3ae20562017-01-16 20:41:20 +090013488 sep: object
13489 /
13490
13491Partition the string into three parts using the given separator.
13492
13493This will search for the separator in the string. If the separator is found,
13494returns a 3-tuple containing the part before the separator, the separator
13495itself, and the part after it.
13496
13497If the separator is not found, returns a 3-tuple containing the original string
13498and two empty strings.
13499[clinic start generated code]*/
13500
13501static PyObject *
13502unicode_partition(PyObject *self, PyObject *sep)
13503/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013504{
INADA Naoki3ae20562017-01-16 20:41:20 +090013505 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013506}
13507
INADA Naoki3ae20562017-01-16 20:41:20 +090013508/*[clinic input]
13509str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013510
INADA Naoki3ae20562017-01-16 20:41:20 +090013511Partition the string into three parts using the given separator.
13512
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013513This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013514the separator is found, returns a 3-tuple containing the part before the
13515separator, the separator itself, and the part after it.
13516
13517If the separator is not found, returns a 3-tuple containing two empty strings
13518and the original string.
13519[clinic start generated code]*/
13520
13521static PyObject *
13522unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013523/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013524{
INADA Naoki3ae20562017-01-16 20:41:20 +090013525 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013526}
13527
Alexander Belopolsky40018472011-02-26 01:02:56 +000013528PyObject *
13529PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013530{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013531 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013532 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013533
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013534 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013535}
13536
INADA Naoki3ae20562017-01-16 20:41:20 +090013537/*[clinic input]
13538str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013539
INADA Naoki3ae20562017-01-16 20:41:20 +090013540Return a list of the words in the string, using sep as the delimiter string.
13541
13542Splits are done starting at the end of the string and working to the front.
13543[clinic start generated code]*/
13544
13545static PyObject *
13546unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13547/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013548{
INADA Naoki3ae20562017-01-16 20:41:20 +090013549 if (sep == Py_None)
13550 return rsplit(self, NULL, maxsplit);
13551 if (PyUnicode_Check(sep))
13552 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013553
Victor Stinner998b8062018-09-12 00:23:25 +020013554 PyErr_Format(PyExc_TypeError,
13555 "must be str or None, not %.100s",
13556 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013557 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013558}
13559
INADA Naoki3ae20562017-01-16 20:41:20 +090013560/*[clinic input]
13561str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013562
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013563 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013564
13565Return a list of the lines in the string, breaking at line boundaries.
13566
13567Line breaks are not included in the resulting list unless keepends is given and
13568true.
13569[clinic start generated code]*/
13570
13571static PyObject *
13572unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013573/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013574{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013575 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013576}
13577
13578static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013579PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013580{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013581 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013582}
13583
INADA Naoki3ae20562017-01-16 20:41:20 +090013584/*[clinic input]
13585str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013586
INADA Naoki3ae20562017-01-16 20:41:20 +090013587Convert uppercase characters to lowercase and lowercase characters to uppercase.
13588[clinic start generated code]*/
13589
13590static PyObject *
13591unicode_swapcase_impl(PyObject *self)
13592/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013593{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013594 if (PyUnicode_READY(self) == -1)
13595 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013596 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013597}
13598
Larry Hastings61272b72014-01-07 12:41:53 -080013599/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013600
Larry Hastings31826802013-10-19 00:09:25 -070013601@staticmethod
13602str.maketrans as unicode_maketrans
13603
13604 x: object
13605
13606 y: unicode=NULL
13607
13608 z: unicode=NULL
13609
13610 /
13611
13612Return a translation table usable for str.translate().
13613
13614If there is only one argument, it must be a dictionary mapping Unicode
13615ordinals (integers) or characters to Unicode ordinals, strings or None.
13616Character keys will be then converted to ordinals.
13617If there are two arguments, they must be strings of equal length, and
13618in the resulting dictionary, each character in x will be mapped to the
13619character at the same position in y. If there is a third argument, it
13620must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013621[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013622
Larry Hastings31826802013-10-19 00:09:25 -070013623static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013624unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013625/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013626{
Georg Brandlceee0772007-11-27 23:48:05 +000013627 PyObject *new = NULL, *key, *value;
13628 Py_ssize_t i = 0;
13629 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013630
Georg Brandlceee0772007-11-27 23:48:05 +000013631 new = PyDict_New();
13632 if (!new)
13633 return NULL;
13634 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013635 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013636 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013637
Georg Brandlceee0772007-11-27 23:48:05 +000013638 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013639 if (!PyUnicode_Check(x)) {
13640 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13641 "be a string if there is a second argument");
13642 goto err;
13643 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013644 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013645 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13646 "arguments must have equal length");
13647 goto err;
13648 }
13649 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013650 x_kind = PyUnicode_KIND(x);
13651 y_kind = PyUnicode_KIND(y);
13652 x_data = PyUnicode_DATA(x);
13653 y_data = PyUnicode_DATA(y);
13654 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13655 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013656 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013657 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013658 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013659 if (!value) {
13660 Py_DECREF(key);
13661 goto err;
13662 }
Georg Brandlceee0772007-11-27 23:48:05 +000013663 res = PyDict_SetItem(new, key, value);
13664 Py_DECREF(key);
13665 Py_DECREF(value);
13666 if (res < 0)
13667 goto err;
13668 }
13669 /* create entries for deleting chars in z */
13670 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013671 z_kind = PyUnicode_KIND(z);
13672 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013673 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013674 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013675 if (!key)
13676 goto err;
13677 res = PyDict_SetItem(new, key, Py_None);
13678 Py_DECREF(key);
13679 if (res < 0)
13680 goto err;
13681 }
13682 }
13683 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013684 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013685 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013686
Georg Brandlceee0772007-11-27 23:48:05 +000013687 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013688 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013689 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13690 "to maketrans it must be a dict");
13691 goto err;
13692 }
13693 /* copy entries into the new dict, converting string keys to int keys */
13694 while (PyDict_Next(x, &i, &key, &value)) {
13695 if (PyUnicode_Check(key)) {
13696 /* convert string keys to integer keys */
13697 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013698 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013699 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13700 "table must be of length 1");
13701 goto err;
13702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013703 kind = PyUnicode_KIND(key);
13704 data = PyUnicode_DATA(key);
13705 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013706 if (!newkey)
13707 goto err;
13708 res = PyDict_SetItem(new, newkey, value);
13709 Py_DECREF(newkey);
13710 if (res < 0)
13711 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013712 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013713 /* just keep integer keys */
13714 if (PyDict_SetItem(new, key, value) < 0)
13715 goto err;
13716 } else {
13717 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13718 "be strings or integers");
13719 goto err;
13720 }
13721 }
13722 }
13723 return new;
13724 err:
13725 Py_DECREF(new);
13726 return NULL;
13727}
13728
INADA Naoki3ae20562017-01-16 20:41:20 +090013729/*[clinic input]
13730str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013731
INADA Naoki3ae20562017-01-16 20:41:20 +090013732 table: object
13733 Translation table, which must be a mapping of Unicode ordinals to
13734 Unicode ordinals, strings, or None.
13735 /
13736
13737Replace each character in the string using the given translation table.
13738
13739The table must implement lookup/indexing via __getitem__, for instance a
13740dictionary or list. If this operation raises LookupError, the character is
13741left untouched. Characters mapped to None are deleted.
13742[clinic start generated code]*/
13743
13744static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013745unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013746/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013748 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013749}
13750
INADA Naoki3ae20562017-01-16 20:41:20 +090013751/*[clinic input]
13752str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013753
INADA Naoki3ae20562017-01-16 20:41:20 +090013754Return a copy of the string converted to uppercase.
13755[clinic start generated code]*/
13756
13757static PyObject *
13758unicode_upper_impl(PyObject *self)
13759/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013760{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013761 if (PyUnicode_READY(self) == -1)
13762 return NULL;
13763 if (PyUnicode_IS_ASCII(self))
13764 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013765 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013766}
13767
INADA Naoki3ae20562017-01-16 20:41:20 +090013768/*[clinic input]
13769str.zfill as unicode_zfill
13770
13771 width: Py_ssize_t
13772 /
13773
13774Pad a numeric string with zeros on the left, to fill a field of the given width.
13775
13776The string is never truncated.
13777[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013778
13779static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013780unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013781/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013782{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013783 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013784 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013785 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013786 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013787 Py_UCS4 chr;
13788
Benjamin Petersonbac79492012-01-14 13:34:47 -050013789 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013791
Victor Stinnerc4b49542011-12-11 22:44:26 +010013792 if (PyUnicode_GET_LENGTH(self) >= width)
13793 return unicode_result_unchanged(self);
13794
13795 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013796
13797 u = pad(self, fill, 0, '0');
13798
Walter Dörwald068325e2002-04-15 13:36:47 +000013799 if (u == NULL)
13800 return NULL;
13801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013802 kind = PyUnicode_KIND(u);
13803 data = PyUnicode_DATA(u);
13804 chr = PyUnicode_READ(kind, data, fill);
13805
13806 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013807 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013808 PyUnicode_WRITE(kind, data, 0, chr);
13809 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013810 }
13811
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013812 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013813 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013814}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013815
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013824PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013825 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013826\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013827Return True if S starts with the specified prefix, False otherwise.\n\
13828With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013829With optional end, stop comparing S at that position.\n\
13830prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013831
13832static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013833unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013834 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013835{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013836 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013837 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013838 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013839 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013840 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013841
Jesus Ceaac451502011-04-20 17:09:23 +020013842 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013843 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013844 if (PyTuple_Check(subobj)) {
13845 Py_ssize_t i;
13846 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013847 substring = PyTuple_GET_ITEM(subobj, i);
13848 if (!PyUnicode_Check(substring)) {
13849 PyErr_Format(PyExc_TypeError,
13850 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013851 "not %.100s",
13852 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013853 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013854 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013855 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013856 if (result == -1)
13857 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013858 if (result) {
13859 Py_RETURN_TRUE;
13860 }
13861 }
13862 /* nothing matched */
13863 Py_RETURN_FALSE;
13864 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013865 if (!PyUnicode_Check(subobj)) {
13866 PyErr_Format(PyExc_TypeError,
13867 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013868 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013869 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013870 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013871 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013872 if (result == -1)
13873 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013874 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013875}
13876
13877
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013878PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013879 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013880\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013881Return True if S ends with the specified suffix, False otherwise.\n\
13882With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013883With optional end, stop comparing S at that position.\n\
13884suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013885
13886static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013887unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013888 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013889{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013890 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013891 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013892 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013893 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013894 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013895
Jesus Ceaac451502011-04-20 17:09:23 +020013896 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013897 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013898 if (PyTuple_Check(subobj)) {
13899 Py_ssize_t i;
13900 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013901 substring = PyTuple_GET_ITEM(subobj, i);
13902 if (!PyUnicode_Check(substring)) {
13903 PyErr_Format(PyExc_TypeError,
13904 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013905 "not %.100s",
13906 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013907 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013908 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013909 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013910 if (result == -1)
13911 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013912 if (result) {
13913 Py_RETURN_TRUE;
13914 }
13915 }
13916 Py_RETURN_FALSE;
13917 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013918 if (!PyUnicode_Check(subobj)) {
13919 PyErr_Format(PyExc_TypeError,
13920 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013921 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013922 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013923 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013924 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013925 if (result == -1)
13926 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013927 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013928}
13929
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013930static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013931_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013932{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013933 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13934 writer->data = PyUnicode_DATA(writer->buffer);
13935
13936 if (!writer->readonly) {
13937 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013938 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013939 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013940 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013941 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13942 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13943 writer->kind = PyUnicode_WCHAR_KIND;
13944 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13945
Victor Stinner8f674cc2013-04-17 23:02:17 +020013946 /* Copy-on-write mode: set buffer size to 0 so
13947 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13948 * next write. */
13949 writer->size = 0;
13950 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013951}
13952
Victor Stinnerd3f08822012-05-29 12:57:52 +020013953void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013954_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013955{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013956 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013957
13958 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013959 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013960
13961 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13962 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13963 writer->kind = PyUnicode_WCHAR_KIND;
13964 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013965}
13966
Inada Naoki770847a2019-06-24 12:30:24 +090013967// Initialize _PyUnicodeWriter with initial buffer
13968static inline void
13969_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13970{
13971 memset(writer, 0, sizeof(*writer));
13972 writer->buffer = buffer;
13973 _PyUnicodeWriter_Update(writer);
13974 writer->min_length = writer->size;
13975}
13976
Victor Stinnerd3f08822012-05-29 12:57:52 +020013977int
13978_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13979 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013980{
13981 Py_ssize_t newlen;
13982 PyObject *newbuffer;
13983
Victor Stinner2740e462016-09-06 16:58:36 -070013984 assert(maxchar <= MAX_UNICODE);
13985
Victor Stinnerca9381e2015-09-22 00:58:32 +020013986 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013987 assert((maxchar > writer->maxchar && length >= 0)
13988 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013989
Victor Stinner202fdca2012-05-07 12:47:02 +020013990 if (length > PY_SSIZE_T_MAX - writer->pos) {
13991 PyErr_NoMemory();
13992 return -1;
13993 }
13994 newlen = writer->pos + length;
13995
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013996 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013997
Victor Stinnerd3f08822012-05-29 12:57:52 +020013998 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013999 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010014000 if (writer->overallocate
14001 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14002 /* overallocate to limit the number of realloc() */
14003 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014004 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014005 if (newlen < writer->min_length)
14006 newlen = writer->min_length;
14007
Victor Stinnerd3f08822012-05-29 12:57:52 +020014008 writer->buffer = PyUnicode_New(newlen, maxchar);
14009 if (writer->buffer == NULL)
14010 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014011 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014012 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010014013 if (writer->overallocate
14014 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14015 /* overallocate to limit the number of realloc() */
14016 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014017 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014018 if (newlen < writer->min_length)
14019 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014020
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014021 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020014022 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030014023 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020014024 newbuffer = PyUnicode_New(newlen, maxchar);
14025 if (newbuffer == NULL)
14026 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014027 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14028 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020014029 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014030 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020014031 }
14032 else {
14033 newbuffer = resize_compact(writer->buffer, newlen);
14034 if (newbuffer == NULL)
14035 return -1;
14036 }
14037 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020014038 }
14039 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014040 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014041 newbuffer = PyUnicode_New(writer->size, maxchar);
14042 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014043 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014044 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14045 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014046 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014047 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014048 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014049 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014050
14051#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014052}
14053
Victor Stinnerca9381e2015-09-22 00:58:32 +020014054int
14055_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14056 enum PyUnicode_Kind kind)
14057{
14058 Py_UCS4 maxchar;
14059
14060 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14061 assert(writer->kind < kind);
14062
14063 switch (kind)
14064 {
14065 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14066 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14067 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
14068 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014069 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014070 }
14071
14072 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14073}
14074
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014075static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014076_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014077{
Victor Stinner2740e462016-09-06 16:58:36 -070014078 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014079 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14080 return -1;
14081 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14082 writer->pos++;
14083 return 0;
14084}
14085
14086int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014087_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14088{
14089 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14090}
14091
14092int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014093_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14094{
14095 Py_UCS4 maxchar;
14096 Py_ssize_t len;
14097
14098 if (PyUnicode_READY(str) == -1)
14099 return -1;
14100 len = PyUnicode_GET_LENGTH(str);
14101 if (len == 0)
14102 return 0;
14103 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14104 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014105 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014106 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014107 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014108 Py_INCREF(str);
14109 writer->buffer = str;
14110 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014111 writer->pos += len;
14112 return 0;
14113 }
14114 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14115 return -1;
14116 }
14117 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14118 str, 0, len);
14119 writer->pos += len;
14120 return 0;
14121}
14122
Victor Stinnere215d962012-10-06 23:03:36 +020014123int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014124_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14125 Py_ssize_t start, Py_ssize_t end)
14126{
14127 Py_UCS4 maxchar;
14128 Py_ssize_t len;
14129
14130 if (PyUnicode_READY(str) == -1)
14131 return -1;
14132
14133 assert(0 <= start);
14134 assert(end <= PyUnicode_GET_LENGTH(str));
14135 assert(start <= end);
14136
14137 if (end == 0)
14138 return 0;
14139
14140 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14141 return _PyUnicodeWriter_WriteStr(writer, str);
14142
14143 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14144 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14145 else
14146 maxchar = writer->maxchar;
14147 len = end - start;
14148
14149 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14150 return -1;
14151
14152 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14153 str, start, len);
14154 writer->pos += len;
14155 return 0;
14156}
14157
14158int
Victor Stinner4a587072013-11-19 12:54:53 +010014159_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14160 const char *ascii, Py_ssize_t len)
14161{
14162 if (len == -1)
14163 len = strlen(ascii);
14164
Andy Lestere6be9b52020-02-11 20:28:35 -060014165 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014166
14167 if (writer->buffer == NULL && !writer->overallocate) {
14168 PyObject *str;
14169
14170 str = _PyUnicode_FromASCII(ascii, len);
14171 if (str == NULL)
14172 return -1;
14173
14174 writer->readonly = 1;
14175 writer->buffer = str;
14176 _PyUnicodeWriter_Update(writer);
14177 writer->pos += len;
14178 return 0;
14179 }
14180
14181 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14182 return -1;
14183
14184 switch (writer->kind)
14185 {
14186 case PyUnicode_1BYTE_KIND:
14187 {
14188 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14189 Py_UCS1 *data = writer->data;
14190
Christian Heimesf051e432016-09-13 20:22:02 +020014191 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014192 break;
14193 }
14194 case PyUnicode_2BYTE_KIND:
14195 {
14196 _PyUnicode_CONVERT_BYTES(
14197 Py_UCS1, Py_UCS2,
14198 ascii, ascii + len,
14199 (Py_UCS2 *)writer->data + writer->pos);
14200 break;
14201 }
14202 case PyUnicode_4BYTE_KIND:
14203 {
14204 _PyUnicode_CONVERT_BYTES(
14205 Py_UCS1, Py_UCS4,
14206 ascii, ascii + len,
14207 (Py_UCS4 *)writer->data + writer->pos);
14208 break;
14209 }
14210 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014211 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014212 }
14213
14214 writer->pos += len;
14215 return 0;
14216}
14217
14218int
14219_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14220 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014221{
14222 Py_UCS4 maxchar;
14223
Andy Lestere6be9b52020-02-11 20:28:35 -060014224 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014225 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14226 return -1;
14227 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14228 writer->pos += len;
14229 return 0;
14230}
14231
Victor Stinnerd3f08822012-05-29 12:57:52 +020014232PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014233_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014234{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014235 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014236
Victor Stinnerd3f08822012-05-29 12:57:52 +020014237 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014238 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014239 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014240 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014241
14242 str = writer->buffer;
14243 writer->buffer = NULL;
14244
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014245 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014246 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14247 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014248 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014249
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014250 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14251 PyObject *str2;
14252 str2 = resize_compact(str, writer->pos);
14253 if (str2 == NULL) {
14254 Py_DECREF(str);
14255 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014256 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014257 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014258 }
14259
Victor Stinner15a0bd32013-07-08 22:29:55 +020014260 assert(_PyUnicode_CheckConsistency(str, 1));
14261 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014262}
14263
Victor Stinnerd3f08822012-05-29 12:57:52 +020014264void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014265_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014266{
14267 Py_CLEAR(writer->buffer);
14268}
14269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014270#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014271
14272PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014273 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014274\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014275Return a formatted version of S, using substitutions from args and kwargs.\n\
14276The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014277
Eric Smith27bbca62010-11-04 17:06:58 +000014278PyDoc_STRVAR(format_map__doc__,
14279 "S.format_map(mapping) -> str\n\
14280\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014281Return a formatted version of S, using substitutions from mapping.\n\
14282The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014283
INADA Naoki3ae20562017-01-16 20:41:20 +090014284/*[clinic input]
14285str.__format__ as unicode___format__
14286
14287 format_spec: unicode
14288 /
14289
14290Return a formatted version of the string as described by format_spec.
14291[clinic start generated code]*/
14292
Eric Smith4a7d76d2008-05-30 18:10:19 +000014293static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014294unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014295/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014296{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014297 _PyUnicodeWriter writer;
14298 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014299
Victor Stinnerd3f08822012-05-29 12:57:52 +020014300 if (PyUnicode_READY(self) == -1)
14301 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014302 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014303 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14304 self, format_spec, 0,
14305 PyUnicode_GET_LENGTH(format_spec));
14306 if (ret == -1) {
14307 _PyUnicodeWriter_Dealloc(&writer);
14308 return NULL;
14309 }
14310 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014311}
14312
INADA Naoki3ae20562017-01-16 20:41:20 +090014313/*[clinic input]
14314str.__sizeof__ as unicode_sizeof
14315
14316Return the size of the string in memory, in bytes.
14317[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014318
14319static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014320unicode_sizeof_impl(PyObject *self)
14321/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014323 Py_ssize_t size;
14324
14325 /* If it's a compact object, account for base structure +
14326 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014327 if (PyUnicode_IS_COMPACT_ASCII(self))
14328 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14329 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014330 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014331 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014332 else {
14333 /* If it is a two-block object, account for base object, and
14334 for character block if present. */
14335 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014336 if (_PyUnicode_DATA_ANY(self))
14337 size += (PyUnicode_GET_LENGTH(self) + 1) *
14338 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014339 }
14340 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014341 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014342 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14343 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14344 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14345 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014346
14347 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014348}
14349
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014350static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014351unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014352{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014353 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014354 if (!copy)
14355 return NULL;
14356 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014357}
14358
Guido van Rossumd57fd912000-03-10 22:53:23 +000014359static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014360 UNICODE_ENCODE_METHODDEF
14361 UNICODE_REPLACE_METHODDEF
14362 UNICODE_SPLIT_METHODDEF
14363 UNICODE_RSPLIT_METHODDEF
14364 UNICODE_JOIN_METHODDEF
14365 UNICODE_CAPITALIZE_METHODDEF
14366 UNICODE_CASEFOLD_METHODDEF
14367 UNICODE_TITLE_METHODDEF
14368 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014369 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014370 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014371 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014372 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014373 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014374 UNICODE_LJUST_METHODDEF
14375 UNICODE_LOWER_METHODDEF
14376 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014377 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14378 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014379 UNICODE_RJUST_METHODDEF
14380 UNICODE_RSTRIP_METHODDEF
14381 UNICODE_RPARTITION_METHODDEF
14382 UNICODE_SPLITLINES_METHODDEF
14383 UNICODE_STRIP_METHODDEF
14384 UNICODE_SWAPCASE_METHODDEF
14385 UNICODE_TRANSLATE_METHODDEF
14386 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014387 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14388 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014389 UNICODE_REMOVEPREFIX_METHODDEF
14390 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014391 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014392 UNICODE_ISLOWER_METHODDEF
14393 UNICODE_ISUPPER_METHODDEF
14394 UNICODE_ISTITLE_METHODDEF
14395 UNICODE_ISSPACE_METHODDEF
14396 UNICODE_ISDECIMAL_METHODDEF
14397 UNICODE_ISDIGIT_METHODDEF
14398 UNICODE_ISNUMERIC_METHODDEF
14399 UNICODE_ISALPHA_METHODDEF
14400 UNICODE_ISALNUM_METHODDEF
14401 UNICODE_ISIDENTIFIER_METHODDEF
14402 UNICODE_ISPRINTABLE_METHODDEF
14403 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014404 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014405 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014406 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014407 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014408 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014409#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014410 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014411 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014412#endif
14413
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014414 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014415 {NULL, NULL}
14416};
14417
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014418static PyObject *
14419unicode_mod(PyObject *v, PyObject *w)
14420{
Brian Curtindfc80e32011-08-10 20:28:54 -050014421 if (!PyUnicode_Check(v))
14422 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014423 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014424}
14425
14426static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014427 0, /*nb_add*/
14428 0, /*nb_subtract*/
14429 0, /*nb_multiply*/
14430 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014431};
14432
Guido van Rossumd57fd912000-03-10 22:53:23 +000014433static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014434 (lenfunc) unicode_length, /* sq_length */
14435 PyUnicode_Concat, /* sq_concat */
14436 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14437 (ssizeargfunc) unicode_getitem, /* sq_item */
14438 0, /* sq_slice */
14439 0, /* sq_ass_item */
14440 0, /* sq_ass_slice */
14441 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014442};
14443
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014444static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014445unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014446{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014447 if (PyUnicode_READY(self) == -1)
14448 return NULL;
14449
Victor Stinnera15e2602020-04-08 02:01:56 +020014450 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014451 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014452 if (i == -1 && PyErr_Occurred())
14453 return NULL;
14454 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014455 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014456 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014457 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014458 Py_ssize_t start, stop, step, slicelength, i;
14459 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014460 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014461 const void *src_data;
14462 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014463 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014464 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014465
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014466 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014467 return NULL;
14468 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014469 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14470 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014471
14472 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014473 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014474 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014475 slicelength == PyUnicode_GET_LENGTH(self)) {
14476 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014477 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014478 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014479 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014480 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014481 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014482 src_kind = PyUnicode_KIND(self);
14483 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014484 if (!PyUnicode_IS_ASCII(self)) {
14485 kind_limit = kind_maxchar_limit(src_kind);
14486 max_char = 0;
14487 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14488 ch = PyUnicode_READ(src_kind, src_data, cur);
14489 if (ch > max_char) {
14490 max_char = ch;
14491 if (max_char >= kind_limit)
14492 break;
14493 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014494 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014495 }
Victor Stinner55c99112011-10-13 01:17:06 +020014496 else
14497 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014498 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014499 if (result == NULL)
14500 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014501 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014502 dest_data = PyUnicode_DATA(result);
14503
14504 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014505 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14506 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014507 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014508 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014509 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014510 } else {
14511 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14512 return NULL;
14513 }
14514}
14515
14516static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014517 (lenfunc)unicode_length, /* mp_length */
14518 (binaryfunc)unicode_subscript, /* mp_subscript */
14519 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014520};
14521
Guido van Rossumd57fd912000-03-10 22:53:23 +000014522
Guido van Rossumd57fd912000-03-10 22:53:23 +000014523/* Helpers for PyUnicode_Format() */
14524
Victor Stinnera47082312012-10-04 02:19:54 +020014525struct unicode_formatter_t {
14526 PyObject *args;
14527 int args_owned;
14528 Py_ssize_t arglen, argidx;
14529 PyObject *dict;
14530
14531 enum PyUnicode_Kind fmtkind;
14532 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014533 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014534 PyObject *fmtstr;
14535
14536 _PyUnicodeWriter writer;
14537};
14538
14539struct unicode_format_arg_t {
14540 Py_UCS4 ch;
14541 int flags;
14542 Py_ssize_t width;
14543 int prec;
14544 int sign;
14545};
14546
Guido van Rossumd57fd912000-03-10 22:53:23 +000014547static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014548unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014549{
Victor Stinnera47082312012-10-04 02:19:54 +020014550 Py_ssize_t argidx = ctx->argidx;
14551
14552 if (argidx < ctx->arglen) {
14553 ctx->argidx++;
14554 if (ctx->arglen < 0)
14555 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014556 else
Victor Stinnera47082312012-10-04 02:19:54 +020014557 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014558 }
14559 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014560 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014561 return NULL;
14562}
14563
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014564/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014565
Victor Stinnera47082312012-10-04 02:19:54 +020014566/* Format a float into the writer if the writer is not NULL, or into *p_output
14567 otherwise.
14568
14569 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014570static int
Victor Stinnera47082312012-10-04 02:19:54 +020014571formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14572 PyObject **p_output,
14573 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014574{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014575 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014576 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014577 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014578 int prec;
14579 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014580
Guido van Rossumd57fd912000-03-10 22:53:23 +000014581 x = PyFloat_AsDouble(v);
14582 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014583 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014584
Victor Stinnera47082312012-10-04 02:19:54 +020014585 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014586 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014587 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014588
Victor Stinnera47082312012-10-04 02:19:54 +020014589 if (arg->flags & F_ALT)
14590 dtoa_flags = Py_DTSF_ALT;
14591 else
14592 dtoa_flags = 0;
14593 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014594 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014595 return -1;
14596 len = strlen(p);
14597 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014598 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014599 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014600 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014601 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014602 }
14603 else
14604 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014605 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014606 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014607}
14608
Victor Stinnerd0880d52012-04-27 23:40:13 +020014609/* formatlong() emulates the format codes d, u, o, x and X, and
14610 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14611 * Python's regular ints.
14612 * Return value: a new PyUnicodeObject*, or NULL if error.
14613 * The output string is of the form
14614 * "-"? ("0x" | "0X")? digit+
14615 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14616 * set in flags. The case of hex digits will be correct,
14617 * There will be at least prec digits, zero-filled on the left if
14618 * necessary to get that many.
14619 * val object to be converted
14620 * flags bitmask of format flags; only F_ALT is looked at
14621 * prec minimum number of digits; 0-fill on left if needed
14622 * type a character in [duoxX]; u acts the same as d
14623 *
14624 * CAUTION: o, x and X conversions on regular ints can never
14625 * produce a '-' sign, but can for Python's unbounded ints.
14626 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014627PyObject *
14628_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014629{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014630 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014631 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014632 Py_ssize_t i;
14633 int sign; /* 1 if '-', else 0 */
14634 int len; /* number of characters */
14635 Py_ssize_t llen;
14636 int numdigits; /* len == numnondigits + numdigits */
14637 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014638
Victor Stinnerd0880d52012-04-27 23:40:13 +020014639 /* Avoid exceeding SSIZE_T_MAX */
14640 if (prec > INT_MAX-3) {
14641 PyErr_SetString(PyExc_OverflowError,
14642 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014643 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014644 }
14645
14646 assert(PyLong_Check(val));
14647
14648 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014649 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014650 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014651 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014652 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014653 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014654 /* int and int subclasses should print numerically when a numeric */
14655 /* format code is used (see issue18780) */
14656 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014657 break;
14658 case 'o':
14659 numnondigits = 2;
14660 result = PyNumber_ToBase(val, 8);
14661 break;
14662 case 'x':
14663 case 'X':
14664 numnondigits = 2;
14665 result = PyNumber_ToBase(val, 16);
14666 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014667 }
14668 if (!result)
14669 return NULL;
14670
14671 assert(unicode_modifiable(result));
14672 assert(PyUnicode_IS_READY(result));
14673 assert(PyUnicode_IS_ASCII(result));
14674
14675 /* To modify the string in-place, there can only be one reference. */
14676 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014677 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014678 PyErr_BadInternalCall();
14679 return NULL;
14680 }
14681 buf = PyUnicode_DATA(result);
14682 llen = PyUnicode_GET_LENGTH(result);
14683 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014684 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014685 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014686 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014687 return NULL;
14688 }
14689 len = (int)llen;
14690 sign = buf[0] == '-';
14691 numnondigits += sign;
14692 numdigits = len - numnondigits;
14693 assert(numdigits > 0);
14694
14695 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014696 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014697 (type == 'o' || type == 'x' || type == 'X'))) {
14698 assert(buf[sign] == '0');
14699 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14700 buf[sign+1] == 'o');
14701 numnondigits -= 2;
14702 buf += 2;
14703 len -= 2;
14704 if (sign)
14705 buf[0] = '-';
14706 assert(len == numnondigits + numdigits);
14707 assert(numdigits > 0);
14708 }
14709
14710 /* Fill with leading zeroes to meet minimum width. */
14711 if (prec > numdigits) {
14712 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14713 numnondigits + prec);
14714 char *b1;
14715 if (!r1) {
14716 Py_DECREF(result);
14717 return NULL;
14718 }
14719 b1 = PyBytes_AS_STRING(r1);
14720 for (i = 0; i < numnondigits; ++i)
14721 *b1++ = *buf++;
14722 for (i = 0; i < prec - numdigits; i++)
14723 *b1++ = '0';
14724 for (i = 0; i < numdigits; i++)
14725 *b1++ = *buf++;
14726 *b1 = '\0';
14727 Py_DECREF(result);
14728 result = r1;
14729 buf = PyBytes_AS_STRING(result);
14730 len = numnondigits + prec;
14731 }
14732
14733 /* Fix up case for hex conversions. */
14734 if (type == 'X') {
14735 /* Need to convert all lower case letters to upper case.
14736 and need to convert 0x to 0X (and -0x to -0X). */
14737 for (i = 0; i < len; i++)
14738 if (buf[i] >= 'a' && buf[i] <= 'x')
14739 buf[i] -= 'a'-'A';
14740 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014741 if (!PyUnicode_Check(result)
14742 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014743 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014744 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014745 Py_DECREF(result);
14746 result = unicode;
14747 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014748 else if (len != PyUnicode_GET_LENGTH(result)) {
14749 if (PyUnicode_Resize(&result, len) < 0)
14750 Py_CLEAR(result);
14751 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014752 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014753}
14754
Ethan Furmandf3ed242014-01-05 06:50:30 -080014755/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014756 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014757 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014758 * -1 and raise an exception on error */
14759static int
Victor Stinnera47082312012-10-04 02:19:54 +020014760mainformatlong(PyObject *v,
14761 struct unicode_format_arg_t *arg,
14762 PyObject **p_output,
14763 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014764{
14765 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014766 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014767
14768 if (!PyNumber_Check(v))
14769 goto wrongtype;
14770
Ethan Furman9ab74802014-03-21 06:38:46 -070014771 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014772 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014773 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014774 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014775 }
14776 else {
14777 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014778 }
14779 if (iobj == NULL ) {
14780 if (PyErr_ExceptionMatches(PyExc_TypeError))
14781 goto wrongtype;
14782 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014783 }
14784 assert(PyLong_Check(iobj));
14785 }
14786 else {
14787 iobj = v;
14788 Py_INCREF(iobj);
14789 }
14790
14791 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014792 && arg->width == -1 && arg->prec == -1
14793 && !(arg->flags & (F_SIGN | F_BLANK))
14794 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014795 {
14796 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014797 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014798 int base;
14799
Victor Stinnera47082312012-10-04 02:19:54 +020014800 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014801 {
14802 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014803 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014804 case 'd':
14805 case 'i':
14806 case 'u':
14807 base = 10;
14808 break;
14809 case 'o':
14810 base = 8;
14811 break;
14812 case 'x':
14813 case 'X':
14814 base = 16;
14815 break;
14816 }
14817
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014818 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14819 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014820 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014821 }
14822 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014823 return 1;
14824 }
14825
Ethan Furmanb95b5612015-01-23 20:05:18 -080014826 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014827 Py_DECREF(iobj);
14828 if (res == NULL)
14829 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014830 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014831 return 0;
14832
14833wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014834 switch(type)
14835 {
14836 case 'o':
14837 case 'x':
14838 case 'X':
14839 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014840 "%%%c format: an integer is required, "
14841 "not %.200s",
14842 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014843 break;
14844 default:
14845 PyErr_Format(PyExc_TypeError,
Serhiy Storchakae2ec0b22020-10-09 14:14:37 +030014846 "%%%c format: a real number is required, "
Victor Stinner998b8062018-09-12 00:23:25 +020014847 "not %.200s",
14848 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014849 break;
14850 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014851 return -1;
14852}
14853
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014854static Py_UCS4
14855formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014856{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014857 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014858 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014859 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014860 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014861 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014862 goto onError;
14863 }
14864 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014865 int overflow;
14866 long x = PyLong_AsLongAndOverflow(v, &overflow);
14867 if (x == -1 && PyErr_Occurred()) {
14868 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014869 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014870 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014871 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014872 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014873
Victor Stinner8faf8212011-12-08 22:14:11 +010014874 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014875 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014876 PyErr_SetString(PyExc_OverflowError,
14877 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014878 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014879 }
14880
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014881 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014882 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014883
Benjamin Peterson29060642009-01-31 22:14:21 +000014884 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014885 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014886 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014887 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014888}
14889
Victor Stinnera47082312012-10-04 02:19:54 +020014890/* Parse options of an argument: flags, width, precision.
14891 Handle also "%(name)" syntax.
14892
14893 Return 0 if the argument has been formatted into arg->str.
14894 Return 1 if the argument has been written into ctx->writer,
14895 Raise an exception and return -1 on error. */
14896static int
14897unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14898 struct unicode_format_arg_t *arg)
14899{
14900#define FORMAT_READ(ctx) \
14901 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14902
14903 PyObject *v;
14904
Victor Stinnera47082312012-10-04 02:19:54 +020014905 if (arg->ch == '(') {
14906 /* Get argument value from a dictionary. Example: "%(name)s". */
14907 Py_ssize_t keystart;
14908 Py_ssize_t keylen;
14909 PyObject *key;
14910 int pcount = 1;
14911
14912 if (ctx->dict == NULL) {
14913 PyErr_SetString(PyExc_TypeError,
14914 "format requires a mapping");
14915 return -1;
14916 }
14917 ++ctx->fmtpos;
14918 --ctx->fmtcnt;
14919 keystart = ctx->fmtpos;
14920 /* Skip over balanced parentheses */
14921 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14922 arg->ch = FORMAT_READ(ctx);
14923 if (arg->ch == ')')
14924 --pcount;
14925 else if (arg->ch == '(')
14926 ++pcount;
14927 ctx->fmtpos++;
14928 }
14929 keylen = ctx->fmtpos - keystart - 1;
14930 if (ctx->fmtcnt < 0 || pcount > 0) {
14931 PyErr_SetString(PyExc_ValueError,
14932 "incomplete format key");
14933 return -1;
14934 }
14935 key = PyUnicode_Substring(ctx->fmtstr,
14936 keystart, keystart + keylen);
14937 if (key == NULL)
14938 return -1;
14939 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014940 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014941 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014942 }
14943 ctx->args = PyObject_GetItem(ctx->dict, key);
14944 Py_DECREF(key);
14945 if (ctx->args == NULL)
14946 return -1;
14947 ctx->args_owned = 1;
14948 ctx->arglen = -1;
14949 ctx->argidx = -2;
14950 }
14951
14952 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014953 while (--ctx->fmtcnt >= 0) {
14954 arg->ch = FORMAT_READ(ctx);
14955 ctx->fmtpos++;
14956 switch (arg->ch) {
14957 case '-': arg->flags |= F_LJUST; continue;
14958 case '+': arg->flags |= F_SIGN; continue;
14959 case ' ': arg->flags |= F_BLANK; continue;
14960 case '#': arg->flags |= F_ALT; continue;
14961 case '0': arg->flags |= F_ZERO; continue;
14962 }
14963 break;
14964 }
14965
14966 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014967 if (arg->ch == '*') {
14968 v = unicode_format_getnextarg(ctx);
14969 if (v == NULL)
14970 return -1;
14971 if (!PyLong_Check(v)) {
14972 PyErr_SetString(PyExc_TypeError,
14973 "* wants int");
14974 return -1;
14975 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014976 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014977 if (arg->width == -1 && PyErr_Occurred())
14978 return -1;
14979 if (arg->width < 0) {
14980 arg->flags |= F_LJUST;
14981 arg->width = -arg->width;
14982 }
14983 if (--ctx->fmtcnt >= 0) {
14984 arg->ch = FORMAT_READ(ctx);
14985 ctx->fmtpos++;
14986 }
14987 }
14988 else if (arg->ch >= '0' && arg->ch <= '9') {
14989 arg->width = arg->ch - '0';
14990 while (--ctx->fmtcnt >= 0) {
14991 arg->ch = FORMAT_READ(ctx);
14992 ctx->fmtpos++;
14993 if (arg->ch < '0' || arg->ch > '9')
14994 break;
14995 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14996 mixing signed and unsigned comparison. Since arg->ch is between
14997 '0' and '9', casting to int is safe. */
14998 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14999 PyErr_SetString(PyExc_ValueError,
15000 "width too big");
15001 return -1;
15002 }
15003 arg->width = arg->width*10 + (arg->ch - '0');
15004 }
15005 }
15006
15007 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020015008 if (arg->ch == '.') {
15009 arg->prec = 0;
15010 if (--ctx->fmtcnt >= 0) {
15011 arg->ch = FORMAT_READ(ctx);
15012 ctx->fmtpos++;
15013 }
15014 if (arg->ch == '*') {
15015 v = unicode_format_getnextarg(ctx);
15016 if (v == NULL)
15017 return -1;
15018 if (!PyLong_Check(v)) {
15019 PyErr_SetString(PyExc_TypeError,
15020 "* wants int");
15021 return -1;
15022 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015023 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015024 if (arg->prec == -1 && PyErr_Occurred())
15025 return -1;
15026 if (arg->prec < 0)
15027 arg->prec = 0;
15028 if (--ctx->fmtcnt >= 0) {
15029 arg->ch = FORMAT_READ(ctx);
15030 ctx->fmtpos++;
15031 }
15032 }
15033 else if (arg->ch >= '0' && arg->ch <= '9') {
15034 arg->prec = arg->ch - '0';
15035 while (--ctx->fmtcnt >= 0) {
15036 arg->ch = FORMAT_READ(ctx);
15037 ctx->fmtpos++;
15038 if (arg->ch < '0' || arg->ch > '9')
15039 break;
15040 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15041 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015042 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015043 return -1;
15044 }
15045 arg->prec = arg->prec*10 + (arg->ch - '0');
15046 }
15047 }
15048 }
15049
15050 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15051 if (ctx->fmtcnt >= 0) {
15052 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15053 if (--ctx->fmtcnt >= 0) {
15054 arg->ch = FORMAT_READ(ctx);
15055 ctx->fmtpos++;
15056 }
15057 }
15058 }
15059 if (ctx->fmtcnt < 0) {
15060 PyErr_SetString(PyExc_ValueError,
15061 "incomplete format");
15062 return -1;
15063 }
15064 return 0;
15065
15066#undef FORMAT_READ
15067}
15068
15069/* Format one argument. Supported conversion specifiers:
15070
15071 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015072 - "i", "d", "u": int or float
15073 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015074 - "e", "E", "f", "F", "g", "G": float
15075 - "c": int or str (1 character)
15076
Victor Stinner8dbd4212012-12-04 09:30:24 +010015077 When possible, the output is written directly into the Unicode writer
15078 (ctx->writer). A string is created when padding is required.
15079
Victor Stinnera47082312012-10-04 02:19:54 +020015080 Return 0 if the argument has been formatted into *p_str,
15081 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015082 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015083static int
15084unicode_format_arg_format(struct unicode_formatter_t *ctx,
15085 struct unicode_format_arg_t *arg,
15086 PyObject **p_str)
15087{
15088 PyObject *v;
15089 _PyUnicodeWriter *writer = &ctx->writer;
15090
15091 if (ctx->fmtcnt == 0)
15092 ctx->writer.overallocate = 0;
15093
Victor Stinnera47082312012-10-04 02:19:54 +020015094 v = unicode_format_getnextarg(ctx);
15095 if (v == NULL)
15096 return -1;
15097
Victor Stinnera47082312012-10-04 02:19:54 +020015098
15099 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015100 case 's':
15101 case 'r':
15102 case 'a':
15103 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15104 /* Fast path */
15105 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15106 return -1;
15107 return 1;
15108 }
15109
15110 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15111 *p_str = v;
15112 Py_INCREF(*p_str);
15113 }
15114 else {
15115 if (arg->ch == 's')
15116 *p_str = PyObject_Str(v);
15117 else if (arg->ch == 'r')
15118 *p_str = PyObject_Repr(v);
15119 else
15120 *p_str = PyObject_ASCII(v);
15121 }
15122 break;
15123
15124 case 'i':
15125 case 'd':
15126 case 'u':
15127 case 'o':
15128 case 'x':
15129 case 'X':
15130 {
15131 int ret = mainformatlong(v, arg, p_str, writer);
15132 if (ret != 0)
15133 return ret;
15134 arg->sign = 1;
15135 break;
15136 }
15137
15138 case 'e':
15139 case 'E':
15140 case 'f':
15141 case 'F':
15142 case 'g':
15143 case 'G':
15144 if (arg->width == -1 && arg->prec == -1
15145 && !(arg->flags & (F_SIGN | F_BLANK)))
15146 {
15147 /* Fast path */
15148 if (formatfloat(v, arg, NULL, writer) == -1)
15149 return -1;
15150 return 1;
15151 }
15152
15153 arg->sign = 1;
15154 if (formatfloat(v, arg, p_str, NULL) == -1)
15155 return -1;
15156 break;
15157
15158 case 'c':
15159 {
15160 Py_UCS4 ch = formatchar(v);
15161 if (ch == (Py_UCS4) -1)
15162 return -1;
15163 if (arg->width == -1 && arg->prec == -1) {
15164 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015165 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015166 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015167 return 1;
15168 }
15169 *p_str = PyUnicode_FromOrdinal(ch);
15170 break;
15171 }
15172
15173 default:
15174 PyErr_Format(PyExc_ValueError,
15175 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015176 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015177 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15178 (int)arg->ch,
15179 ctx->fmtpos - 1);
15180 return -1;
15181 }
15182 if (*p_str == NULL)
15183 return -1;
15184 assert (PyUnicode_Check(*p_str));
15185 return 0;
15186}
15187
15188static int
15189unicode_format_arg_output(struct unicode_formatter_t *ctx,
15190 struct unicode_format_arg_t *arg,
15191 PyObject *str)
15192{
15193 Py_ssize_t len;
15194 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015195 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015196 Py_ssize_t pindex;
15197 Py_UCS4 signchar;
15198 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015199 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015200 Py_ssize_t sublen;
15201 _PyUnicodeWriter *writer = &ctx->writer;
15202 Py_UCS4 fill;
15203
15204 fill = ' ';
15205 if (arg->sign && arg->flags & F_ZERO)
15206 fill = '0';
15207
15208 if (PyUnicode_READY(str) == -1)
15209 return -1;
15210
15211 len = PyUnicode_GET_LENGTH(str);
15212 if ((arg->width == -1 || arg->width <= len)
15213 && (arg->prec == -1 || arg->prec >= len)
15214 && !(arg->flags & (F_SIGN | F_BLANK)))
15215 {
15216 /* Fast path */
15217 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15218 return -1;
15219 return 0;
15220 }
15221
15222 /* Truncate the string for "s", "r" and "a" formats
15223 if the precision is set */
15224 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15225 if (arg->prec >= 0 && len > arg->prec)
15226 len = arg->prec;
15227 }
15228
15229 /* Adjust sign and width */
15230 kind = PyUnicode_KIND(str);
15231 pbuf = PyUnicode_DATA(str);
15232 pindex = 0;
15233 signchar = '\0';
15234 if (arg->sign) {
15235 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15236 if (ch == '-' || ch == '+') {
15237 signchar = ch;
15238 len--;
15239 pindex++;
15240 }
15241 else if (arg->flags & F_SIGN)
15242 signchar = '+';
15243 else if (arg->flags & F_BLANK)
15244 signchar = ' ';
15245 else
15246 arg->sign = 0;
15247 }
15248 if (arg->width < len)
15249 arg->width = len;
15250
15251 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015252 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015253 if (!(arg->flags & F_LJUST)) {
15254 if (arg->sign) {
15255 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015256 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015257 }
15258 else {
15259 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015260 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015261 }
15262 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015263 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15264 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015265 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015266 }
15267
Victor Stinnera47082312012-10-04 02:19:54 +020015268 buflen = arg->width;
15269 if (arg->sign && len == arg->width)
15270 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015271 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015272 return -1;
15273
15274 /* Write the sign if needed */
15275 if (arg->sign) {
15276 if (fill != ' ') {
15277 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15278 writer->pos += 1;
15279 }
15280 if (arg->width > len)
15281 arg->width--;
15282 }
15283
15284 /* Write the numeric prefix for "x", "X" and "o" formats
15285 if the alternate form is used.
15286 For example, write "0x" for the "%#x" format. */
15287 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15288 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15289 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15290 if (fill != ' ') {
15291 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15292 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15293 writer->pos += 2;
15294 pindex += 2;
15295 }
15296 arg->width -= 2;
15297 if (arg->width < 0)
15298 arg->width = 0;
15299 len -= 2;
15300 }
15301
15302 /* Pad left with the fill character if needed */
15303 if (arg->width > len && !(arg->flags & F_LJUST)) {
15304 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015305 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015306 writer->pos += sublen;
15307 arg->width = len;
15308 }
15309
15310 /* If padding with spaces: write sign if needed and/or numeric prefix if
15311 the alternate form is used */
15312 if (fill == ' ') {
15313 if (arg->sign) {
15314 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15315 writer->pos += 1;
15316 }
15317 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15318 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15319 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15320 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15321 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15322 writer->pos += 2;
15323 pindex += 2;
15324 }
15325 }
15326
15327 /* Write characters */
15328 if (len) {
15329 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15330 str, pindex, len);
15331 writer->pos += len;
15332 }
15333
15334 /* Pad right with the fill character if needed */
15335 if (arg->width > len) {
15336 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015337 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015338 writer->pos += sublen;
15339 }
15340 return 0;
15341}
15342
15343/* Helper of PyUnicode_Format(): format one arg.
15344 Return 0 on success, raise an exception and return -1 on error. */
15345static int
15346unicode_format_arg(struct unicode_formatter_t *ctx)
15347{
15348 struct unicode_format_arg_t arg;
15349 PyObject *str;
15350 int ret;
15351
Victor Stinner8dbd4212012-12-04 09:30:24 +010015352 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015353 if (arg.ch == '%') {
15354 ctx->fmtpos++;
15355 ctx->fmtcnt--;
15356 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15357 return -1;
15358 return 0;
15359 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015360 arg.flags = 0;
15361 arg.width = -1;
15362 arg.prec = -1;
15363 arg.sign = 0;
15364 str = NULL;
15365
Victor Stinnera47082312012-10-04 02:19:54 +020015366 ret = unicode_format_arg_parse(ctx, &arg);
15367 if (ret == -1)
15368 return -1;
15369
15370 ret = unicode_format_arg_format(ctx, &arg, &str);
15371 if (ret == -1)
15372 return -1;
15373
15374 if (ret != 1) {
15375 ret = unicode_format_arg_output(ctx, &arg, str);
15376 Py_DECREF(str);
15377 if (ret == -1)
15378 return -1;
15379 }
15380
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015381 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015382 PyErr_SetString(PyExc_TypeError,
15383 "not all arguments converted during string formatting");
15384 return -1;
15385 }
15386 return 0;
15387}
15388
Alexander Belopolsky40018472011-02-26 01:02:56 +000015389PyObject *
15390PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015391{
Victor Stinnera47082312012-10-04 02:19:54 +020015392 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015393
Guido van Rossumd57fd912000-03-10 22:53:23 +000015394 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015395 PyErr_BadInternalCall();
15396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015397 }
Victor Stinnera47082312012-10-04 02:19:54 +020015398
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015399 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015400 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015401
15402 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015403 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15404 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15405 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15406 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015407
Victor Stinner8f674cc2013-04-17 23:02:17 +020015408 _PyUnicodeWriter_Init(&ctx.writer);
15409 ctx.writer.min_length = ctx.fmtcnt + 100;
15410 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015411
Guido van Rossumd57fd912000-03-10 22:53:23 +000015412 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015413 ctx.arglen = PyTuple_Size(args);
15414 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015415 }
15416 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015417 ctx.arglen = -1;
15418 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015419 }
Victor Stinnera47082312012-10-04 02:19:54 +020015420 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015421 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015422 ctx.dict = args;
15423 else
15424 ctx.dict = NULL;
15425 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015426
Victor Stinnera47082312012-10-04 02:19:54 +020015427 while (--ctx.fmtcnt >= 0) {
15428 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015429 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015430
15431 nonfmtpos = ctx.fmtpos++;
15432 while (ctx.fmtcnt >= 0 &&
15433 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15434 ctx.fmtpos++;
15435 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015436 }
Victor Stinnera47082312012-10-04 02:19:54 +020015437 if (ctx.fmtcnt < 0) {
15438 ctx.fmtpos--;
15439 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015440 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015441
Victor Stinnercfc4c132013-04-03 01:48:39 +020015442 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15443 nonfmtpos, ctx.fmtpos) < 0)
15444 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015445 }
15446 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015447 ctx.fmtpos++;
15448 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015449 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015450 }
15451 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015452
Victor Stinnera47082312012-10-04 02:19:54 +020015453 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015454 PyErr_SetString(PyExc_TypeError,
15455 "not all arguments converted during string formatting");
15456 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015457 }
15458
Victor Stinnera47082312012-10-04 02:19:54 +020015459 if (ctx.args_owned) {
15460 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015461 }
Victor Stinnera47082312012-10-04 02:19:54 +020015462 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015463
Benjamin Peterson29060642009-01-31 22:14:21 +000015464 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015465 _PyUnicodeWriter_Dealloc(&ctx.writer);
15466 if (ctx.args_owned) {
15467 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015468 }
15469 return NULL;
15470}
15471
Jeremy Hylton938ace62002-07-17 16:30:39 +000015472static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015473unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15474
15475/*[clinic input]
15476@classmethod
15477str.__new__ as unicode_new
15478
15479 object as x: object = NULL
15480 encoding: str = NULL
15481 errors: str = NULL
15482
15483[clinic start generated code]*/
Guido van Rossume023fe02001-08-30 03:12:59 +000015484
Tim Peters6d6c1a32001-08-02 04:15:00 +000015485static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015486unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15487 const char *errors)
15488/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
Tim Peters6d6c1a32001-08-02 04:15:00 +000015489{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015490 PyObject *unicode;
15491 if (x == NULL) {
15492 unicode = unicode_new_empty();
15493 }
15494 else if (encoding == NULL && errors == NULL) {
15495 unicode = PyObject_Str(x);
15496 }
15497 else {
15498 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15499 }
Tim Peters6d6c1a32001-08-02 04:15:00 +000015500
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015501 if (unicode != NULL && type != &PyUnicode_Type) {
15502 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15503 }
15504 return unicode;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015505}
15506
Guido van Rossume023fe02001-08-30 03:12:59 +000015507static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015508unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
Guido van Rossume023fe02001-08-30 03:12:59 +000015509{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015510 PyObject *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015511 Py_ssize_t length, char_size;
15512 int share_wstr, share_utf8;
15513 unsigned int kind;
15514 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015515
Benjamin Peterson14339b62009-01-31 16:36:08 +000015516 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner910337b2011-10-03 03:20:16 +020015517 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015518 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015519 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015520 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015521
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015522 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015523 if (self == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015524 return NULL;
15525 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015526 kind = PyUnicode_KIND(unicode);
15527 length = PyUnicode_GET_LENGTH(unicode);
15528
15529 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015530#ifdef Py_DEBUG
15531 _PyUnicode_HASH(self) = -1;
15532#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015533 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015534#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015535 _PyUnicode_STATE(self).interned = 0;
15536 _PyUnicode_STATE(self).kind = kind;
15537 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015538 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015539 _PyUnicode_STATE(self).ready = 1;
15540 _PyUnicode_WSTR(self) = NULL;
15541 _PyUnicode_UTF8_LENGTH(self) = 0;
15542 _PyUnicode_UTF8(self) = NULL;
15543 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015544 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015545
15546 share_utf8 = 0;
15547 share_wstr = 0;
15548 if (kind == PyUnicode_1BYTE_KIND) {
15549 char_size = 1;
15550 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15551 share_utf8 = 1;
15552 }
15553 else if (kind == PyUnicode_2BYTE_KIND) {
15554 char_size = 2;
15555 if (sizeof(wchar_t) == 2)
15556 share_wstr = 1;
15557 }
15558 else {
15559 assert(kind == PyUnicode_4BYTE_KIND);
15560 char_size = 4;
15561 if (sizeof(wchar_t) == 4)
15562 share_wstr = 1;
15563 }
15564
15565 /* Ensure we won't overflow the length. */
15566 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15567 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015568 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015569 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015570 data = PyObject_MALLOC((length + 1) * char_size);
15571 if (data == NULL) {
15572 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015573 goto onError;
15574 }
15575
Victor Stinnerc3c74152011-10-02 20:39:55 +020015576 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015577 if (share_utf8) {
15578 _PyUnicode_UTF8_LENGTH(self) = length;
15579 _PyUnicode_UTF8(self) = data;
15580 }
15581 if (share_wstr) {
15582 _PyUnicode_WSTR_LENGTH(self) = length;
15583 _PyUnicode_WSTR(self) = (wchar_t *)data;
15584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015585
Christian Heimesf051e432016-09-13 20:22:02 +020015586 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015587 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015588 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015589#ifdef Py_DEBUG
15590 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15591#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +010015592 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015593
15594onError:
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015595 Py_DECREF(self);
15596 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015597}
15598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015599PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015600"str(object='') -> str\n\
15601str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015602\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015603Create a new string object from the given object. If encoding or\n\
15604errors is specified, then the object must expose a data buffer\n\
15605that will be decoded using the given encoding and error handler.\n\
15606Otherwise, returns the result of object.__str__() (if defined)\n\
15607or repr(object).\n\
15608encoding defaults to sys.getdefaultencoding().\n\
15609errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015610
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015611static PyObject *unicode_iter(PyObject *seq);
15612
Guido van Rossumd57fd912000-03-10 22:53:23 +000015613PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015614 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015615 "str", /* tp_name */
15616 sizeof(PyUnicodeObject), /* tp_basicsize */
15617 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015618 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015619 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015620 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015621 0, /* tp_getattr */
15622 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015623 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015624 unicode_repr, /* tp_repr */
15625 &unicode_as_number, /* tp_as_number */
15626 &unicode_as_sequence, /* tp_as_sequence */
15627 &unicode_as_mapping, /* tp_as_mapping */
15628 (hashfunc) unicode_hash, /* tp_hash*/
15629 0, /* tp_call*/
15630 (reprfunc) unicode_str, /* tp_str */
15631 PyObject_GenericGetAttr, /* tp_getattro */
15632 0, /* tp_setattro */
15633 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015634 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015635 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15636 unicode_doc, /* tp_doc */
15637 0, /* tp_traverse */
15638 0, /* tp_clear */
15639 PyUnicode_RichCompare, /* tp_richcompare */
15640 0, /* tp_weaklistoffset */
15641 unicode_iter, /* tp_iter */
15642 0, /* tp_iternext */
15643 unicode_methods, /* tp_methods */
15644 0, /* tp_members */
15645 0, /* tp_getset */
15646 &PyBaseObject_Type, /* tp_base */
15647 0, /* tp_dict */
15648 0, /* tp_descr_get */
15649 0, /* tp_descr_set */
15650 0, /* tp_dictoffset */
15651 0, /* tp_init */
15652 0, /* tp_alloc */
15653 unicode_new, /* tp_new */
15654 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015655};
15656
15657/* Initialize the Unicode implementation */
15658
Victor Stinner331a6a52019-05-27 16:39:22 +020015659PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015660_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015661{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015662 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015663 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015664 0x000A, /* LINE FEED */
15665 0x000D, /* CARRIAGE RETURN */
15666 0x001C, /* FILE SEPARATOR */
15667 0x001D, /* GROUP SEPARATOR */
15668 0x001E, /* RECORD SEPARATOR */
15669 0x0085, /* NEXT LINE */
15670 0x2028, /* LINE SEPARATOR */
15671 0x2029, /* PARAGRAPH SEPARATOR */
15672 };
15673
Victor Stinner91698d82020-06-25 14:07:40 +020015674 struct _Py_unicode_state *state = &tstate->interp->unicode;
15675 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015676 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015677 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015678
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015679 if (_Py_IsMainInterpreter(tstate)) {
15680 /* initialize the linebreak bloom filter */
15681 bloom_linebreak = make_bloom_mask(
15682 PyUnicode_2BYTE_KIND, linebreak,
15683 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015684
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015685 if (PyType_Ready(&PyUnicode_Type) < 0) {
15686 return _PyStatus_ERR("Can't initialize unicode type");
15687 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015688
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015689 if (PyType_Ready(&EncodingMapType) < 0) {
15690 return _PyStatus_ERR("Can't initialize encoding map type");
15691 }
15692 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15693 return _PyStatus_ERR("Can't initialize field name iterator type");
15694 }
15695 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15696 return _PyStatus_ERR("Can't initialize formatter iter type");
15697 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015698 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015699 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015700}
15701
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015702
Walter Dörwald16807132007-05-25 13:52:07 +000015703void
15704PyUnicode_InternInPlace(PyObject **p)
15705{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015706 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015707#ifdef Py_DEBUG
15708 assert(s != NULL);
15709 assert(_PyUnicode_CHECK(s));
15710#else
Victor Stinner607b1022020-05-05 18:50:30 +020015711 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015712 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015713 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015714#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015715
Benjamin Peterson14339b62009-01-31 16:36:08 +000015716 /* If it's a subclass, we don't really know what putting
15717 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015718 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015719 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015720 }
15721
15722 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015723 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015724 }
15725
15726#ifdef INTERNED_STRINGS
Victor Stinner666ecfb2020-07-02 01:19:57 +020015727 if (PyUnicode_READY(s) == -1) {
15728 PyErr_Clear();
15729 return;
15730 }
15731
Benjamin Peterson14339b62009-01-31 16:36:08 +000015732 if (interned == NULL) {
15733 interned = PyDict_New();
15734 if (interned == NULL) {
15735 PyErr_Clear(); /* Don't leave an exception */
15736 return;
15737 }
15738 }
Victor Stinner607b1022020-05-05 18:50:30 +020015739
15740 PyObject *t;
Berker Peksagced8d4c2016-07-25 04:40:39 +030015741 t = PyDict_SetDefault(interned, s, s);
Victor Stinner607b1022020-05-05 18:50:30 +020015742
Berker Peksagced8d4c2016-07-25 04:40:39 +030015743 if (t == NULL) {
15744 PyErr_Clear();
15745 return;
15746 }
Victor Stinner607b1022020-05-05 18:50:30 +020015747
Berker Peksagced8d4c2016-07-25 04:40:39 +030015748 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015749 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015750 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015751 return;
15752 }
Victor Stinner607b1022020-05-05 18:50:30 +020015753
Victor Stinner3549ca32020-07-03 16:59:12 +020015754 /* The two references in interned dict (key and value) are not counted by
15755 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15756 this. */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015757 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015758 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner7f413a52020-09-23 14:05:32 +020015759#else
15760 // PyDict expects that interned strings have their hash
15761 // (PyASCIIObject.hash) already computed.
15762 (void)unicode_hash(s);
Victor Stinner607b1022020-05-05 18:50:30 +020015763#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015764}
15765
15766void
15767PyUnicode_InternImmortal(PyObject **p)
15768{
Victor Stinner583ee5a2020-10-02 14:49:00 +020015769 if (PyErr_WarnEx(PyExc_DeprecationWarning,
15770 "PyUnicode_InternImmortal() is deprecated; "
15771 "use PyUnicode_InternInPlace() instead", 1) < 0)
15772 {
15773 // The function has no return value, the exception cannot
15774 // be reported to the caller, so just log it.
15775 PyErr_WriteUnraisable(NULL);
15776 }
15777
Benjamin Peterson14339b62009-01-31 16:36:08 +000015778 PyUnicode_InternInPlace(p);
15779 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015780 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015781 Py_INCREF(*p);
15782 }
Walter Dörwald16807132007-05-25 13:52:07 +000015783}
15784
15785PyObject *
15786PyUnicode_InternFromString(const char *cp)
15787{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015788 PyObject *s = PyUnicode_FromString(cp);
15789 if (s == NULL)
15790 return NULL;
15791 PyUnicode_InternInPlace(&s);
15792 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015793}
15794
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015795
Victor Stinner666ecfb2020-07-02 01:19:57 +020015796void
15797_PyUnicode_ClearInterned(PyThreadState *tstate)
Walter Dörwald16807132007-05-25 13:52:07 +000015798{
Victor Stinner666ecfb2020-07-02 01:19:57 +020015799 if (!_Py_IsMainInterpreter(tstate)) {
15800 // interned dict is shared by all interpreters
Benjamin Peterson14339b62009-01-31 16:36:08 +000015801 return;
15802 }
Walter Dörwald16807132007-05-25 13:52:07 +000015803
Victor Stinner666ecfb2020-07-02 01:19:57 +020015804 if (interned == NULL) {
15805 return;
15806 }
15807 assert(PyDict_CheckExact(interned));
15808
15809 PyObject *keys = PyDict_Keys(interned);
15810 if (keys == NULL) {
15811 PyErr_Clear();
15812 return;
15813 }
15814 assert(PyList_CheckExact(keys));
15815
15816 /* Interned unicode strings are not forcibly deallocated; rather, we give
15817 them their stolen references back, and then clear and DECREF the
15818 interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015819
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015820 Py_ssize_t n = PyList_GET_SIZE(keys);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015821#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015822 fprintf(stderr, "releasing %zd interned strings\n", n);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015823
15824 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015825#endif
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015826 for (Py_ssize_t i = 0; i < n; i++) {
15827 PyObject *s = PyList_GET_ITEM(keys, i);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015828 assert(PyUnicode_IS_READY(s));
15829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015830 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015831 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015832 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015833#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015834 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015835#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015836 break;
15837 case SSTATE_INTERNED_MORTAL:
Victor Stinner3549ca32020-07-03 16:59:12 +020015838 // Restore the two references (key and value) ignored
15839 // by PyUnicode_InternInPlace().
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015840 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015841#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015842 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015843#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015844 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015845 case SSTATE_NOT_INTERNED:
15846 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015847 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015848 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015850 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015851 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015852#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015853 fprintf(stderr,
15854 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15855 mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015856#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015857 Py_DECREF(keys);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015858
Benjamin Peterson14339b62009-01-31 16:36:08 +000015859 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015860 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015861}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015862
15863
15864/********************* Unicode Iterator **************************/
15865
15866typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015867 PyObject_HEAD
15868 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015869 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015870} unicodeiterobject;
15871
15872static void
15873unicodeiter_dealloc(unicodeiterobject *it)
15874{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015875 _PyObject_GC_UNTRACK(it);
15876 Py_XDECREF(it->it_seq);
15877 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015878}
15879
15880static int
15881unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15882{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015883 Py_VISIT(it->it_seq);
15884 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015885}
15886
15887static PyObject *
15888unicodeiter_next(unicodeiterobject *it)
15889{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015890 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015891
Benjamin Peterson14339b62009-01-31 16:36:08 +000015892 assert(it != NULL);
15893 seq = it->it_seq;
15894 if (seq == NULL)
15895 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015896 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015898 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15899 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015900 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015901 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15902 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015903 if (item != NULL)
15904 ++it->it_index;
15905 return item;
15906 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015907
Benjamin Peterson14339b62009-01-31 16:36:08 +000015908 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015909 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015910 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015911}
15912
15913static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015914unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015915{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015916 Py_ssize_t len = 0;
15917 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015918 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015919 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015920}
15921
15922PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15923
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015924static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015925unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015926{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015927 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015928 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015929 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015930 it->it_seq, it->it_index);
15931 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015932 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015933 if (u == NULL)
15934 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015935 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015936 }
15937}
15938
15939PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15940
15941static PyObject *
15942unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15943{
15944 Py_ssize_t index = PyLong_AsSsize_t(state);
15945 if (index == -1 && PyErr_Occurred())
15946 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015947 if (it->it_seq != NULL) {
15948 if (index < 0)
15949 index = 0;
15950 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15951 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15952 it->it_index = index;
15953 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015954 Py_RETURN_NONE;
15955}
15956
15957PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15958
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015959static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015960 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015961 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015962 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15963 reduce_doc},
15964 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15965 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015966 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015967};
15968
15969PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015970 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15971 "str_iterator", /* tp_name */
15972 sizeof(unicodeiterobject), /* tp_basicsize */
15973 0, /* tp_itemsize */
15974 /* methods */
15975 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015976 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015977 0, /* tp_getattr */
15978 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015979 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015980 0, /* tp_repr */
15981 0, /* tp_as_number */
15982 0, /* tp_as_sequence */
15983 0, /* tp_as_mapping */
15984 0, /* tp_hash */
15985 0, /* tp_call */
15986 0, /* tp_str */
15987 PyObject_GenericGetAttr, /* tp_getattro */
15988 0, /* tp_setattro */
15989 0, /* tp_as_buffer */
15990 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15991 0, /* tp_doc */
15992 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15993 0, /* tp_clear */
15994 0, /* tp_richcompare */
15995 0, /* tp_weaklistoffset */
15996 PyObject_SelfIter, /* tp_iter */
15997 (iternextfunc)unicodeiter_next, /* tp_iternext */
15998 unicodeiter_methods, /* tp_methods */
15999 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016000};
16001
16002static PyObject *
16003unicode_iter(PyObject *seq)
16004{
Benjamin Peterson14339b62009-01-31 16:36:08 +000016005 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016006
Benjamin Peterson14339b62009-01-31 16:36:08 +000016007 if (!PyUnicode_Check(seq)) {
16008 PyErr_BadInternalCall();
16009 return NULL;
16010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020016011 if (PyUnicode_READY(seq) == -1)
16012 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016013 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16014 if (it == NULL)
16015 return NULL;
16016 it->it_index = 0;
16017 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020016018 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016019 _PyObject_GC_TRACK(it);
16020 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016021}
16022
Victor Stinner709d23d2019-05-02 14:56:30 -040016023static int
16024encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016025{
Victor Stinner709d23d2019-05-02 14:56:30 -040016026 int res;
16027 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16028 if (res == -2) {
16029 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16030 return -1;
16031 }
16032 if (res < 0) {
16033 PyErr_NoMemory();
16034 return -1;
16035 }
16036 return 0;
16037}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016038
Victor Stinner709d23d2019-05-02 14:56:30 -040016039
16040static int
16041config_get_codec_name(wchar_t **config_encoding)
16042{
16043 char *encoding;
16044 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16045 return -1;
16046 }
16047
16048 PyObject *name_obj = NULL;
16049 PyObject *codec = _PyCodec_Lookup(encoding);
16050 PyMem_RawFree(encoding);
16051
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016052 if (!codec)
16053 goto error;
16054
16055 name_obj = PyObject_GetAttrString(codec, "name");
16056 Py_CLEAR(codec);
16057 if (!name_obj) {
16058 goto error;
16059 }
16060
Victor Stinner709d23d2019-05-02 14:56:30 -040016061 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16062 Py_DECREF(name_obj);
16063 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016064 goto error;
16065 }
16066
Victor Stinner709d23d2019-05-02 14:56:30 -040016067 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16068 if (raw_wname == NULL) {
16069 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016070 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016071 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016072 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016073
16074 PyMem_RawFree(*config_encoding);
16075 *config_encoding = raw_wname;
16076
16077 PyMem_Free(wname);
16078 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016079
16080error:
16081 Py_XDECREF(codec);
16082 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016083 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016084}
16085
16086
Victor Stinner331a6a52019-05-27 16:39:22 +020016087static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016088init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016089{
Victor Stinner709d23d2019-05-02 14:56:30 -040016090 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016091 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016092 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016093 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016094 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016095 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016096 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016097}
16098
16099
Victor Stinner709d23d2019-05-02 14:56:30 -040016100static int
16101init_fs_codec(PyInterpreterState *interp)
16102{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016103 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016104
16105 _Py_error_handler error_handler;
16106 error_handler = get_error_handler_wide(config->filesystem_errors);
16107 if (error_handler == _Py_ERROR_UNKNOWN) {
16108 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16109 return -1;
16110 }
16111
16112 char *encoding, *errors;
16113 if (encode_wstr_utf8(config->filesystem_encoding,
16114 &encoding,
16115 "filesystem_encoding") < 0) {
16116 return -1;
16117 }
16118
16119 if (encode_wstr_utf8(config->filesystem_errors,
16120 &errors,
16121 "filesystem_errors") < 0) {
16122 PyMem_RawFree(encoding);
16123 return -1;
16124 }
16125
Victor Stinner3d17c042020-05-14 01:48:38 +020016126 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16127 PyMem_RawFree(fs_codec->encoding);
16128 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016129 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016130 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16131 PyMem_RawFree(fs_codec->errors);
16132 fs_codec->errors = errors;
16133 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016134
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016135#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016136 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016137#endif
16138
Victor Stinner709d23d2019-05-02 14:56:30 -040016139 /* At this point, PyUnicode_EncodeFSDefault() and
16140 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16141 the C implementation of the filesystem encoding. */
16142
16143 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16144 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016145 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16146 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016147 PyErr_NoMemory();
16148 return -1;
16149 }
16150 return 0;
16151}
16152
16153
Victor Stinner331a6a52019-05-27 16:39:22 +020016154static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016155init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016156{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016157 PyInterpreterState *interp = tstate->interp;
16158
Victor Stinner709d23d2019-05-02 14:56:30 -040016159 /* Update the filesystem encoding to the normalized Python codec name.
16160 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16161 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016162 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016163 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016164 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016165 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016166 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016167 }
16168
Victor Stinner709d23d2019-05-02 14:56:30 -040016169 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016170 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016171 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016172 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016173}
16174
16175
Victor Stinner331a6a52019-05-27 16:39:22 +020016176PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016177_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016178{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016179 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016180 if (_PyStatus_EXCEPTION(status)) {
16181 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016182 }
16183
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016184 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016185}
16186
16187
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016188static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016189_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016190{
Victor Stinner3d17c042020-05-14 01:48:38 +020016191 PyMem_RawFree(fs_codec->encoding);
16192 fs_codec->encoding = NULL;
16193 fs_codec->utf8 = 0;
16194 PyMem_RawFree(fs_codec->errors);
16195 fs_codec->errors = NULL;
16196 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016197}
16198
16199
#ifdef MS_WINDOWS
/* Set the filesystem encoding of the current interpreter to mbcs/replace
   (PEP 529) and re-initialize its filesystem codec.

   Return 0 on success.  Return -1 with MemoryError set if the new
   configuration strings cannot be allocated (the configuration is then
   left unchanged); otherwise return the result of init_fs_codec(). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    /* The configuration is modified in place, hence the cast dropping
       the const qualifier. */
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacement strings before modifying the
       configuration. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16225
16226
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016227void
Victor Stinner3d483342019-11-22 12:27:50 +010016228_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016229{
Victor Stinner666ecfb2020-07-02 01:19:57 +020016230 // _PyUnicode_ClearInterned() must be called before
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016231
Victor Stinner666ecfb2020-07-02 01:19:57 +020016232 struct _Py_unicode_state *state = &tstate->interp->unicode;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016233
Victor Stinner91698d82020-06-25 14:07:40 +020016234 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016235
Victor Stinner2f9ada92020-06-24 02:22:21 +020016236 for (Py_ssize_t i = 0; i < 256; i++) {
16237 Py_CLEAR(state->latin1[i]);
16238 }
16239
Victor Stinner666ecfb2020-07-02 01:19:57 +020016240 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016241 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016242 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016243
Victor Stinner3d17c042020-05-14 01:48:38 +020016244 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016245}
16246
16247
Georg Brandl66c221e2010-10-14 07:04:07 +000016248/* A _string module, to export formatter_parser and formatter_field_name_split
16249 to the string.Formatter class implemented in Python. */
16250
16251static PyMethodDef _string_methods[] = {
16252 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16253 METH_O, PyDoc_STR("split the argument as a field name")},
16254 {"formatter_parser", (PyCFunction) formatter_parser,
16255 METH_O, PyDoc_STR("parse the argument as a format string")},
16256 {NULL, NULL}
16257};
16258
16259static struct PyModuleDef _string_module = {
16260 PyModuleDef_HEAD_INIT,
Victor Stinnerbb083d32020-09-08 15:33:08 +020016261 .m_name = "_string",
16262 .m_doc = PyDoc_STR("string helper module"),
16263 .m_size = 0,
16264 .m_methods = _string_methods,
Georg Brandl66c221e2010-10-14 07:04:07 +000016265};
16266
16267PyMODINIT_FUNC
16268PyInit__string(void)
16269{
Victor Stinnerbb083d32020-09-08 15:33:08 +020016270 return PyModuleDef_Init(&_string_module);
Georg Brandl66c221e2010-10-14 07:04:07 +000016271}
16272
16273
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016274#ifdef __cplusplus
16275}
16276#endif