blob: 648dd15ca09f58801aee00732267c03402988197 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner91698d82020-06-25 14:07:40 +020044#include "pycore_bytes_methods.h" // _Py_bytes_lower()
45#include "pycore_initconfig.h" // _PyStatus_OK()
Victor Stinnere5014be2020-04-14 17:52:15 +020046#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinner91698d82020-06-25 14:07:40 +020047#include "pycore_object.h" // _PyObject_GC_TRACK()
48#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
49#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
Victor Stinnere5014be2020-04-14 17:52:15 +020050#include "pycore_pystate.h" // _PyInterpreterState_GET()
Victor Stinner91698d82020-06-25 14:07:40 +020051#include "ucnhash.h" // _PyUnicode_Name_CAPI
52#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000054#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000055#include <windows.h>
56#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000057
Victor Stinner666ecfb2020-07-02 01:19:57 +020058/* Uncomment to display statistics on interned strings at exit
59 in _PyUnicode_ClearInterned(). */
Victor Stinnerfecc4f22019-03-19 14:20:29 +010060/* #define INTERNED_STATS 1 */
61
62
Larry Hastings61272b72014-01-07 12:41:53 -080063/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090064class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080065[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090066/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
67
68/*[python input]
69class Py_UCS4_converter(CConverter):
70 type = 'Py_UCS4'
71 converter = 'convert_uc'
72
73 def converter_init(self):
74 if self.default is not unspecified:
75 self.c_default = ascii(self.default)
76 if len(self.c_default) > 4 or self.c_default[0] != "'":
77 self.c_default = hex(ord(self.default))
78
79[python start generated code]*/
80/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080081
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000082/* --- Globals ------------------------------------------------------------
83
Serhiy Storchaka05997252013-01-26 12:14:02 +020084NOTE: In the interpreter's initialization phase, some globals are currently
85 initialized dynamically as needed. In the process Unicode objects may
86 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Validate a str object.  Debug builds run the consistency checker with the
   O(n) content scan disabled (check_content=0); other builds only perform
   the type check. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200103
/* Direct access to the utf8 field; performs no checks. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* Checked access to the UTF-8 buffer of a ready string: for a compact
   ASCII string this is the memory right after the PyASCIIObject header,
   otherwise it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
        ? ((char *)((PyASCIIObject *)(op) + 1))         \
        : _PyUnicode_UTF8(op))

/* Direct access to the utf8_length field; performs no checks. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Checked access to the UTF-8 length of a ready string: a compact ASCII
   string reuses its character length, other strings use utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
        ? ((PyASCIIObject *)(op))->length               \
        : _PyUnicode_UTF8_LENGTH(op))

/* Direct access to the wstr field; performs no checks. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900122
/* Don't use deprecated macro of unicodeobject.h.

   Local replacement: evaluates to the character length for a compact ASCII
   string and to wstr_length otherwise.  The argument is parenthesized
   inside each cast so that expression arguments (for example `p + 1`) are
   converted as a whole instead of having only their first operand cast. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject *)(op))->length :                  \
     ((PyCompactUnicodeObject *)(op))->wstr_length)
/* Unchecked field accessors: each one casts the object to the struct that
   declares the field and yields the field itself. */
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)

/* Checked accessors: assert that the object passes _PyUnicode_CHECK()
   before reading the field. */
#define _PyUnicode_KIND(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)               \
    (assert(_PyUnicode_CHECK(op)),              \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200145
/* Local replacement for PyUnicode_READY(): after asserting that the object
   passes _PyUnicode_CHECK(), evaluates to 0 when the string is already
   ready and to the result of _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200152
/* True if the utf8 pointer of the object is its own character data.
   Asserts that the object is not a compact ASCII string. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is its own character data.
   Both sides of the comparison use the macro argument `op`, so the macro
   no longer depends on the caller having a variable named `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
160
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr block:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op)                               \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
174
/* Generic element-wise copy between character buffers of different widths.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters ("from_type *" pointers); to points to the
   destination buffer ("to_type *").  Each source element is cast to
   to_type.  Elements are copied four at a time for the largest multiple
   of four, then one at a time for the remainder. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t _count = (_end) - (_iter);                           \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                     \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {         \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < (_end); ++_iter, ++_to) {                        \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200198
/* Overallocation factor.  Per the original tuning notes, overallocating by
   50% (factor 2) is best on Windows and by 25% (factor 4) is best on
   Linux. */
#ifndef MS_WINDOWS
#  define OVERALLOCATE_FACTOR 4
#else
#  define OVERALLOCATE_FACTOR 2
#endif

/* bpo-40521: Interned strings are shared by all interpreters, so they are
   only enabled when EXPERIMENTAL_ISOLATED_SUBINTERPRETERS is not defined. */
#if !defined(EXPERIMENTAL_ISOLATED_SUBINTERPRETERS)
#  define INTERNED_STRINGS
#endif
211
/* Dictionary holding all interned unicode strings.  The references kept by
   this dictionary are *not* counted in the strings' ob_refcnt: when an
   interned string reaches a refcnt of 0, the string deallocation function
   deletes the reference from this dictionary.

   Put differently, the actual reference count of a string is:
   s->ob_refcnt + (s->state ? 2 : 0)
*/
#ifdef INTERNED_STRINGS
static PyObject *interned = NULL;
#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000223
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200224static struct _Py_unicode_state*
225get_unicode_state(void)
226{
227 PyInterpreterState *interp = _PyInterpreterState_GET();
228 return &interp->unicode;
229}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200230
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200232// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200233static inline PyObject* unicode_get_empty(void)
234{
235 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200236 // unicode_get_empty() must not be called before _PyUnicode_Init()
237 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200238 assert(state->empty_string != NULL);
239 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200240}
241
Victor Stinner91698d82020-06-25 14:07:40 +0200242
243// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200244static inline PyObject* unicode_new_empty(void)
245{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200246 PyObject *empty = unicode_get_empty();
247 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200248 return empty;
249}
250
251#define _Py_RETURN_UNICODE_EMPTY() \
252 do { \
253 return unicode_new_empty(); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200254 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000255
Victor Stinner59423e32018-11-26 13:40:01 +0100256static inline void
257unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
258 Py_ssize_t start, Py_ssize_t length)
259{
260 assert(0 <= start);
261 assert(kind != PyUnicode_WCHAR_KIND);
262 switch (kind) {
263 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100264 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100265 Py_UCS1 ch = (unsigned char)value;
266 Py_UCS1 *to = (Py_UCS1 *)data + start;
267 memset(to, ch, length);
268 break;
269 }
270 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100271 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100272 Py_UCS2 ch = (Py_UCS2)value;
273 Py_UCS2 *to = (Py_UCS2 *)data + start;
274 const Py_UCS2 *end = to + length;
275 for (; to < end; ++to) *to = ch;
276 break;
277 }
278 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100279 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100280 Py_UCS4 ch = value;
281 Py_UCS4 * to = (Py_UCS4 *)data + start;
282 const Py_UCS4 *end = to + length;
283 for (; to < end; ++to) *to = ch;
284 break;
285 }
286 default: Py_UNREACHABLE();
287 }
288}
289
290
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200291/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700292static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200293_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900294static inline void
295_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400296static PyObject *
297unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
298 const char *errors);
299static PyObject *
300unicode_decode_utf8(const char *s, Py_ssize_t size,
301 _Py_error_handler error_handler, const char *errors,
302 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200303
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200304/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200305static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200306
/* Fast detection of the most frequent whitespace characters.

   Indexed by code point (0..127): an entry is 1 for the characters listed
   below and 0 (the implicit default) for every other index. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
337
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200338/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200339static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200340static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100341static int unicode_modifiable(PyObject *unicode);
342
Victor Stinnerfe226c02011-10-03 03:52:20 +0200343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100345_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200346static PyObject *
347_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
348static PyObject *
349_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
350
351static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000353 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100354 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000355 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
356
Alexander Belopolsky40018472011-02-26 01:02:56 +0000357static void
358raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300359 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100360 PyObject *unicode,
361 Py_ssize_t startpos, Py_ssize_t endpos,
362 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000363
/* Fast detection of line break characters.

   Indexed by code point (0..127): an entry is 1 for the characters listed
   below and 0 (the implicit default) for every other index. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
391
INADA Naoki3ae20562017-01-16 20:41:20 +0900392static int convert_uc(PyObject *obj, void *addr);
393
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300394#include "clinic/unicodeobject.c.h"
395
Victor Stinner3d4226a2018-08-29 22:21:32 +0200396_Py_error_handler
397_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200398{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200400 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200401 }
402 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200403 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200404 }
405 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200406 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200407 }
408 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200409 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200410 }
411 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200412 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200413 }
414 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200415 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200416 }
417 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200418 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200419 }
Victor Stinner50149202015-09-22 00:26:54 +0200420 return _Py_ERROR_OTHER;
421}
422
Victor Stinner709d23d2019-05-02 14:56:30 -0400423
424static _Py_error_handler
425get_error_handler_wide(const wchar_t *errors)
426{
427 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428 return _Py_ERROR_STRICT;
429 }
430 if (wcscmp(errors, L"surrogateescape") == 0) {
431 return _Py_ERROR_SURROGATEESCAPE;
432 }
433 if (wcscmp(errors, L"replace") == 0) {
434 return _Py_ERROR_REPLACE;
435 }
436 if (wcscmp(errors, L"ignore") == 0) {
437 return _Py_ERROR_IGNORE;
438 }
439 if (wcscmp(errors, L"backslashreplace") == 0) {
440 return _Py_ERROR_BACKSLASHREPLACE;
441 }
442 if (wcscmp(errors, L"surrogatepass") == 0) {
443 return _Py_ERROR_SURROGATEPASS;
444 }
445 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446 return _Py_ERROR_XMLCHARREFREPLACE;
447 }
448 return _Py_ERROR_OTHER;
449}
450
451
Victor Stinner22eb6892019-06-26 00:51:05 +0200452static inline int
453unicode_check_encoding_errors(const char *encoding, const char *errors)
454{
455 if (encoding == NULL && errors == NULL) {
456 return 0;
457 }
458
Victor Stinner81a7be32020-04-14 15:14:01 +0200459 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200460#ifndef Py_DEBUG
461 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200462 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200463 return 0;
464 }
465#else
466 /* Always check in debug mode */
467#endif
468
469 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200471 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200472 return 0;
473 }
474
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200475 /* Disable checks during Python finalization. For example, it allows to
476 call _PyObject_Dump() during finalization for debugging purpose. */
477 if (interp->finalizing) {
478 return 0;
479 }
480
Victor Stinner22eb6892019-06-26 00:51:05 +0200481 if (encoding != NULL) {
482 PyObject *handler = _PyCodec_Lookup(encoding);
483 if (handler == NULL) {
484 return -1;
485 }
486 Py_DECREF(handler);
487 }
488
489 if (errors != NULL) {
490 PyObject *handler = PyCodec_LookupError(errors);
491 if (handler == NULL) {
492 return -1;
493 }
494 Py_DECREF(handler);
495 }
496 return 0;
497}
498
499
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200500int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100501_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200502{
Victor Stinner68762572019-10-07 18:42:01 +0200503#define CHECK(expr) \
504 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
505
Victor Stinner910337b2011-10-03 03:20:16 +0200506 PyASCIIObject *ascii;
507 unsigned int kind;
508
Victor Stinner68762572019-10-07 18:42:01 +0200509 assert(op != NULL);
510 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200511
512 ascii = (PyASCIIObject *)op;
513 kind = ascii->state.kind;
514
Victor Stinnera3b334d2011-10-03 13:53:37 +0200515 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200516 CHECK(kind == PyUnicode_1BYTE_KIND);
517 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200518 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200519 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200520 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200521 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200522
Victor Stinnera41463c2011-10-04 01:05:08 +0200523 if (ascii->state.compact == 1) {
524 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200526 || kind == PyUnicode_2BYTE_KIND
527 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200528 CHECK(ascii->state.ascii == 0);
529 CHECK(ascii->state.ready == 1);
530 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100531 }
532 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
534
535 data = unicode->data.any;
536 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200537 CHECK(ascii->length == 0);
538 CHECK(ascii->hash == -1);
539 CHECK(ascii->state.compact == 0);
540 CHECK(ascii->state.ascii == 0);
541 CHECK(ascii->state.ready == 0);
542 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
543 CHECK(ascii->wstr != NULL);
544 CHECK(data == NULL);
545 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200546 }
547 else {
Victor Stinner68762572019-10-07 18:42:01 +0200548 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200549 || kind == PyUnicode_2BYTE_KIND
550 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200551 CHECK(ascii->state.compact == 0);
552 CHECK(ascii->state.ready == 1);
553 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200554 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200555 CHECK(compact->utf8 == data);
556 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 }
558 else
Victor Stinner68762572019-10-07 18:42:01 +0200559 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 }
561 }
562 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200563 if (
564#if SIZEOF_WCHAR_T == 2
565 kind == PyUnicode_2BYTE_KIND
566#else
567 kind == PyUnicode_4BYTE_KIND
568#endif
569 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200570 {
Victor Stinner68762572019-10-07 18:42:01 +0200571 CHECK(ascii->wstr == data);
572 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200573 } else
Victor Stinner68762572019-10-07 18:42:01 +0200574 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200575 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200576
577 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200578 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200579 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200580 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200581 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200582
583 /* check that the best kind is used: O(n) operation */
584 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200585 Py_ssize_t i;
586 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300587 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200588 Py_UCS4 ch;
589
590 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 for (i=0; i < ascii->length; i++)
592 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200593 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200594 if (ch > maxchar)
595 maxchar = ch;
596 }
597 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100598 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200599 CHECK(maxchar >= 128);
600 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100601 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200602 else
Victor Stinner68762572019-10-07 18:42:01 +0200603 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200604 }
Victor Stinner77faf692011-11-20 18:56:05 +0100605 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200606 CHECK(maxchar >= 0x100);
607 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100608 }
609 else {
Victor Stinner68762572019-10-07 18:42:01 +0200610 CHECK(maxchar >= 0x10000);
611 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100612 }
Victor Stinner68762572019-10-07 18:42:01 +0200613 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200614 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400615 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200616
617#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400618}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200619
Victor Stinner910337b2011-10-03 03:20:16 +0200620
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100621static PyObject*
622unicode_result_wchar(PyObject *unicode)
623{
624#ifndef Py_DEBUG
625 Py_ssize_t len;
626
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100627 len = _PyUnicode_WSTR_LENGTH(unicode);
628 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100629 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200630 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 }
632
633 if (len == 1) {
634 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100635 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200637 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 }
639 }
640
641 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200642 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100643 return NULL;
644 }
645#else
Victor Stinneraa771272012-10-04 02:32:58 +0200646 assert(Py_REFCNT(unicode) == 1);
647
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100648 /* don't make the result ready in debug mode to ensure that the caller
649 makes the string ready before using it */
650 assert(_PyUnicode_CheckConsistency(unicode, 1));
651#endif
652 return unicode;
653}
654
655static PyObject*
656unicode_result_ready(PyObject *unicode)
657{
658 Py_ssize_t length;
659
660 length = PyUnicode_GET_LENGTH(unicode);
661 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200662 PyObject *empty = unicode_get_empty();
663 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100664 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200665 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100666 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200667 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100668 }
669
670 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200671 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200672 if (kind == PyUnicode_1BYTE_KIND) {
673 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
674 Py_UCS1 ch = data[0];
675 struct _Py_unicode_state *state = get_unicode_state();
676 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100677 if (latin1_char != NULL) {
678 if (unicode != latin1_char) {
679 Py_INCREF(latin1_char);
680 Py_DECREF(unicode);
681 }
682 return latin1_char;
683 }
684 else {
685 assert(_PyUnicode_CheckConsistency(unicode, 1));
686 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200687 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100688 return unicode;
689 }
690 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200691 else {
692 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
693 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100694 }
695
696 assert(_PyUnicode_CheckConsistency(unicode, 1));
697 return unicode;
698}
699
700static PyObject*
701unicode_result(PyObject *unicode)
702{
703 assert(_PyUnicode_CHECK(unicode));
704 if (PyUnicode_IS_READY(unicode))
705 return unicode_result_ready(unicode);
706 else
707 return unicode_result_wchar(unicode);
708}
709
Victor Stinnerc4b49542011-12-11 22:44:26 +0100710static PyObject*
711unicode_result_unchanged(PyObject *unicode)
712{
713 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500714 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100715 return NULL;
716 Py_INCREF(unicode);
717 return unicode;
718 }
719 else
720 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100721 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100722}
723
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200724/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
725 ASCII, Latin1, UTF-8, etc. */
726static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200727backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200728 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
729{
Victor Stinnerad771582015-10-09 12:38:53 +0200730 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200731 Py_UCS4 ch;
732 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300733 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200734
735 assert(PyUnicode_IS_READY(unicode));
736 kind = PyUnicode_KIND(unicode);
737 data = PyUnicode_DATA(unicode);
738
739 size = 0;
740 /* determine replacement size */
741 for (i = collstart; i < collend; ++i) {
742 Py_ssize_t incr;
743
744 ch = PyUnicode_READ(kind, data, i);
745 if (ch < 0x100)
746 incr = 2+2;
747 else if (ch < 0x10000)
748 incr = 2+4;
749 else {
750 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200751 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200752 }
753 if (size > PY_SSIZE_T_MAX - incr) {
754 PyErr_SetString(PyExc_OverflowError,
755 "encoded result is too long for a Python string");
756 return NULL;
757 }
758 size += incr;
759 }
760
Victor Stinnerad771582015-10-09 12:38:53 +0200761 str = _PyBytesWriter_Prepare(writer, str, size);
762 if (str == NULL)
763 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200764
765 /* generate replacement */
766 for (i = collstart; i < collend; ++i) {
767 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200768 *str++ = '\\';
769 if (ch >= 0x00010000) {
770 *str++ = 'U';
771 *str++ = Py_hexdigits[(ch>>28)&0xf];
772 *str++ = Py_hexdigits[(ch>>24)&0xf];
773 *str++ = Py_hexdigits[(ch>>20)&0xf];
774 *str++ = Py_hexdigits[(ch>>16)&0xf];
775 *str++ = Py_hexdigits[(ch>>12)&0xf];
776 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777 }
Victor Stinner797485e2015-10-09 03:17:30 +0200778 else if (ch >= 0x100) {
779 *str++ = 'u';
780 *str++ = Py_hexdigits[(ch>>12)&0xf];
781 *str++ = Py_hexdigits[(ch>>8)&0xf];
782 }
783 else
784 *str++ = 'x';
785 *str++ = Py_hexdigits[(ch>>4)&0xf];
786 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200787 }
788 return str;
789}
790
791/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
792 ASCII, Latin1, UTF-8, etc. */
793static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200794xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200795 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
796{
Victor Stinnerad771582015-10-09 12:38:53 +0200797 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200798 Py_UCS4 ch;
799 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300800 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200801
802 assert(PyUnicode_IS_READY(unicode));
803 kind = PyUnicode_KIND(unicode);
804 data = PyUnicode_DATA(unicode);
805
806 size = 0;
807 /* determine replacement size */
808 for (i = collstart; i < collend; ++i) {
809 Py_ssize_t incr;
810
811 ch = PyUnicode_READ(kind, data, i);
812 if (ch < 10)
813 incr = 2+1+1;
814 else if (ch < 100)
815 incr = 2+2+1;
816 else if (ch < 1000)
817 incr = 2+3+1;
818 else if (ch < 10000)
819 incr = 2+4+1;
820 else if (ch < 100000)
821 incr = 2+5+1;
822 else if (ch < 1000000)
823 incr = 2+6+1;
824 else {
825 assert(ch <= MAX_UNICODE);
826 incr = 2+7+1;
827 }
828 if (size > PY_SSIZE_T_MAX - incr) {
829 PyErr_SetString(PyExc_OverflowError,
830 "encoded result is too long for a Python string");
831 return NULL;
832 }
833 size += incr;
834 }
835
Victor Stinnerad771582015-10-09 12:38:53 +0200836 str = _PyBytesWriter_Prepare(writer, str, size);
837 if (str == NULL)
838 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200839
840 /* generate replacement */
841 for (i = collstart; i < collend; ++i) {
842 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
843 }
844 return str;
845}
846
Thomas Wouters477c8d52006-05-27 19:21:47 +0000847/* --- Bloom Filters ----------------------------------------------------- */
848
849/* stuff to implement simple "bloom filters" for Unicode characters.
850 to keep things simple, we use a single bitmask, using the least 5
851 bits from each unicode characters as the bit index. */
852
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200853/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000854
Antoine Pitrouf068f942010-01-13 14:19:12 +0000855#if LONG_BIT >= 128
856#define BLOOM_WIDTH 128
857#elif LONG_BIT >= 64
858#define BLOOM_WIDTH 64
859#elif LONG_BIT >= 32
860#define BLOOM_WIDTH 32
861#else
862#error "LONG_BIT is smaller than 32"
863#endif
864
Thomas Wouters477c8d52006-05-27 19:21:47 +0000865#define BLOOM_MASK unsigned long
866
Serhiy Storchaka05997252013-01-26 12:14:02 +0200867static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000868
Antoine Pitrouf068f942010-01-13 14:19:12 +0000869#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000870
Benjamin Peterson29060642009-01-31 22:14:21 +0000871#define BLOOM_LINEBREAK(ch) \
872 ((ch) < 128U ? ascii_linebreak[(ch)] : \
873 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000874
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700875static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300876make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877{
Victor Stinnera85af502013-04-09 21:53:54 +0200878#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
879 do { \
880 TYPE *data = (TYPE *)PTR; \
881 TYPE *end = data + LEN; \
882 Py_UCS4 ch; \
883 for (; data != end; data++) { \
884 ch = *data; \
885 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
886 } \
887 break; \
888 } while (0)
889
Thomas Wouters477c8d52006-05-27 19:21:47 +0000890 /* calculate simple bloom-style bitmask for a given unicode string */
891
Antoine Pitrouf068f942010-01-13 14:19:12 +0000892 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000893
894 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200895 switch (kind) {
896 case PyUnicode_1BYTE_KIND:
897 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
898 break;
899 case PyUnicode_2BYTE_KIND:
900 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
901 break;
902 case PyUnicode_4BYTE_KIND:
903 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
904 break;
905 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700906 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200907 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000908 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200909
910#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000911}
912
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300913static int
914ensure_unicode(PyObject *obj)
915{
916 if (!PyUnicode_Check(obj)) {
917 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200918 "must be str, not %.100s",
919 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300920 return -1;
921 }
922 return PyUnicode_READY(obj);
923}
924
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200925/* Compilation of templated routines */
926
Victor Stinner90ed8a62020-06-24 00:34:07 +0200927#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200928
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200929#include "stringlib/asciilib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
935#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/undef.h"
937
938#include "stringlib/ucs1lib.h"
939#include "stringlib/fastsearch.h"
940#include "stringlib/partition.h"
941#include "stringlib/split.h"
942#include "stringlib/count.h"
943#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300944#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200945#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/undef.h"
947
948#include "stringlib/ucs2lib.h"
949#include "stringlib/fastsearch.h"
950#include "stringlib/partition.h"
951#include "stringlib/split.h"
952#include "stringlib/count.h"
953#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300954#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200955#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200956#include "stringlib/undef.h"
957
958#include "stringlib/ucs4lib.h"
959#include "stringlib/fastsearch.h"
960#include "stringlib/partition.h"
961#include "stringlib/split.h"
962#include "stringlib/count.h"
963#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300964#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200965#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200966#include "stringlib/undef.h"
967
Inada Naoki2c4928d2020-06-17 20:09:44 +0900968_Py_COMP_DIAG_PUSH
969_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970#include "stringlib/unicodedefs.h"
971#include "stringlib/fastsearch.h"
972#include "stringlib/count.h"
973#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100974#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900975_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200976
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200977#undef STRINGLIB_GET_EMPTY
978
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979/* --- Unicode Object ----------------------------------------------------- */
980
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700981static inline Py_ssize_t
982findchar(const void *s, int kind,
983 Py_ssize_t size, Py_UCS4 ch,
984 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 switch (kind) {
987 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200988 if ((Py_UCS1) ch != ch)
989 return -1;
990 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600991 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600993 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200994 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200995 if ((Py_UCS2) ch != ch)
996 return -1;
997 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600998 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001000 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001001 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001004 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001006 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001007 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009}
1010
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters added by a resize, i.e. the range
   [old_length, current length), with 0xff bytes so that a caller that
   forgets to initialize them is detected earlier.  Nothing is written when
   the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings.  U+00FF is invalid in ASCII and U+FFFFFFFF
   is an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind doubles as the size in bytes of one character */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030static PyObject*
1031resize_compact(PyObject *unicode, Py_ssize_t length)
1032{
1033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035 Py_ssize_t new_size;
1036 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001037 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1040#endif
1041
Victor Stinner79891572012-05-03 13:43:07 +02001042 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001044 assert(PyUnicode_IS_COMPACT(unicode));
1045
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001046 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001047 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001048 struct_size = sizeof(PyASCIIObject);
1049 else
1050 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001051 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1054 PyErr_NoMemory();
1055 return NULL;
1056 }
1057 new_size = (struct_size + (length + 1) * char_size);
1058
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001059 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1060 PyObject_DEL(_PyUnicode_UTF8(unicode));
1061 _PyUnicode_UTF8(unicode) = NULL;
1062 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1063 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001064#ifdef Py_REF_DEBUG
1065 _Py_RefTotal--;
1066#endif
1067#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001068 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001069#endif
Victor Stinner84def372011-12-11 20:04:56 +01001070
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001071 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001072 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001073 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 PyErr_NoMemory();
1075 return NULL;
1076 }
Victor Stinner84def372011-12-11 20:04:56 +01001077 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001079
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001081 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001083 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 _PyUnicode_WSTR_LENGTH(unicode) = length;
1085 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001086 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1087 PyObject_DEL(_PyUnicode_WSTR(unicode));
1088 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001089 if (!PyUnicode_IS_ASCII(unicode))
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001091 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001092#ifdef Py_DEBUG
1093 unicode_fill_invalid(unicode, old_length);
1094#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1096 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098 return unicode;
1099}
1100
Alexander Belopolsky40018472011-02-26 01:02:56 +00001101static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103{
Victor Stinner95663112011-10-04 01:03:50 +02001104 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001108
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109 if (PyUnicode_IS_READY(unicode)) {
1110 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001113#ifdef Py_DEBUG
1114 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1115#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116
1117 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001118 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1120 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121
1122 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1123 PyErr_NoMemory();
1124 return -1;
1125 }
1126 new_size = (length + 1) * char_size;
1127
Victor Stinner7a9105a2011-12-12 00:13:42 +01001128 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1129 {
1130 PyObject_DEL(_PyUnicode_UTF8(unicode));
1131 _PyUnicode_UTF8(unicode) = NULL;
1132 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1133 }
1134
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 data = (PyObject *)PyObject_REALLOC(data, new_size);
1136 if (data == NULL) {
1137 PyErr_NoMemory();
1138 return -1;
1139 }
1140 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001141 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001142 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001143 _PyUnicode_WSTR_LENGTH(unicode) = length;
1144 }
1145 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001146 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_UTF8_LENGTH(unicode) = length;
1148 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 _PyUnicode_LENGTH(unicode) = length;
1150 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001151#ifdef Py_DEBUG
1152 unicode_fill_invalid(unicode, old_length);
1153#endif
Victor Stinner95663112011-10-04 01:03:50 +02001154 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001155 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001158 }
Victor Stinner95663112011-10-04 01:03:50 +02001159 assert(_PyUnicode_WSTR(unicode) != NULL);
1160
1161 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001162 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001166 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001167 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001169 if (!wstr) {
1170 PyErr_NoMemory();
1171 return -1;
1172 }
1173 _PyUnicode_WSTR(unicode) = wstr;
1174 _PyUnicode_WSTR(unicode)[length] = 0;
1175 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001176 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return 0;
1178}
1179
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180static PyObject*
1181resize_copy(PyObject *unicode, Py_ssize_t length)
1182{
1183 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001184 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001187 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188
1189 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1190 if (copy == NULL)
1191 return NULL;
1192
1193 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001194 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001196 }
1197 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001198 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001199
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001200 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 if (w == NULL)
1202 return NULL;
1203 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1204 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001205 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001206 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001207 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 }
1209}
1210
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001212 Ux0000 terminated; some code (e.g. new_identifier)
1213 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214
1215 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218*/
1219
Alexander Belopolsky40018472011-02-26 01:02:56 +00001220static PyUnicodeObject *
1221_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001223 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
Thomas Wouters477c8d52006-05-27 19:21:47 +00001226 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001227 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001228 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 }
1230
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001231 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001232 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001233 return (PyUnicodeObject *)PyErr_NoMemory();
1234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 if (length < 0) {
1236 PyErr_SetString(PyExc_SystemError,
1237 "Negative size passed to _PyUnicode_New");
1238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 }
1240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1242 if (unicode == NULL)
1243 return NULL;
1244 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001245
1246 _PyUnicode_WSTR_LENGTH(unicode) = length;
1247 _PyUnicode_HASH(unicode) = -1;
1248 _PyUnicode_STATE(unicode).interned = 0;
1249 _PyUnicode_STATE(unicode).kind = 0;
1250 _PyUnicode_STATE(unicode).compact = 0;
1251 _PyUnicode_STATE(unicode).ready = 0;
1252 _PyUnicode_STATE(unicode).ascii = 0;
1253 _PyUnicode_DATA_ANY(unicode) = NULL;
1254 _PyUnicode_LENGTH(unicode) = 0;
1255 _PyUnicode_UTF8(unicode) = NULL;
1256 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1259 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001260 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001261 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001262 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264
Jeremy Hyltond8082792003-09-16 19:41:39 +00001265 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001266 * the caller fails before initializing str -- unicode_resize()
1267 * reads str[0], and the Keep-Alive optimization can keep memory
1268 * allocated for str alive across a call to unicode_dealloc(unicode).
1269 * We don't want unicode_resize to read uninitialized memory in
1270 * that case.
1271 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 _PyUnicode_WSTR(unicode)[0] = 0;
1273 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001274
Victor Stinner7931d9a2011-11-04 00:22:48 +01001275 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 return unicode;
1277}
1278
Victor Stinnerf42dc442011-10-02 23:33:16 +02001279static const char*
1280unicode_kind_name(PyObject *unicode)
1281{
Victor Stinner42dfd712011-10-03 14:41:45 +02001282 /* don't check consistency: unicode_kind_name() is called from
1283 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 if (!PyUnicode_IS_COMPACT(unicode))
1285 {
1286 if (!PyUnicode_IS_READY(unicode))
1287 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001288 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 {
1290 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001291 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001292 return "legacy ascii";
1293 else
1294 return "legacy latin1";
1295 case PyUnicode_2BYTE_KIND:
1296 return "legacy UCS2";
1297 case PyUnicode_4BYTE_KIND:
1298 return "legacy UCS4";
1299 default:
1300 return "<legacy invalid kind>";
1301 }
1302 }
1303 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001304 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001305 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001306 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001307 return "ascii";
1308 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001309 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001310 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001311 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001312 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001313 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001314 default:
1315 return "<invalid compact kind>";
1316 }
1317}
1318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001321const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001322 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001323 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324}
1325
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001326const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001327 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 return _PyUnicode_COMPACT_DATA(unicode);
1329}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001330const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001331 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001332 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1334 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1335 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1336 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1337 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1338 return PyUnicode_DATA(unicode);
1339}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001340
1341void
1342_PyUnicode_Dump(PyObject *op)
1343{
1344 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001345 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1346 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001347 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001348
Victor Stinnera849a4b2011-10-03 12:12:11 +02001349 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001350 {
1351 if (ascii->state.ascii)
1352 data = (ascii + 1);
1353 else
1354 data = (compact + 1);
1355 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001356 else
1357 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001358 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001359
Victor Stinnera849a4b2011-10-03 12:12:11 +02001360 if (ascii->wstr == data)
1361 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001362 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001363
Victor Stinnera3b334d2011-10-03 13:53:37 +02001364 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001365 printf(" (%zu), ", compact->wstr_length);
1366 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001367 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001368 }
1369 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001370 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001371 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001372}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373#endif
1374
Victor Stinner91698d82020-06-25 14:07:40 +02001375static int
1376unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1377{
1378 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1379 // optimized to always use state->empty_string without having to check if
1380 // it is NULL or not.
1381 PyObject *empty = PyUnicode_New(1, 0);
1382 if (empty == NULL) {
1383 return -1;
1384 }
1385 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1386 _PyUnicode_LENGTH(empty) = 0;
1387 assert(_PyUnicode_CheckConsistency(empty, 1));
1388
1389 assert(state->empty_string == NULL);
1390 state->empty_string = empty;
1391 return 0;
1392}
1393
1394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395PyObject *
1396PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1397{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001398 /* Optimization for empty strings */
1399 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001400 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001401 }
1402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 PyObject *obj;
1404 PyCompactUnicodeObject *unicode;
1405 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001406 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001407 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 Py_ssize_t char_size;
1409 Py_ssize_t struct_size;
1410
Victor Stinner9e9d6892011-10-04 01:02:02 +02001411 is_ascii = 0;
1412 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 struct_size = sizeof(PyCompactUnicodeObject);
1414 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001415 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 char_size = 1;
1417 is_ascii = 1;
1418 struct_size = sizeof(PyASCIIObject);
1419 }
1420 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001421 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 char_size = 1;
1423 }
1424 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001425 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 char_size = 2;
1427 if (sizeof(wchar_t) == 2)
1428 is_sharing = 1;
1429 }
1430 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001431 if (maxchar > MAX_UNICODE) {
1432 PyErr_SetString(PyExc_SystemError,
1433 "invalid maximum character passed to PyUnicode_New");
1434 return NULL;
1435 }
Victor Stinner8f825062012-04-27 13:55:39 +02001436 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 char_size = 4;
1438 if (sizeof(wchar_t) == 4)
1439 is_sharing = 1;
1440 }
1441
1442 /* Ensure we won't overflow the size. */
1443 if (size < 0) {
1444 PyErr_SetString(PyExc_SystemError,
1445 "Negative size passed to PyUnicode_New");
1446 return NULL;
1447 }
1448 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1449 return PyErr_NoMemory();
1450
1451 /* Duplicated allocation code from _PyObject_New() instead of a call to
1452 * PyObject_New() so we are able to allocate space for the object and
1453 * it's data buffer.
1454 */
1455 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001456 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001458 }
1459 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460
1461 unicode = (PyCompactUnicodeObject *)obj;
1462 if (is_ascii)
1463 data = ((PyASCIIObject*)obj) + 1;
1464 else
1465 data = unicode + 1;
1466 _PyUnicode_LENGTH(unicode) = size;
1467 _PyUnicode_HASH(unicode) = -1;
1468 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001469 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 _PyUnicode_STATE(unicode).compact = 1;
1471 _PyUnicode_STATE(unicode).ready = 1;
1472 _PyUnicode_STATE(unicode).ascii = is_ascii;
1473 if (is_ascii) {
1474 ((char*)data)[size] = 0;
1475 _PyUnicode_WSTR(unicode) = NULL;
1476 }
Victor Stinner8f825062012-04-27 13:55:39 +02001477 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 ((char*)data)[size] = 0;
1479 _PyUnicode_WSTR(unicode) = NULL;
1480 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001482 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 else {
1485 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001486 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001487 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001489 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 ((Py_UCS4*)data)[size] = 0;
1491 if (is_sharing) {
1492 _PyUnicode_WSTR_LENGTH(unicode) = size;
1493 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1494 }
1495 else {
1496 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1497 _PyUnicode_WSTR(unicode) = NULL;
1498 }
1499 }
Victor Stinner8f825062012-04-27 13:55:39 +02001500#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001501 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001502#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001503 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 return obj;
1505}
1506
1507#if SIZEOF_WCHAR_T == 2
1508/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1509 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001510 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511
1512 This function assumes that unicode can hold one more code point than wstr
1513 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001514static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001516 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517{
1518 const wchar_t *iter;
1519 Py_UCS4 *ucs4_out;
1520
Victor Stinner910337b2011-10-03 03:20:16 +02001521 assert(unicode != NULL);
1522 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1524 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1525
1526 for (iter = begin; iter < end; ) {
1527 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1528 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001529 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1530 && (iter+1) < end
1531 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532 {
Victor Stinner551ac952011-11-29 22:58:13 +01001533 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 iter += 2;
1535 }
1536 else {
1537 *ucs4_out++ = *iter;
1538 iter++;
1539 }
1540 }
1541 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1542 _PyUnicode_GET_LENGTH(unicode)));
1543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544}
1545#endif
1546
Victor Stinnercd9950f2011-10-02 00:34:53 +02001547static int
Victor Stinner488fa492011-12-12 00:01:39 +01001548unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001549{
Victor Stinner488fa492011-12-12 00:01:39 +01001550 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001551 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001552 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001553 return -1;
1554 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001555 return 0;
1556}
1557
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558static int
1559_copy_characters(PyObject *to, Py_ssize_t to_start,
1560 PyObject *from, Py_ssize_t from_start,
1561 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001563 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001564 const void *from_data;
1565 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566
Victor Stinneree4544c2012-05-09 22:24:08 +02001567 assert(0 <= how_many);
1568 assert(0 <= from_start);
1569 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001571 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001572 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573
Victor Stinnerd3f08822012-05-29 12:57:52 +02001574 assert(PyUnicode_Check(to));
1575 assert(PyUnicode_IS_READY(to));
1576 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1577
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001578 if (how_many == 0)
1579 return 0;
1580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001582 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001584 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585
Victor Stinnerf1852262012-06-16 16:38:26 +02001586#ifdef Py_DEBUG
1587 if (!check_maxchar
1588 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1589 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001590 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001591 Py_UCS4 ch;
1592 Py_ssize_t i;
1593 for (i=0; i < how_many; i++) {
1594 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1595 assert(ch <= to_maxchar);
1596 }
1597 }
1598#endif
1599
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001600 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001601 if (check_maxchar
1602 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1603 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001604 /* Writing Latin-1 characters into an ASCII string requires to
1605 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001606 Py_UCS4 max_char;
1607 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001608 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001609 if (max_char >= 128)
1610 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001611 }
Christian Heimesf051e432016-09-13 20:22:02 +02001612 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001613 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001614 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001615 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001616 else if (from_kind == PyUnicode_1BYTE_KIND
1617 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001618 {
1619 _PyUnicode_CONVERT_BYTES(
1620 Py_UCS1, Py_UCS2,
1621 PyUnicode_1BYTE_DATA(from) + from_start,
1622 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1623 PyUnicode_2BYTE_DATA(to) + to_start
1624 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001625 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001626 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001627 && to_kind == PyUnicode_4BYTE_KIND)
1628 {
1629 _PyUnicode_CONVERT_BYTES(
1630 Py_UCS1, Py_UCS4,
1631 PyUnicode_1BYTE_DATA(from) + from_start,
1632 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1633 PyUnicode_4BYTE_DATA(to) + to_start
1634 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001635 }
1636 else if (from_kind == PyUnicode_2BYTE_KIND
1637 && to_kind == PyUnicode_4BYTE_KIND)
1638 {
1639 _PyUnicode_CONVERT_BYTES(
1640 Py_UCS2, Py_UCS4,
1641 PyUnicode_2BYTE_DATA(from) + from_start,
1642 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1643 PyUnicode_4BYTE_DATA(to) + to_start
1644 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001645 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001646 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001647 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1648
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001649 if (!check_maxchar) {
1650 if (from_kind == PyUnicode_2BYTE_KIND
1651 && to_kind == PyUnicode_1BYTE_KIND)
1652 {
1653 _PyUnicode_CONVERT_BYTES(
1654 Py_UCS2, Py_UCS1,
1655 PyUnicode_2BYTE_DATA(from) + from_start,
1656 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1657 PyUnicode_1BYTE_DATA(to) + to_start
1658 );
1659 }
1660 else if (from_kind == PyUnicode_4BYTE_KIND
1661 && to_kind == PyUnicode_1BYTE_KIND)
1662 {
1663 _PyUnicode_CONVERT_BYTES(
1664 Py_UCS4, Py_UCS1,
1665 PyUnicode_4BYTE_DATA(from) + from_start,
1666 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1667 PyUnicode_1BYTE_DATA(to) + to_start
1668 );
1669 }
1670 else if (from_kind == PyUnicode_4BYTE_KIND
1671 && to_kind == PyUnicode_2BYTE_KIND)
1672 {
1673 _PyUnicode_CONVERT_BYTES(
1674 Py_UCS4, Py_UCS2,
1675 PyUnicode_4BYTE_DATA(from) + from_start,
1676 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1677 PyUnicode_2BYTE_DATA(to) + to_start
1678 );
1679 }
1680 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001681 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001682 }
1683 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001684 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001685 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001686 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001687 Py_ssize_t i;
1688
Victor Stinnera0702ab2011-09-29 14:14:38 +02001689 for (i=0; i < how_many; i++) {
1690 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001691 if (ch > to_maxchar)
1692 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001693 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1694 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001695 }
1696 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 return 0;
1698}
1699
Victor Stinnerd3f08822012-05-29 12:57:52 +02001700void
1701_PyUnicode_FastCopyCharacters(
1702 PyObject *to, Py_ssize_t to_start,
1703 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001704{
1705 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1706}
1707
1708Py_ssize_t
1709PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1710 PyObject *from, Py_ssize_t from_start,
1711 Py_ssize_t how_many)
1712{
1713 int err;
1714
1715 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1716 PyErr_BadInternalCall();
1717 return -1;
1718 }
1719
Benjamin Petersonbac79492012-01-14 13:34:47 -05001720 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001721 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001722 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001723 return -1;
1724
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001725 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001726 PyErr_SetString(PyExc_IndexError, "string index out of range");
1727 return -1;
1728 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001729 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001730 PyErr_SetString(PyExc_IndexError, "string index out of range");
1731 return -1;
1732 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001733 if (how_many < 0) {
1734 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1735 return -1;
1736 }
1737 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001738 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1739 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001740 "Cannot write %zi characters at %zi "
1741 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001742 how_many, to_start, PyUnicode_GET_LENGTH(to));
1743 return -1;
1744 }
1745
1746 if (how_many == 0)
1747 return 0;
1748
Victor Stinner488fa492011-12-12 00:01:39 +01001749 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001750 return -1;
1751
1752 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1753 if (err) {
1754 PyErr_Format(PyExc_SystemError,
1755 "Cannot copy %s characters "
1756 "into a string of %s characters",
1757 unicode_kind_name(from),
1758 unicode_kind_name(to));
1759 return -1;
1760 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001761 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762}
1763
Victor Stinner17222162011-09-28 22:15:37 +02001764/* Find the maximum code point and count the number of surrogate pairs so a
1765 correct string length can be computed before converting a string to UCS4.
1766 This function counts single surrogates as a character and not as a pair.
1767
1768 Return 0 on success, or -1 on error. */
1769static int
1770find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1771 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772{
1773 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001774 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775
Victor Stinnerc53be962011-10-02 21:33:54 +02001776 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 *num_surrogates = 0;
1778 *maxchar = 0;
1779
1780 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001782 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1783 && (iter+1) < end
1784 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1785 {
1786 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1787 ++(*num_surrogates);
1788 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 }
1790 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001792 {
1793 ch = *iter;
1794 iter++;
1795 }
1796 if (ch > *maxchar) {
1797 *maxchar = ch;
1798 if (*maxchar > MAX_UNICODE) {
1799 PyErr_Format(PyExc_ValueError,
1800 "character U+%x is not in range [U+0000; U+10ffff]",
1801 ch);
1802 return -1;
1803 }
1804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 }
1806 return 0;
1807}
1808
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001809int
1810_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811{
1812 wchar_t *end;
1813 Py_UCS4 maxchar = 0;
1814 Py_ssize_t num_surrogates;
1815#if SIZEOF_WCHAR_T == 2
1816 Py_ssize_t length_wo_surrogates;
1817#endif
1818
Georg Brandl7597add2011-10-05 16:36:47 +02001819 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 strings were created using _PyObject_New() and where no canonical
1821 representation (the str field) has been set yet aka strings
1822 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001823 assert(_PyUnicode_CHECK(unicode));
1824 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001826 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001827 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001828 /* Actually, it should neither be interned nor be anything else: */
1829 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001832 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001833 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835
1836 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1838 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 PyErr_NoMemory();
1840 return -1;
1841 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001842 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843 _PyUnicode_WSTR(unicode), end,
1844 PyUnicode_1BYTE_DATA(unicode));
1845 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1846 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1847 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1848 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001849 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001850 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001851 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001852 }
1853 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001854 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001855 _PyUnicode_UTF8(unicode) = NULL;
1856 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 }
1858 PyObject_FREE(_PyUnicode_WSTR(unicode));
1859 _PyUnicode_WSTR(unicode) = NULL;
1860 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1861 }
1862 /* In this case we might have to convert down from 4-byte native
1863 wchar_t to 2-byte unicode. */
1864 else if (maxchar < 65536) {
1865 assert(num_surrogates == 0 &&
1866 "FindMaxCharAndNumSurrogatePairs() messed up");
1867
Victor Stinner506f5922011-09-28 22:34:18 +02001868#if SIZEOF_WCHAR_T == 2
1869 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001870 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001871 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1872 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1873 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001874 _PyUnicode_UTF8(unicode) = NULL;
1875 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001876#else
1877 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001878 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001879 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001880 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001881 PyErr_NoMemory();
1882 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 }
Victor Stinner506f5922011-09-28 22:34:18 +02001884 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1885 _PyUnicode_WSTR(unicode), end,
1886 PyUnicode_2BYTE_DATA(unicode));
1887 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1888 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1889 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001890 _PyUnicode_UTF8(unicode) = NULL;
1891 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001892 PyObject_FREE(_PyUnicode_WSTR(unicode));
1893 _PyUnicode_WSTR(unicode) = NULL;
1894 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1895#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 }
1897 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1898 else {
1899#if SIZEOF_WCHAR_T == 2
1900 /* in case the native representation is 2-bytes, we need to allocate a
1901 new normalized 4-byte version. */
1902 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001903 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1904 PyErr_NoMemory();
1905 return -1;
1906 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001907 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1908 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 PyErr_NoMemory();
1910 return -1;
1911 }
1912 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1913 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001914 _PyUnicode_UTF8(unicode) = NULL;
1915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001916 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1917 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001918 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 PyObject_FREE(_PyUnicode_WSTR(unicode));
1920 _PyUnicode_WSTR(unicode) = NULL;
1921 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1922#else
1923 assert(num_surrogates == 0);
1924
Victor Stinnerc3c74152011-10-02 20:39:55 +02001925 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001927 _PyUnicode_UTF8(unicode) = NULL;
1928 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1930#endif
1931 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1932 }
1933 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001934 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 return 0;
1936}
1937
Alexander Belopolsky40018472011-02-26 01:02:56 +00001938static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001939unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940{
Walter Dörwald16807132007-05-25 13:52:07 +00001941 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 case SSTATE_NOT_INTERNED:
1943 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001944
Benjamin Peterson29060642009-01-31 22:14:21 +00001945 case SSTATE_INTERNED_MORTAL:
Victor Stinner607b1022020-05-05 18:50:30 +02001946#ifdef INTERNED_STRINGS
Victor Stinner3549ca32020-07-03 16:59:12 +02001947 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1948 references (key and value) which were ignored by
1949 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1950 to prevent calling unicode_dealloc() again. Adjust refcnt after
1951 PyDict_DelItem(). */
1952 assert(Py_REFCNT(unicode) == 0);
1953 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001954 if (PyDict_DelItem(interned, unicode) != 0) {
1955 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1956 NULL);
1957 }
Victor Stinner3549ca32020-07-03 16:59:12 +02001958 assert(Py_REFCNT(unicode) == 1);
1959 Py_SET_REFCNT(unicode, 0);
Victor Stinner607b1022020-05-05 18:50:30 +02001960#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001961 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001962
Benjamin Peterson29060642009-01-31 22:14:21 +00001963 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1965 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001966
Benjamin Peterson29060642009-01-31 22:14:21 +00001967 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001968 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001969 }
1970
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001971 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001973 }
1974 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001975 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001976 }
1977 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001978 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001979 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001981 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982}
1983
#ifdef Py_DEBUG
/* Debug-build helper: return 1 if `unicode` is one of the shared objects
   stored in the unicode state -- the empty string, or the cached
   one-character string for a code point below 256 -- and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *cache = get_unicode_state();
    PyASCIIObject *header = (PyASCIIObject *)unicode;

    if (cache->empty_string == unicode) {
        return 1;
    }

    /* Only a one-character string that is not in the wchar_t
       representation can match a latin1 cache entry. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }

    Py_UCS4 code = PyUnicode_READ_CHAR(unicode, 0);
    return (code < 256 && cache->latin1[code] == unicode) ? 1 : 0;
}
#endif
2003
Alexander Belopolsky40018472011-02-26 01:02:56 +00002004static int
Victor Stinner488fa492011-12-12 00:01:39 +01002005unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002006{
Victor Stinner488fa492011-12-12 00:01:39 +01002007 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002008 if (Py_REFCNT(unicode) != 1)
2009 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002010 if (_PyUnicode_HASH(unicode) != -1)
2011 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002012 if (PyUnicode_CHECK_INTERNED(unicode))
2013 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002014 if (!PyUnicode_CheckExact(unicode))
2015 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002016#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002017 /* singleton refcount is greater than 1 */
2018 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002019#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002020 return 1;
2021}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002022
Victor Stinnerfe226c02011-10-03 03:52:20 +02002023static int
2024unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2025{
2026 PyObject *unicode;
2027 Py_ssize_t old_length;
2028
2029 assert(p_unicode != NULL);
2030 unicode = *p_unicode;
2031
2032 assert(unicode != NULL);
2033 assert(PyUnicode_Check(unicode));
2034 assert(0 <= length);
2035
Victor Stinner910337b2011-10-03 03:20:16 +02002036 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002037 old_length = PyUnicode_WSTR_LENGTH(unicode);
2038 else
2039 old_length = PyUnicode_GET_LENGTH(unicode);
2040 if (old_length == length)
2041 return 0;
2042
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002043 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002044 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002045 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002046 return 0;
2047 }
2048
Victor Stinner488fa492011-12-12 00:01:39 +01002049 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002050 PyObject *copy = resize_copy(unicode, length);
2051 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002052 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002053 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002054 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002055 }
2056
Victor Stinnerfe226c02011-10-03 03:52:20 +02002057 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002058 PyObject *new_unicode = resize_compact(unicode, length);
2059 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002060 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002061 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002062 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002063 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002064 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002065}
2066
Alexander Belopolsky40018472011-02-26 01:02:56 +00002067int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002068PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002069{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002070 PyObject *unicode;
2071 if (p_unicode == NULL) {
2072 PyErr_BadInternalCall();
2073 return -1;
2074 }
2075 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002076 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002077 {
2078 PyErr_BadInternalCall();
2079 return -1;
2080 }
2081 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002082}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002083
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002084/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002085
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002086 WARNING: The function doesn't copy the terminating null character and
2087 doesn't check the maximum character (may write a latin1 character in an
2088 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002089static void
2090unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2091 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002092{
2093 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002094 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002095 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002096
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002097 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002098 switch (kind) {
2099 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002100#ifdef Py_DEBUG
2101 if (PyUnicode_IS_ASCII(unicode)) {
2102 Py_UCS4 maxchar = ucs1lib_find_max_char(
2103 (const Py_UCS1*)str,
2104 (const Py_UCS1*)str + len);
2105 assert(maxchar < 128);
2106 }
2107#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002108 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002109 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002110 }
2111 case PyUnicode_2BYTE_KIND: {
2112 Py_UCS2 *start = (Py_UCS2 *)data + index;
2113 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002114
Victor Stinner184252a2012-06-16 02:57:41 +02002115 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002116 *ucs2 = (Py_UCS2)*str;
2117
2118 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002119 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002120 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002121 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002122 Py_UCS4 *start = (Py_UCS4 *)data + index;
2123 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002124
Victor Stinner184252a2012-06-16 02:57:41 +02002125 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002126 *ucs4 = (Py_UCS4)*str;
2127
2128 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002129 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002130 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002131 default:
2132 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002133 }
2134}
2135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002137get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002139 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002140
Victor Stinner2f9ada92020-06-24 02:22:21 +02002141 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002142 if (unicode) {
2143 Py_INCREF(unicode);
2144 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 }
Victor Stinner607b1022020-05-05 18:50:30 +02002146
2147 unicode = PyUnicode_New(1, ch);
2148 if (!unicode) {
2149 return NULL;
2150 }
2151
2152 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2153 assert(_PyUnicode_CheckConsistency(unicode, 1));
2154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002156 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002157 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158}
2159
Victor Stinner985a82a2014-01-03 12:53:47 +01002160static PyObject*
2161unicode_char(Py_UCS4 ch)
2162{
2163 PyObject *unicode;
2164
2165 assert(ch <= MAX_UNICODE);
2166
Victor Stinner2f9ada92020-06-24 02:22:21 +02002167 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002168 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002169 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002170
Victor Stinner985a82a2014-01-03 12:53:47 +01002171 unicode = PyUnicode_New(1, ch);
2172 if (unicode == NULL)
2173 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002174
2175 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2176 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002177 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002178 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002179 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2180 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2181 }
2182 assert(_PyUnicode_CheckConsistency(unicode, 1));
2183 return unicode;
2184}
2185
Alexander Belopolsky40018472011-02-26 01:02:56 +00002186PyObject *
2187PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002189 if (u == NULL) {
2190 if (size > 0) {
2191 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2192 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2193 "use PyUnicode_New() instead", 1) < 0) {
2194 return NULL;
2195 }
2196 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002197 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002198 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002199
2200 if (size < 0) {
2201 PyErr_BadInternalCall();
2202 return NULL;
2203 }
2204
2205 return PyUnicode_FromWideChar(u, size);
2206}
2207
2208PyObject *
2209PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2210{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002211 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 Py_UCS4 maxchar = 0;
2213 Py_ssize_t num_surrogates;
2214
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002215 if (u == NULL && size != 0) {
2216 PyErr_BadInternalCall();
2217 return NULL;
2218 }
2219
2220 if (size == -1) {
2221 size = wcslen(u);
2222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002224 /* If the Unicode data is known at construction time, we can apply
2225 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002228 if (size == 0)
2229 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 /* Single character Unicode objects in the Latin-1 range are
2232 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002233 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 return get_latin1_char((unsigned char)*u);
2235
2236 /* If not empty and not single character, copy the Unicode data
2237 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002238 if (find_maxchar_surrogates(u, u + size,
2239 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 return NULL;
2241
Victor Stinner8faf8212011-12-08 22:14:11 +01002242 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243 if (!unicode)
2244 return NULL;
2245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 switch (PyUnicode_KIND(unicode)) {
2247 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002248 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2250 break;
2251 case PyUnicode_2BYTE_KIND:
2252#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002253 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002255 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2257#endif
2258 break;
2259 case PyUnicode_4BYTE_KIND:
2260#if SIZEOF_WCHAR_T == 2
2261 /* This is the only case which has to process surrogates, thus
2262 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002263 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264#else
2265 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002266 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267#endif
2268 break;
2269 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002270 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002273 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274}
2275
Alexander Belopolsky40018472011-02-26 01:02:56 +00002276PyObject *
2277PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002278{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002279 if (size < 0) {
2280 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002281 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 return NULL;
2283 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002284 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002285 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002286 }
2287 else {
2288 if (size > 0) {
2289 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2290 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2291 "use PyUnicode_New() instead", 1) < 0) {
2292 return NULL;
2293 }
2294 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002295 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002296 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002297}
2298
Alexander Belopolsky40018472011-02-26 01:02:56 +00002299PyObject *
2300PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002301{
2302 size_t size = strlen(u);
2303 if (size > PY_SSIZE_T_MAX) {
2304 PyErr_SetString(PyExc_OverflowError, "input too long");
2305 return NULL;
2306 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002307 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002308}
2309
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002310PyObject *
2311_PyUnicode_FromId(_Py_Identifier *id)
2312{
Victor Stinner297257f2020-06-02 14:39:45 +02002313 if (id->object) {
2314 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002315 }
Victor Stinner297257f2020-06-02 14:39:45 +02002316
2317 PyObject *obj;
2318 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2319 strlen(id->string),
2320 NULL, NULL);
2321 if (!obj) {
2322 return NULL;
2323 }
2324 PyUnicode_InternInPlace(&obj);
2325
2326 assert(!id->next);
2327 id->object = obj;
2328 id->next = static_strings;
2329 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002330 return id->object;
2331}
2332
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002333static void
2334unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002335{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002336 _Py_Identifier *tmp, *s = static_strings;
2337 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002338 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002339 tmp = s->next;
2340 s->next = NULL;
2341 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002342 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002343 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002344}
2345
Benjamin Peterson0df54292012-03-26 14:50:32 -04002346/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002347
Victor Stinnerd3f08822012-05-29 12:57:52 +02002348PyObject*
2349_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002350{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002351 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002352 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002353 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002354#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002355 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002356#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002357 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002358 }
Victor Stinner785938e2011-12-11 20:09:03 +01002359 unicode = PyUnicode_New(size, 127);
2360 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002361 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002362 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2363 assert(_PyUnicode_CheckConsistency(unicode, 1));
2364 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002365}
2366
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002367static Py_UCS4
2368kind_maxchar_limit(unsigned int kind)
2369{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002370 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002371 case PyUnicode_1BYTE_KIND:
2372 return 0x80;
2373 case PyUnicode_2BYTE_KIND:
2374 return 0x100;
2375 case PyUnicode_4BYTE_KIND:
2376 return 0x10000;
2377 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002378 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002379 }
2380}
2381
Victor Stinner702c7342011-10-05 13:50:52 +02002382static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002383_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002386 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002387
Victor Stinner2f9ada92020-06-24 02:22:21 +02002388 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002389 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002390 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002391 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002392 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002393 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002394 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002395
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002396 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002397 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 if (!res)
2399 return NULL;
2400 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002401 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002403}
2404
Victor Stinnere57b1c02011-09-28 22:20:48 +02002405static PyObject*
2406_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407{
2408 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002409 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002410
Serhiy Storchaka678db842013-01-26 12:16:36 +02002411 if (size == 0)
2412 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002413 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002414 if (size == 1)
2415 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002416
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002417 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002418 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 if (!res)
2420 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002421 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002423 else {
2424 _PyUnicode_CONVERT_BYTES(
2425 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2426 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002427 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 return res;
2429}
2430
Victor Stinnere57b1c02011-09-28 22:20:48 +02002431static PyObject*
2432_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433{
2434 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002435 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002436
Serhiy Storchaka678db842013-01-26 12:16:36 +02002437 if (size == 0)
2438 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002439 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002440 if (size == 1)
2441 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002442
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002443 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002444 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 if (!res)
2446 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002447 if (max_char < 256)
2448 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2449 PyUnicode_1BYTE_DATA(res));
2450 else if (max_char < 0x10000)
2451 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2452 PyUnicode_2BYTE_DATA(res));
2453 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002455 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 return res;
2457}
2458
2459PyObject*
2460PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2461{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002462 if (size < 0) {
2463 PyErr_SetString(PyExc_ValueError, "size must be positive");
2464 return NULL;
2465 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002466 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002468 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002470 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002472 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002473 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002474 PyErr_SetString(PyExc_SystemError, "invalid kind");
2475 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477}
2478
Victor Stinnerece58de2012-04-23 23:36:38 +02002479Py_UCS4
2480_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2481{
2482 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002483 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002484
2485 assert(PyUnicode_IS_READY(unicode));
2486 assert(0 <= start);
2487 assert(end <= PyUnicode_GET_LENGTH(unicode));
2488 assert(start <= end);
2489
2490 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2491 return PyUnicode_MAX_CHAR_VALUE(unicode);
2492
2493 if (start == end)
2494 return 127;
2495
Victor Stinner94d558b2012-04-27 22:26:58 +02002496 if (PyUnicode_IS_ASCII(unicode))
2497 return 127;
2498
Victor Stinnerece58de2012-04-23 23:36:38 +02002499 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002500 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002501 endptr = (char *)startptr + end * kind;
2502 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002503 switch(kind) {
2504 case PyUnicode_1BYTE_KIND:
2505 return ucs1lib_find_max_char(startptr, endptr);
2506 case PyUnicode_2BYTE_KIND:
2507 return ucs2lib_find_max_char(startptr, endptr);
2508 case PyUnicode_4BYTE_KIND:
2509 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002510 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002511 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002512 }
2513}
2514
Victor Stinner25a4b292011-10-06 12:31:55 +02002515/* Ensure that a string uses the most efficient storage, if it is not the
2516 case: create a new string with of the right kind. Write NULL into *p_unicode
2517 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002518static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002519unicode_adjust_maxchar(PyObject **p_unicode)
2520{
2521 PyObject *unicode, *copy;
2522 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002523 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002524 unsigned int kind;
2525
2526 assert(p_unicode != NULL);
2527 unicode = *p_unicode;
2528 assert(PyUnicode_IS_READY(unicode));
2529 if (PyUnicode_IS_ASCII(unicode))
2530 return;
2531
2532 len = PyUnicode_GET_LENGTH(unicode);
2533 kind = PyUnicode_KIND(unicode);
2534 if (kind == PyUnicode_1BYTE_KIND) {
2535 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002536 max_char = ucs1lib_find_max_char(u, u + len);
2537 if (max_char >= 128)
2538 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002539 }
2540 else if (kind == PyUnicode_2BYTE_KIND) {
2541 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002542 max_char = ucs2lib_find_max_char(u, u + len);
2543 if (max_char >= 256)
2544 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002545 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002546 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002547 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002548 max_char = ucs4lib_find_max_char(u, u + len);
2549 if (max_char >= 0x10000)
2550 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002551 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002552 else
2553 Py_UNREACHABLE();
2554
Victor Stinner25a4b292011-10-06 12:31:55 +02002555 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002556 if (copy != NULL)
2557 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002558 Py_DECREF(unicode);
2559 *p_unicode = copy;
2560}
2561
Victor Stinner034f6cf2011-09-30 02:26:44 +02002562PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002563_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002564{
Victor Stinner87af4f22011-11-21 23:03:47 +01002565 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002566 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002567
Victor Stinner034f6cf2011-09-30 02:26:44 +02002568 if (!PyUnicode_Check(unicode)) {
2569 PyErr_BadInternalCall();
2570 return NULL;
2571 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002572 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002573 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002574
Victor Stinner87af4f22011-11-21 23:03:47 +01002575 length = PyUnicode_GET_LENGTH(unicode);
2576 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002577 if (!copy)
2578 return NULL;
2579 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2580
Christian Heimesf051e432016-09-13 20:22:02 +02002581 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002582 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002583 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002584 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002585}
2586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587
Victor Stinnerbc603d12011-10-02 01:00:40 +02002588/* Widen Unicode objects to larger buffers. Don't write terminating null
2589 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002591static void*
2592unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002594 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002595
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002596 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002597 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002598 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002599 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002600 if (!result)
2601 return PyErr_NoMemory();
2602 assert(skind == PyUnicode_1BYTE_KIND);
2603 _PyUnicode_CONVERT_BYTES(
2604 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002605 (const Py_UCS1 *)data,
2606 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002607 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002609 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002610 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002611 if (!result)
2612 return PyErr_NoMemory();
2613 if (skind == PyUnicode_2BYTE_KIND) {
2614 _PyUnicode_CONVERT_BYTES(
2615 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002616 (const Py_UCS2 *)data,
2617 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002618 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002620 else {
2621 assert(skind == PyUnicode_1BYTE_KIND);
2622 _PyUnicode_CONVERT_BYTES(
2623 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002624 (const Py_UCS1 *)data,
2625 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002626 result);
2627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002629 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002630 Py_UNREACHABLE();
2631 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633}
2634
2635static Py_UCS4*
2636as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2637 int copy_null)
2638{
2639 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002640 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641 Py_ssize_t len, targetlen;
2642 if (PyUnicode_READY(string) == -1)
2643 return NULL;
2644 kind = PyUnicode_KIND(string);
2645 data = PyUnicode_DATA(string);
2646 len = PyUnicode_GET_LENGTH(string);
2647 targetlen = len;
2648 if (copy_null)
2649 targetlen++;
2650 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002651 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 if (!target) {
2653 PyErr_NoMemory();
2654 return NULL;
2655 }
2656 }
2657 else {
2658 if (targetsize < targetlen) {
2659 PyErr_Format(PyExc_SystemError,
2660 "string is longer than the buffer");
2661 if (copy_null && 0 < targetsize)
2662 target[0] = 0;
2663 return NULL;
2664 }
2665 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002666 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002667 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002668 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002670 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002671 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002672 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2673 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002674 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002675 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002676 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002677 else {
2678 Py_UNREACHABLE();
2679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 if (copy_null)
2681 target[len] = 0;
2682 return target;
2683}
2684
2685Py_UCS4*
2686PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2687 int copy_null)
2688{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002689 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 PyErr_BadInternalCall();
2691 return NULL;
2692 }
2693 return as_ucs4(string, target, targetsize, copy_null);
2694}
2695
2696Py_UCS4*
2697PyUnicode_AsUCS4Copy(PyObject *string)
2698{
2699 return as_ucs4(string, NULL, 0, 1);
2700}
2701
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256), and
   1 + (SIZEOF_LONG_LONG*53-1) / 22 computes that ceiling with integer
   arithmetic. */
2705#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002706
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002707static int
2708unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2709 Py_ssize_t width, Py_ssize_t precision)
2710{
2711 Py_ssize_t length, fill, arglen;
2712 Py_UCS4 maxchar;
2713
2714 if (PyUnicode_READY(str) == -1)
2715 return -1;
2716
2717 length = PyUnicode_GET_LENGTH(str);
2718 if ((precision == -1 || precision >= length)
2719 && width <= length)
2720 return _PyUnicodeWriter_WriteStr(writer, str);
2721
2722 if (precision != -1)
2723 length = Py_MIN(precision, length);
2724
2725 arglen = Py_MAX(length, width);
2726 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2727 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2728 else
2729 maxchar = writer->maxchar;
2730
2731 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2732 return -1;
2733
2734 if (width > length) {
2735 fill = width - length;
2736 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2737 return -1;
2738 writer->pos += fill;
2739 }
2740
2741 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2742 str, 0, length);
2743 writer->pos += length;
2744 return 0;
2745}
2746
2747static int
Victor Stinner998b8062018-09-12 00:23:25 +02002748unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002749 Py_ssize_t width, Py_ssize_t precision)
2750{
2751 /* UTF-8 */
2752 Py_ssize_t length;
2753 PyObject *unicode;
2754 int res;
2755
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002756 if (precision == -1) {
2757 length = strlen(str);
2758 }
2759 else {
2760 length = 0;
2761 while (length < precision && str[length]) {
2762 length++;
2763 }
2764 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2766 if (unicode == NULL)
2767 return -1;
2768
2769 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2770 Py_DECREF(unicode);
2771 return res;
2772}
2773
Victor Stinner96865452011-03-01 23:44:09 +00002774static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002775unicode_fromformat_arg(_PyUnicodeWriter *writer,
2776 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002777{
Victor Stinnere215d962012-10-06 23:03:36 +02002778 const char *p;
2779 Py_ssize_t len;
2780 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002781 Py_ssize_t width;
2782 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002783 int longflag;
2784 int longlongflag;
2785 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002786 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002787
2788 p = f;
2789 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002790 zeropad = 0;
2791 if (*f == '0') {
2792 zeropad = 1;
2793 f++;
2794 }
Victor Stinner96865452011-03-01 23:44:09 +00002795
2796 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002797 width = -1;
2798 if (Py_ISDIGIT((unsigned)*f)) {
2799 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002800 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002801 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002802 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002803 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002804 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002805 return NULL;
2806 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002807 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002808 f++;
2809 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002810 }
2811 precision = -1;
2812 if (*f == '.') {
2813 f++;
2814 if (Py_ISDIGIT((unsigned)*f)) {
2815 precision = (*f - '0');
2816 f++;
2817 while (Py_ISDIGIT((unsigned)*f)) {
2818 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2819 PyErr_SetString(PyExc_ValueError,
2820 "precision too big");
2821 return NULL;
2822 }
2823 precision = (precision * 10) + (*f - '0');
2824 f++;
2825 }
2826 }
Victor Stinner96865452011-03-01 23:44:09 +00002827 if (*f == '%') {
2828 /* "%.3%s" => f points to "3" */
2829 f--;
2830 }
2831 }
2832 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002833 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002834 f--;
2835 }
Victor Stinner96865452011-03-01 23:44:09 +00002836
2837 /* Handle %ld, %lu, %lld and %llu. */
2838 longflag = 0;
2839 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002840 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002841 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002842 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002843 longflag = 1;
2844 ++f;
2845 }
Victor Stinner96865452011-03-01 23:44:09 +00002846 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002847 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002848 longlongflag = 1;
2849 f += 2;
2850 }
Victor Stinner96865452011-03-01 23:44:09 +00002851 }
2852 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002853 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002854 size_tflag = 1;
2855 ++f;
2856 }
Victor Stinnere215d962012-10-06 23:03:36 +02002857
2858 if (f[1] == '\0')
2859 writer->overallocate = 0;
2860
2861 switch (*f) {
2862 case 'c':
2863 {
2864 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002865 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002866 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002867 "character argument not in range(0x110000)");
2868 return NULL;
2869 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002870 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002871 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002872 break;
2873 }
2874
2875 case 'i':
2876 case 'd':
2877 case 'u':
2878 case 'x':
2879 {
2880 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002881 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002882 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002883
2884 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002885 if (longflag) {
2886 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2887 }
2888 else if (longlongflag) {
2889 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2890 }
2891 else if (size_tflag) {
2892 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2893 }
2894 else {
2895 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2896 }
Victor Stinnere215d962012-10-06 23:03:36 +02002897 }
2898 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002899 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002900 }
2901 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002902 if (longflag) {
2903 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2904 }
2905 else if (longlongflag) {
2906 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2907 }
2908 else if (size_tflag) {
2909 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2910 }
2911 else {
2912 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2913 }
Victor Stinnere215d962012-10-06 23:03:36 +02002914 }
2915 assert(len >= 0);
2916
Victor Stinnere215d962012-10-06 23:03:36 +02002917 if (precision < len)
2918 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002919
2920 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002921 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2922 return NULL;
2923
Victor Stinnere215d962012-10-06 23:03:36 +02002924 if (width > precision) {
2925 Py_UCS4 fillchar;
2926 fill = width - precision;
2927 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002928 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2929 return NULL;
2930 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002931 }
Victor Stinner15a11362012-10-06 23:48:20 +02002932 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002933 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002934 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2935 return NULL;
2936 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002937 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002938
Victor Stinner4a587072013-11-19 12:54:53 +01002939 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2940 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002941 break;
2942 }
2943
2944 case 'p':
2945 {
2946 char number[MAX_LONG_LONG_CHARS];
2947
2948 len = sprintf(number, "%p", va_arg(*vargs, void*));
2949 assert(len >= 0);
2950
2951 /* %p is ill-defined: ensure leading 0x. */
2952 if (number[1] == 'X')
2953 number[1] = 'x';
2954 else if (number[1] != 'x') {
2955 memmove(number + 2, number,
2956 strlen(number) + 1);
2957 number[0] = '0';
2958 number[1] = 'x';
2959 len += 2;
2960 }
2961
Victor Stinner4a587072013-11-19 12:54:53 +01002962 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002963 return NULL;
2964 break;
2965 }
2966
2967 case 's':
2968 {
2969 /* UTF-8 */
2970 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002971 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002972 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002973 break;
2974 }
2975
2976 case 'U':
2977 {
2978 PyObject *obj = va_arg(*vargs, PyObject *);
2979 assert(obj && _PyUnicode_CHECK(obj));
2980
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002981 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002982 return NULL;
2983 break;
2984 }
2985
2986 case 'V':
2987 {
2988 PyObject *obj = va_arg(*vargs, PyObject *);
2989 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002990 if (obj) {
2991 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002992 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002993 return NULL;
2994 }
2995 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002996 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002997 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002998 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002999 }
3000 break;
3001 }
3002
3003 case 'S':
3004 {
3005 PyObject *obj = va_arg(*vargs, PyObject *);
3006 PyObject *str;
3007 assert(obj);
3008 str = PyObject_Str(obj);
3009 if (!str)
3010 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003011 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003012 Py_DECREF(str);
3013 return NULL;
3014 }
3015 Py_DECREF(str);
3016 break;
3017 }
3018
3019 case 'R':
3020 {
3021 PyObject *obj = va_arg(*vargs, PyObject *);
3022 PyObject *repr;
3023 assert(obj);
3024 repr = PyObject_Repr(obj);
3025 if (!repr)
3026 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003027 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003028 Py_DECREF(repr);
3029 return NULL;
3030 }
3031 Py_DECREF(repr);
3032 break;
3033 }
3034
3035 case 'A':
3036 {
3037 PyObject *obj = va_arg(*vargs, PyObject *);
3038 PyObject *ascii;
3039 assert(obj);
3040 ascii = PyObject_ASCII(obj);
3041 if (!ascii)
3042 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003043 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003044 Py_DECREF(ascii);
3045 return NULL;
3046 }
3047 Py_DECREF(ascii);
3048 break;
3049 }
3050
3051 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003052 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003053 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003054 break;
3055
3056 default:
3057 /* if we stumble upon an unknown formatting code, copy the rest
3058 of the format string to the output string. (we cannot just
3059 skip the code, since there's no way to know what's in the
3060 argument list) */
3061 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003062 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003063 return NULL;
3064 f = p+len;
3065 return f;
3066 }
3067
3068 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003069 return f;
3070}
3071
Walter Dörwaldd2034312007-05-18 16:29:38 +00003072PyObject *
3073PyUnicode_FromFormatV(const char *format, va_list vargs)
3074{
Victor Stinnere215d962012-10-06 23:03:36 +02003075 va_list vargs2;
3076 const char *f;
3077 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003078
Victor Stinner8f674cc2013-04-17 23:02:17 +02003079 _PyUnicodeWriter_Init(&writer);
3080 writer.min_length = strlen(format) + 100;
3081 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003082
Benjamin Peterson0c212142016-09-20 20:39:33 -07003083 // Copy varags to be able to pass a reference to a subfunction.
3084 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003085
3086 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003087 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003088 f = unicode_fromformat_arg(&writer, f, &vargs2);
3089 if (f == NULL)
3090 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003092 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003093 const char *p;
3094 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003095
Victor Stinnere215d962012-10-06 23:03:36 +02003096 p = f;
3097 do
3098 {
3099 if ((unsigned char)*p > 127) {
3100 PyErr_Format(PyExc_ValueError,
3101 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3102 "string, got a non-ASCII byte: 0x%02x",
3103 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003104 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003105 }
3106 p++;
3107 }
3108 while (*p != '\0' && *p != '%');
3109 len = p - f;
3110
3111 if (*p == '\0')
3112 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003113
3114 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003115 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003116
3117 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003118 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003119 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003120 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003121 return _PyUnicodeWriter_Finish(&writer);
3122
3123 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003124 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003125 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003126 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003127}
3128
Walter Dörwaldd2034312007-05-18 16:29:38 +00003129PyObject *
3130PyUnicode_FromFormat(const char *format, ...)
3131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003132 PyObject* ret;
3133 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003134
3135#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003136 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003137#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003138 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003139#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003140 ret = PyUnicode_FromFormatV(format, vargs);
3141 va_end(vargs);
3142 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003143}
3144
Serhiy Storchakac46db922018-10-23 22:58:24 +03003145static Py_ssize_t
3146unicode_get_widechar_size(PyObject *unicode)
3147{
3148 Py_ssize_t res;
3149
3150 assert(unicode != NULL);
3151 assert(_PyUnicode_CHECK(unicode));
3152
3153 if (_PyUnicode_WSTR(unicode) != NULL) {
3154 return PyUnicode_WSTR_LENGTH(unicode);
3155 }
3156 assert(PyUnicode_IS_READY(unicode));
3157
3158 res = _PyUnicode_LENGTH(unicode);
3159#if SIZEOF_WCHAR_T == 2
3160 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3161 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3162 const Py_UCS4 *end = s + res;
3163 for (; s < end; ++s) {
3164 if (*s > 0xFFFF) {
3165 ++res;
3166 }
3167 }
3168 }
3169#endif
3170 return res;
3171}
3172
3173static void
3174unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3175{
3176 const wchar_t *wstr;
3177
3178 assert(unicode != NULL);
3179 assert(_PyUnicode_CHECK(unicode));
3180
3181 wstr = _PyUnicode_WSTR(unicode);
3182 if (wstr != NULL) {
3183 memcpy(w, wstr, size * sizeof(wchar_t));
3184 return;
3185 }
3186 assert(PyUnicode_IS_READY(unicode));
3187
3188 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3189 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3190 for (; size--; ++s, ++w) {
3191 *w = *s;
3192 }
3193 }
3194 else {
3195#if SIZEOF_WCHAR_T == 4
3196 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3197 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3198 for (; size--; ++s, ++w) {
3199 *w = *s;
3200 }
3201#else
3202 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3203 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3204 for (; size--; ++s, ++w) {
3205 Py_UCS4 ch = *s;
3206 if (ch > 0xFFFF) {
3207 assert(ch <= MAX_UNICODE);
3208 /* encode surrogate pair in this case */
3209 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3210 if (!size--)
3211 break;
3212 *w = Py_UNICODE_LOW_SURROGATE(ch);
3213 }
3214 else {
3215 *w = ch;
3216 }
3217 }
3218#endif
3219 }
3220}
3221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003222#ifdef HAVE_WCHAR_H
3223
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003224/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003225
Victor Stinnerd88d9832011-09-06 02:00:05 +02003226 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003227 character) required to convert the unicode object. Ignore size argument.
3228
Victor Stinnerd88d9832011-09-06 02:00:05 +02003229 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003230 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003231 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003232Py_ssize_t
3233PyUnicode_AsWideChar(PyObject *unicode,
3234 wchar_t *w,
3235 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003236{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003237 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003238
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003239 if (unicode == NULL) {
3240 PyErr_BadInternalCall();
3241 return -1;
3242 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003243 if (!PyUnicode_Check(unicode)) {
3244 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003245 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003246 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003247
3248 res = unicode_get_widechar_size(unicode);
3249 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003250 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003251 }
3252
3253 if (size > res) {
3254 size = res + 1;
3255 }
3256 else {
3257 res = size;
3258 }
3259 unicode_copy_as_widechar(unicode, w, size);
3260 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003261}
3262
Victor Stinner137c34c2010-09-29 10:25:54 +00003263wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003264PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003265 Py_ssize_t *size)
3266{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003267 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003268 Py_ssize_t buflen;
3269
3270 if (unicode == NULL) {
3271 PyErr_BadInternalCall();
3272 return NULL;
3273 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003274 if (!PyUnicode_Check(unicode)) {
3275 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003276 return NULL;
3277 }
3278
Serhiy Storchakac46db922018-10-23 22:58:24 +03003279 buflen = unicode_get_widechar_size(unicode);
3280 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003281 if (buffer == NULL) {
3282 PyErr_NoMemory();
3283 return NULL;
3284 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003285 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3286 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003287 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003288 }
3289 else if (wcslen(buffer) != (size_t)buflen) {
3290 PyMem_FREE(buffer);
3291 PyErr_SetString(PyExc_ValueError,
3292 "embedded null character");
3293 return NULL;
3294 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003295 return buffer;
3296}
3297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003298#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003300int
3301_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3302{
3303 wchar_t **p = (wchar_t **)ptr;
3304 if (obj == NULL) {
3305#if !USE_UNICODE_WCHAR_CACHE
3306 PyMem_Free(*p);
3307#endif /* USE_UNICODE_WCHAR_CACHE */
3308 *p = NULL;
3309 return 1;
3310 }
3311 if (PyUnicode_Check(obj)) {
3312#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003313 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3314 if (*p == NULL) {
3315 return 0;
3316 }
3317 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003318#else /* USE_UNICODE_WCHAR_CACHE */
3319 *p = PyUnicode_AsWideCharString(obj, NULL);
3320 if (*p == NULL) {
3321 return 0;
3322 }
3323 return Py_CLEANUP_SUPPORTED;
3324#endif /* USE_UNICODE_WCHAR_CACHE */
3325 }
3326 PyErr_Format(PyExc_TypeError,
3327 "argument must be str, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003328 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003329 return 0;
3330}
3331
3332int
3333_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3334{
3335 wchar_t **p = (wchar_t **)ptr;
3336 if (obj == NULL) {
3337#if !USE_UNICODE_WCHAR_CACHE
3338 PyMem_Free(*p);
3339#endif /* USE_UNICODE_WCHAR_CACHE */
3340 *p = NULL;
3341 return 1;
3342 }
3343 if (obj == Py_None) {
3344 *p = NULL;
3345 return 1;
3346 }
3347 if (PyUnicode_Check(obj)) {
3348#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003349 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3350 if (*p == NULL) {
3351 return 0;
3352 }
3353 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003354#else /* USE_UNICODE_WCHAR_CACHE */
3355 *p = PyUnicode_AsWideCharString(obj, NULL);
3356 if (*p == NULL) {
3357 return 0;
3358 }
3359 return Py_CLEANUP_SUPPORTED;
3360#endif /* USE_UNICODE_WCHAR_CACHE */
3361 }
3362 PyErr_Format(PyExc_TypeError,
3363 "argument must be str or None, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003364 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003365 return 0;
3366}
3367
Alexander Belopolsky40018472011-02-26 01:02:56 +00003368PyObject *
3369PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003370{
Victor Stinner8faf8212011-12-08 22:14:11 +01003371 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 PyErr_SetString(PyExc_ValueError,
3373 "chr() arg not in range(0x110000)");
3374 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003375 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003376
Victor Stinner985a82a2014-01-03 12:53:47 +01003377 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003378}
3379
Alexander Belopolsky40018472011-02-26 01:02:56 +00003380PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003381PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003383 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003384 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003385 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003386 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003387 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003388 Py_INCREF(obj);
3389 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003390 }
3391 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003392 /* For a Unicode subtype that's not a Unicode object,
3393 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003394 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003395 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003396 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003397 "Can't convert '%.100s' object to str implicitly",
3398 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003399 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003400}
3401
Alexander Belopolsky40018472011-02-26 01:02:56 +00003402PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003403PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003404 const char *encoding,
3405 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003406{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003407 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003408 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003409
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003411 PyErr_BadInternalCall();
3412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003413 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003414
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003415 /* Decoding bytes objects is the most common case and should be fast */
3416 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003417 if (PyBytes_GET_SIZE(obj) == 0) {
3418 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3419 return NULL;
3420 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003421 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003422 }
3423 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003424 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3425 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003426 }
3427
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003428 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003429 PyErr_SetString(PyExc_TypeError,
3430 "decoding str is not supported");
3431 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003432 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003433
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003434 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3435 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3436 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003437 "decoding to str: need a bytes-like object, %.80s found",
3438 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003439 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003440 }
Tim Petersced69f82003-09-16 20:30:58 +00003441
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003442 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003443 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003444 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3445 return NULL;
3446 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003447 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003449
Serhiy Storchaka05997252013-01-26 12:14:02 +02003450 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003451 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003452 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453}
3454
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Every
   run of other characters that sits between two copied characters collapses
   into a single '_'; leading and trailing runs are dropped.

   Return 1 on success (`lower` is NUL-terminated), or 0 on error: the
   normalized name does not fit in lower_len-1 characters, or lower_len is 0.
   On error `lower` may have been partially written and is not
   NUL-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* Without this guard, lower_len - 1 wraps around and l_end ends up far
       outside the buffer, so the bounds checks below would never trigger. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    l_end = &lower[lower_len - 1];  /* last slot is reserved for the NUL */
    punct = 0;                      /* set while inside a separator run */
    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the preceding separator run, unless that
               run was at the very start of the name. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }
    }
    *l = '\0';
    return 1;
}
3503
Alexander Belopolsky40018472011-02-26 01:02:56 +00003504PyObject *
3505PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003506 Py_ssize_t size,
3507 const char *encoding,
3508 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003509{
3510 PyObject *buffer = NULL, *unicode;
3511 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003512 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3513
Victor Stinner22eb6892019-06-26 00:51:05 +02003514 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3515 return NULL;
3516 }
3517
Victor Stinnered076ed2019-06-26 01:49:32 +02003518 if (size == 0) {
3519 _Py_RETURN_UNICODE_EMPTY();
3520 }
3521
Victor Stinner942889a2016-09-05 15:40:10 -07003522 if (encoding == NULL) {
3523 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3524 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003525
Fred Drakee4315f52000-05-09 19:53:39 +00003526 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003527 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3528 char *lower = buflower;
3529
3530 /* Fast paths */
3531 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3532 lower += 3;
3533 if (*lower == '_') {
3534 /* Match "utf8" and "utf_8" */
3535 lower++;
3536 }
3537
3538 if (lower[0] == '8' && lower[1] == 0) {
3539 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3540 }
3541 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3542 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3543 }
3544 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3545 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3546 }
3547 }
3548 else {
3549 if (strcmp(lower, "ascii") == 0
3550 || strcmp(lower, "us_ascii") == 0) {
3551 return PyUnicode_DecodeASCII(s, size, errors);
3552 }
Steve Dowercc16be82016-09-08 10:35:16 -07003553 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003554 else if (strcmp(lower, "mbcs") == 0) {
3555 return PyUnicode_DecodeMBCS(s, size, errors);
3556 }
3557 #endif
3558 else if (strcmp(lower, "latin1") == 0
3559 || strcmp(lower, "latin_1") == 0
3560 || strcmp(lower, "iso_8859_1") == 0
3561 || strcmp(lower, "iso8859_1") == 0) {
3562 return PyUnicode_DecodeLatin1(s, size, errors);
3563 }
3564 }
Victor Stinner37296e82010-06-10 13:36:23 +00003565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566
3567 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003568 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003569 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003570 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003571 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572 if (buffer == NULL)
3573 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003574 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575 if (unicode == NULL)
3576 goto onError;
3577 if (!PyUnicode_Check(unicode)) {
3578 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003579 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003580 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003581 encoding,
3582 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 Py_DECREF(unicode);
3584 goto onError;
3585 }
3586 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003587 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003588
Benjamin Peterson29060642009-01-31 22:14:21 +00003589 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 Py_XDECREF(buffer);
3591 return NULL;
3592}
3593
Alexander Belopolsky40018472011-02-26 01:02:56 +00003594PyObject *
3595PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003596 const char *encoding,
3597 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003598{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003599 if (!PyUnicode_Check(unicode)) {
3600 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003601 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003602 }
3603
Serhiy Storchaka00939072016-10-27 21:05:49 +03003604 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3605 "PyUnicode_AsDecodedObject() is deprecated; "
3606 "use PyCodec_Decode() to decode from str", 1) < 0)
3607 return NULL;
3608
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003609 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003610 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003611
3612 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003613 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003614}
3615
Alexander Belopolsky40018472011-02-26 01:02:56 +00003616PyObject *
3617PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003618 const char *encoding,
3619 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003620{
3621 PyObject *v;
3622
3623 if (!PyUnicode_Check(unicode)) {
3624 PyErr_BadArgument();
3625 goto onError;
3626 }
3627
Serhiy Storchaka00939072016-10-27 21:05:49 +03003628 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3629 "PyUnicode_AsDecodedUnicode() is deprecated; "
3630 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3631 return NULL;
3632
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003633 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003634 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003635
3636 /* Decode via the codec registry */
3637 v = PyCodec_Decode(unicode, encoding, errors);
3638 if (v == NULL)
3639 goto onError;
3640 if (!PyUnicode_Check(v)) {
3641 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003642 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003643 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003644 encoding,
3645 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003646 Py_DECREF(v);
3647 goto onError;
3648 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003649 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003650
Benjamin Peterson29060642009-01-31 22:14:21 +00003651 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003652 return NULL;
3653}
3654
Alexander Belopolsky40018472011-02-26 01:02:56 +00003655PyObject *
3656PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003657 Py_ssize_t size,
3658 const char *encoding,
3659 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660{
3661 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003662
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003663 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003665 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3667 Py_DECREF(unicode);
3668 return v;
3669}
3670
Alexander Belopolsky40018472011-02-26 01:02:56 +00003671PyObject *
3672PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003673 const char *encoding,
3674 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003675{
3676 PyObject *v;
3677
3678 if (!PyUnicode_Check(unicode)) {
3679 PyErr_BadArgument();
3680 goto onError;
3681 }
3682
Serhiy Storchaka00939072016-10-27 21:05:49 +03003683 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3684 "PyUnicode_AsEncodedObject() is deprecated; "
3685 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3686 "or PyCodec_Encode() for generic encoding", 1) < 0)
3687 return NULL;
3688
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003689 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003690 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003691
3692 /* Encode via the codec registry */
3693 v = PyCodec_Encode(unicode, encoding, errors);
3694 if (v == NULL)
3695 goto onError;
3696 return v;
3697
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003699 return NULL;
3700}
3701
Victor Stinner1b579672011-12-17 05:47:23 +01003702
Victor Stinner2cba6b82018-01-10 22:46:15 +01003703static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003704unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003705 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003706{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003707 Py_ssize_t wlen;
3708 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3709 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003710 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003711 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003712
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003713 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003714 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003715 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003716 return NULL;
3717 }
3718
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003719 char *str;
3720 size_t error_pos;
3721 const char *reason;
3722 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003723 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003724 PyMem_Free(wstr);
3725
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003726 if (res != 0) {
3727 if (res == -2) {
3728 PyObject *exc;
3729 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3730 "locale", unicode,
3731 (Py_ssize_t)error_pos,
3732 (Py_ssize_t)(error_pos+1),
3733 reason);
3734 if (exc != NULL) {
3735 PyCodec_StrictErrors(exc);
3736 Py_DECREF(exc);
3737 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003738 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003739 else if (res == -3) {
3740 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3741 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003742 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003743 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003744 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003745 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003746 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003747
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003748 PyObject *bytes = PyBytes_FromString(str);
3749 PyMem_RawFree(str);
3750 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003751}
3752
Victor Stinnerad158722010-10-27 00:25:46 +00003753PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003754PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3755{
Victor Stinner709d23d2019-05-02 14:56:30 -04003756 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3757 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003758}
3759
3760PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003761PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003762{
Victor Stinner81a7be32020-04-14 15:14:01 +02003763 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003764 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3765 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003766 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003767 fs_codec->error_handler,
3768 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003769 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003770#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003771 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003772 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003773 fs_codec->encoding,
3774 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003775 }
Victor Stinnerad158722010-10-27 00:25:46 +00003776#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003777 else {
3778 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3779 machinery is not ready and so cannot be used:
3780 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003781 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3782 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003783 assert(filesystem_errors != NULL);
3784 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3785 assert(errors != _Py_ERROR_UNKNOWN);
3786#ifdef _Py_FORCE_UTF8_FS_ENCODING
3787 return unicode_encode_utf8(unicode, errors, NULL);
3788#else
3789 return unicode_encode_locale(unicode, errors, 0);
3790#endif
3791 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003792}
3793
Alexander Belopolsky40018472011-02-26 01:02:56 +00003794PyObject *
3795PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003796 const char *encoding,
3797 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798{
3799 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003800 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003801
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 if (!PyUnicode_Check(unicode)) {
3803 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 }
Fred Drakee4315f52000-05-09 19:53:39 +00003806
Victor Stinner22eb6892019-06-26 00:51:05 +02003807 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3808 return NULL;
3809 }
3810
Victor Stinner942889a2016-09-05 15:40:10 -07003811 if (encoding == NULL) {
3812 return _PyUnicode_AsUTF8String(unicode, errors);
3813 }
3814
Fred Drakee4315f52000-05-09 19:53:39 +00003815 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003816 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3817 char *lower = buflower;
3818
3819 /* Fast paths */
3820 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3821 lower += 3;
3822 if (*lower == '_') {
3823 /* Match "utf8" and "utf_8" */
3824 lower++;
3825 }
3826
3827 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003829 }
3830 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3831 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3832 }
3833 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3834 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3835 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003836 }
Victor Stinner942889a2016-09-05 15:40:10 -07003837 else {
3838 if (strcmp(lower, "ascii") == 0
3839 || strcmp(lower, "us_ascii") == 0) {
3840 return _PyUnicode_AsASCIIString(unicode, errors);
3841 }
Steve Dowercc16be82016-09-08 10:35:16 -07003842#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003843 else if (strcmp(lower, "mbcs") == 0) {
3844 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3845 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003846#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003847 else if (strcmp(lower, "latin1") == 0 ||
3848 strcmp(lower, "latin_1") == 0 ||
3849 strcmp(lower, "iso_8859_1") == 0 ||
3850 strcmp(lower, "iso8859_1") == 0) {
3851 return _PyUnicode_AsLatin1String(unicode, errors);
3852 }
3853 }
Victor Stinner37296e82010-06-10 13:36:23 +00003854 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855
3856 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003857 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003859 return NULL;
3860
3861 /* The normal path */
3862 if (PyBytes_Check(v))
3863 return v;
3864
3865 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003866 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003867 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003868 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003869
3870 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003871 "encoder %s returned bytearray instead of bytes; "
3872 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003873 encoding);
3874 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003875 Py_DECREF(v);
3876 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003877 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003878
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003879 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3880 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003881 Py_DECREF(v);
3882 return b;
3883 }
3884
3885 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003886 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003887 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003888 encoding,
3889 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003890 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003891 return NULL;
3892}
3893
Alexander Belopolsky40018472011-02-26 01:02:56 +00003894PyObject *
3895PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003896 const char *encoding,
3897 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003898{
3899 PyObject *v;
3900
3901 if (!PyUnicode_Check(unicode)) {
3902 PyErr_BadArgument();
3903 goto onError;
3904 }
3905
Serhiy Storchaka00939072016-10-27 21:05:49 +03003906 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3907 "PyUnicode_AsEncodedUnicode() is deprecated; "
3908 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3909 return NULL;
3910
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003911 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003912 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003913
3914 /* Encode via the codec registry */
3915 v = PyCodec_Encode(unicode, encoding, errors);
3916 if (v == NULL)
3917 goto onError;
3918 if (!PyUnicode_Check(v)) {
3919 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003920 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003921 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003922 encoding,
3923 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003924 Py_DECREF(v);
3925 goto onError;
3926 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003928
Benjamin Peterson29060642009-01-31 22:14:21 +00003929 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 return NULL;
3931}
3932
Victor Stinner2cba6b82018-01-10 22:46:15 +01003933static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003934unicode_decode_locale(const char *str, Py_ssize_t len,
3935 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003936{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003937 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3938 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003939 return NULL;
3940 }
3941
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003942 wchar_t *wstr;
3943 size_t wlen;
3944 const char *reason;
3945 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003946 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003947 if (res != 0) {
3948 if (res == -2) {
3949 PyObject *exc;
3950 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3951 "locale", str, len,
3952 (Py_ssize_t)wlen,
3953 (Py_ssize_t)(wlen + 1),
3954 reason);
3955 if (exc != NULL) {
3956 PyCodec_StrictErrors(exc);
3957 Py_DECREF(exc);
3958 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003959 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003960 else if (res == -3) {
3961 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3962 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003963 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003964 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003965 }
Victor Stinner2f197072011-12-17 07:08:30 +01003966 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003967 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003968
3969 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3970 PyMem_RawFree(wstr);
3971 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003972}
3973
3974PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003975PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3976 const char *errors)
3977{
Victor Stinner709d23d2019-05-02 14:56:30 -04003978 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3979 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003980}
3981
3982PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003983PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003984{
3985 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003986 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3987 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003988}
3989
3990
3991PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003992PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003993 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003994 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3995}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003996
Christian Heimes5894ba72007-11-04 11:43:14 +00003997PyObject*
3998PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3999{
Victor Stinner81a7be32020-04-14 15:14:01 +02004000 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004001 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4002 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004003 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004004 fs_codec->error_handler,
4005 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004006 NULL);
4007 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004008#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004009 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004010 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004011 fs_codec->encoding,
4012 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004013 }
Victor Stinnerad158722010-10-27 00:25:46 +00004014#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004015 else {
4016 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4017 machinery is not ready and so cannot be used:
4018 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004019 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4020 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004021 assert(filesystem_errors != NULL);
4022 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4023 assert(errors != _Py_ERROR_UNKNOWN);
4024#ifdef _Py_FORCE_UTF8_FS_ENCODING
4025 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4026#else
4027 return unicode_decode_locale(s, size, errors, 0);
4028#endif
4029 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004030}
4031
Martin v. Löwis011e8422009-05-05 04:43:17 +00004032
4033int
4034PyUnicode_FSConverter(PyObject* arg, void* addr)
4035{
Brett Cannonec6ce872016-09-06 15:50:29 -07004036 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004037 PyObject *output = NULL;
4038 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004039 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004040 if (arg == NULL) {
4041 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004042 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004043 return 1;
4044 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004045 path = PyOS_FSPath(arg);
4046 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004047 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004048 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004049 if (PyBytes_Check(path)) {
4050 output = path;
4051 }
4052 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4053 output = PyUnicode_EncodeFSDefault(path);
4054 Py_DECREF(path);
4055 if (!output) {
4056 return 0;
4057 }
4058 assert(PyBytes_Check(output));
4059 }
4060
Victor Stinner0ea2a462010-04-30 00:22:08 +00004061 size = PyBytes_GET_SIZE(output);
4062 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004063 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004064 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004065 Py_DECREF(output);
4066 return 0;
4067 }
4068 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004069 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004070}
4071
4072
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004073int
4074PyUnicode_FSDecoder(PyObject* arg, void* addr)
4075{
Brett Cannona5711202016-09-06 19:36:01 -07004076 int is_buffer = 0;
4077 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004078 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004079 if (arg == NULL) {
4080 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004081 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004082 return 1;
4083 }
Brett Cannona5711202016-09-06 19:36:01 -07004084
4085 is_buffer = PyObject_CheckBuffer(arg);
4086 if (!is_buffer) {
4087 path = PyOS_FSPath(arg);
4088 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004089 return 0;
4090 }
Brett Cannona5711202016-09-06 19:36:01 -07004091 }
4092 else {
4093 path = arg;
4094 Py_INCREF(arg);
4095 }
4096
4097 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004098 output = path;
4099 }
4100 else if (PyBytes_Check(path) || is_buffer) {
4101 PyObject *path_bytes = NULL;
4102
4103 if (!PyBytes_Check(path) &&
4104 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004105 "path should be string, bytes, or os.PathLike, not %.200s",
4106 Py_TYPE(arg)->tp_name)) {
4107 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004108 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004109 }
4110 path_bytes = PyBytes_FromObject(path);
4111 Py_DECREF(path);
4112 if (!path_bytes) {
4113 return 0;
4114 }
4115 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4116 PyBytes_GET_SIZE(path_bytes));
4117 Py_DECREF(path_bytes);
4118 if (!output) {
4119 return 0;
4120 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004121 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004122 else {
4123 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004124 "path should be string, bytes, or os.PathLike, not %.200s",
4125 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004126 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004127 return 0;
4128 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004129 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004130 Py_DECREF(output);
4131 return 0;
4132 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004133 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004134 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004135 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004136 Py_DECREF(output);
4137 return 0;
4138 }
4139 *(PyObject**)addr = output;
4140 return Py_CLEANUP_SUPPORTED;
4141}
4142
4143
Inada Naoki02a4d572020-02-27 13:48:59 +09004144static int unicode_fill_utf8(PyObject *unicode);
4145
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004146const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004147PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004148{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004149 if (!PyUnicode_Check(unicode)) {
4150 PyErr_BadArgument();
4151 return NULL;
4152 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004153 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004154 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004155
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004156 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004157 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004158 return NULL;
4159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004160 }
4161
4162 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004163 *psize = PyUnicode_UTF8_LENGTH(unicode);
4164 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004165}
4166
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004167const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004168PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004169{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004170 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4171}
4172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004173Py_UNICODE *
4174PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4175{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004176 if (!PyUnicode_Check(unicode)) {
4177 PyErr_BadArgument();
4178 return NULL;
4179 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004180 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4181 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004182 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004183 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004184 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004185
Serhiy Storchakac46db922018-10-23 22:58:24 +03004186 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4187 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4188 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004189 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004190 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004191 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4192 if (w == NULL) {
4193 PyErr_NoMemory();
4194 return NULL;
4195 }
4196 unicode_copy_as_widechar(unicode, w, wlen + 1);
4197 _PyUnicode_WSTR(unicode) = w;
4198 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4199 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004200 }
4201 }
4202 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004203 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004204 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004205}
4206
Inada Naoki2c4928d2020-06-17 20:09:44 +09004207/* Deprecated APIs */
4208
4209_Py_COMP_DIAG_PUSH
4210_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4211
Alexander Belopolsky40018472011-02-26 01:02:56 +00004212Py_UNICODE *
4213PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004215 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004216}
4217
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004218const Py_UNICODE *
4219_PyUnicode_AsUnicode(PyObject *unicode)
4220{
4221 Py_ssize_t size;
4222 const Py_UNICODE *wstr;
4223
4224 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4225 if (wstr && wcslen(wstr) != (size_t)size) {
4226 PyErr_SetString(PyExc_ValueError, "embedded null character");
4227 return NULL;
4228 }
4229 return wstr;
4230}
4231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004232
Alexander Belopolsky40018472011-02-26 01:02:56 +00004233Py_ssize_t
4234PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235{
4236 if (!PyUnicode_Check(unicode)) {
4237 PyErr_BadArgument();
4238 goto onError;
4239 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004240 if (_PyUnicode_WSTR(unicode) == NULL) {
4241 if (PyUnicode_AsUnicode(unicode) == NULL)
4242 goto onError;
4243 }
4244 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004245
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247 return -1;
4248}
4249
Inada Naoki2c4928d2020-06-17 20:09:44 +09004250_Py_COMP_DIAG_POP
4251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004252Py_ssize_t
4253PyUnicode_GetLength(PyObject *unicode)
4254{
Victor Stinner07621332012-06-16 04:53:46 +02004255 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004256 PyErr_BadArgument();
4257 return -1;
4258 }
Victor Stinner07621332012-06-16 04:53:46 +02004259 if (PyUnicode_READY(unicode) == -1)
4260 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004261 return PyUnicode_GET_LENGTH(unicode);
4262}
4263
4264Py_UCS4
4265PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4266{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004267 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004268 int kind;
4269
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004270 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004271 PyErr_BadArgument();
4272 return (Py_UCS4)-1;
4273 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004274 if (PyUnicode_READY(unicode) == -1) {
4275 return (Py_UCS4)-1;
4276 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004277 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004278 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004279 return (Py_UCS4)-1;
4280 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004281 data = PyUnicode_DATA(unicode);
4282 kind = PyUnicode_KIND(unicode);
4283 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004284}
4285
4286int
4287PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4288{
4289 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004290 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004291 return -1;
4292 }
Victor Stinner488fa492011-12-12 00:01:39 +01004293 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004294 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004295 PyErr_SetString(PyExc_IndexError, "string index out of range");
4296 return -1;
4297 }
Victor Stinner488fa492011-12-12 00:01:39 +01004298 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004299 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004300 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4301 PyErr_SetString(PyExc_ValueError, "character out of range");
4302 return -1;
4303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004304 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4305 index, ch);
4306 return 0;
4307}
4308
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4314
Victor Stinner554f3f02010-06-16 23:33:54 +00004315/* create or adjust a UnicodeDecodeError */
4316static void
4317make_decode_exception(PyObject **exceptionObject,
4318 const char *encoding,
4319 const char *input, Py_ssize_t length,
4320 Py_ssize_t startpos, Py_ssize_t endpos,
4321 const char *reason)
4322{
4323 if (*exceptionObject == NULL) {
4324 *exceptionObject = PyUnicodeDecodeError_Create(
4325 encoding, input, length, startpos, endpos, reason);
4326 }
4327 else {
4328 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4329 goto onError;
4330 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4331 goto onError;
4332 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4333 goto onError;
4334 }
4335 return;
4336
4337onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004338 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004339}
4340
Steve Dowercc16be82016-09-08 10:35:16 -07004341#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004342static int
4343widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4344{
4345 if (newsize > *size) {
4346 wchar_t *newbuf = *buf;
4347 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4348 PyErr_NoMemory();
4349 return -1;
4350 }
4351 *buf = newbuf;
4352 }
4353 *size = newsize;
4354 return 0;
4355}
4356
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004357/* error handling callback helper:
4358 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004359 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004360 and adjust various state variables.
4361 return 0 on success, -1 on error
4362*/
4363
Alexander Belopolsky40018472011-02-26 01:02:56 +00004364static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004365unicode_decode_call_errorhandler_wchar(
4366 const char *errors, PyObject **errorHandler,
4367 const char *encoding, const char *reason,
4368 const char **input, const char **inend, Py_ssize_t *startinpos,
4369 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004370 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004372 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004373
4374 PyObject *restuple = NULL;
4375 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004376 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004377 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004378 Py_ssize_t requiredsize;
4379 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004380 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004381 wchar_t *repwstr;
4382 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383
4384 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004385 *errorHandler = PyCodec_LookupError(errors);
4386 if (*errorHandler == NULL)
4387 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004388 }
4389
Victor Stinner554f3f02010-06-16 23:33:54 +00004390 make_decode_exception(exceptionObject,
4391 encoding,
4392 *input, *inend - *input,
4393 *startinpos, *endinpos,
4394 reason);
4395 if (*exceptionObject == NULL)
4396 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397
Petr Viktorinffd97532020-02-11 17:46:57 +01004398 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004400 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004401 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004402 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004403 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004405 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004407
4408 /* Copy back the bytes variables, which might have been modified by the
4409 callback */
4410 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4411 if (!inputobj)
4412 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004413 *input = PyBytes_AS_STRING(inputobj);
4414 insize = PyBytes_GET_SIZE(inputobj);
4415 *inend = *input + insize;
4416 /* we can DECREF safely, as the exception has another reference,
4417 so the object won't go away. */
4418 Py_DECREF(inputobj);
4419
4420 if (newpos<0)
4421 newpos = insize+newpos;
4422 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004423 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424 goto onError;
4425 }
4426
4427 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4428 if (repwstr == NULL)
4429 goto onError;
4430 /* need more space? (at least enough for what we
4431 have+the replacement+the rest of the string (starting
4432 at the new input position), so we won't have to check space
4433 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004434 requiredsize = *outpos;
4435 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4436 goto overflow;
4437 requiredsize += repwlen;
4438 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4439 goto overflow;
4440 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004441 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004442 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004443 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004444 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004445 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004446 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004447 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004449 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004450 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004451 *endinpos = newpos;
4452 *inptr = *input + newpos;
4453
4454 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004455 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004456 return 0;
4457
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004458 overflow:
4459 PyErr_SetString(PyExc_OverflowError,
4460 "decoded result is too long for a Python string");
4461
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004462 onError:
4463 Py_XDECREF(restuple);
4464 return -1;
4465}
Steve Dowercc16be82016-09-08 10:35:16 -07004466#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004467
4468static int
4469unicode_decode_call_errorhandler_writer(
4470 const char *errors, PyObject **errorHandler,
4471 const char *encoding, const char *reason,
4472 const char **input, const char **inend, Py_ssize_t *startinpos,
4473 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4474 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4475{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004476 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004477
4478 PyObject *restuple = NULL;
4479 PyObject *repunicode = NULL;
4480 Py_ssize_t insize;
4481 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004482 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004483 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004484 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004485 int need_to_grow = 0;
4486 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004487
4488 if (*errorHandler == NULL) {
4489 *errorHandler = PyCodec_LookupError(errors);
4490 if (*errorHandler == NULL)
4491 goto onError;
4492 }
4493
4494 make_decode_exception(exceptionObject,
4495 encoding,
4496 *input, *inend - *input,
4497 *startinpos, *endinpos,
4498 reason);
4499 if (*exceptionObject == NULL)
4500 goto onError;
4501
Petr Viktorinffd97532020-02-11 17:46:57 +01004502 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004503 if (restuple == NULL)
4504 goto onError;
4505 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004506 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004507 goto onError;
4508 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004509 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004510 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004511
4512 /* Copy back the bytes variables, which might have been modified by the
4513 callback */
4514 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4515 if (!inputobj)
4516 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004517 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004518 *input = PyBytes_AS_STRING(inputobj);
4519 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004520 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004521 /* we can DECREF safely, as the exception has another reference,
4522 so the object won't go away. */
4523 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004524
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004526 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004527 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004528 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004530 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531
Victor Stinner170ca6f2013-04-18 00:25:28 +02004532 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004533 if (replen > 1) {
4534 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004535 need_to_grow = 1;
4536 }
4537 new_inptr = *input + newpos;
4538 if (*inend - new_inptr > remain) {
4539 /* We don't know the decoding algorithm here so we make the worst
4540 assumption that one byte decodes to one unicode character.
4541 If unfortunately one byte could decode to more unicode characters,
4542 the decoder may write out-of-bound then. Is it possible for the
4543 algorithms using this function? */
4544 writer->min_length += *inend - new_inptr - remain;
4545 need_to_grow = 1;
4546 }
4547 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004548 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004549 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004550 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4551 goto onError;
4552 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004553 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004554 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004556 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004557 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004558
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004560 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004561 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004565 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566}
4567
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004568/* --- UTF-7 Codec -------------------------------------------------------- */
4569
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570/* See RFC2152 for details. We encode conservatively and decode liberally. */
4571
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  Each one evaluates its argument more than once (except
   TO_BASE64), so arguments must be free of side effects. */

/* True if c is one of the 64 base-64 characters. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z'))

/* The 6-bit value of base-64 character c.  Only meaningful when
   IS_BASE64(c) holds: anything that is not a letter, digit or '+' is
   treated as '/' (63). */

#define FROM_BASE64(c)                                  \
    (((c) == '+') ? 62 :                                \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* The base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4594
/* DECODE_DIRECT: true if byte c, met outside a shift sequence, decodes
 * to itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4602
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times and must be free of side effects.  The
 * range test comes first, so utf7_category is only indexed with 1..127.
 * directO and directWS are parenthesized so that any expression can be
 * passed for them. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648
Alexander Belopolsky40018472011-02-26 01:02:56 +00004649PyObject *
4650PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004651 Py_ssize_t size,
4652 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004654 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4655}
4656
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657/* The decoder. The only state we preserve is our read position,
4658 * i.e. how many characters we have consumed. So if we end in the
4659 * middle of a shift sequence we have to back off the read position
4660 * and the output to the beginning of the sequence, otherwise we lose
4661 * all the shift state (seen bits, number of bits seen, high
4662 * surrogate). */
4663
Alexander Belopolsky40018472011-02-26 01:02:56 +00004664PyObject *
4665PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004666 Py_ssize_t size,
4667 const char *errors,
4668 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004669{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004671 Py_ssize_t startinpos;
4672 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004673 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004674 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004675 const char *errmsg = "";
4676 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004677 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 unsigned int base64bits = 0;
4679 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004680 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004681 PyObject *errorHandler = NULL;
4682 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004683
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004684 if (size == 0) {
4685 if (consumed)
4686 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004687 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004688 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004689
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004690 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004691 _PyUnicodeWriter_Init(&writer);
4692 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004693
4694 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695 e = s + size;
4696
4697 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004698 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004699 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004700 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004701
Antoine Pitrou244651a2009-05-04 18:56:13 +00004702 if (inShift) { /* in a base-64 section */
4703 if (IS_BASE64(ch)) { /* consume a base-64 character */
4704 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4705 base64bits += 6;
4706 s++;
4707 if (base64bits >= 16) {
4708 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004709 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004710 base64bits -= 16;
4711 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004712 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004713 if (surrogate) {
4714 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004715 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4716 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004717 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004718 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004719 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004720 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004721 }
4722 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004723 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004724 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004725 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 }
4727 }
Victor Stinner551ac952011-11-29 22:58:13 +01004728 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004729 /* first surrogate */
4730 surrogate = outCh;
4731 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004732 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004733 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004734 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004735 }
4736 }
4737 }
4738 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004739 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004740 if (base64bits > 0) { /* left-over bits */
4741 if (base64bits >= 6) {
4742 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004743 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004744 errmsg = "partial character in shift sequence";
4745 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004746 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004747 else {
4748 /* Some bits remain; they should be zero */
4749 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004750 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004751 errmsg = "non-zero padding bits in shift sequence";
4752 goto utf7Error;
4753 }
4754 }
4755 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004756 if (surrogate && DECODE_DIRECT(ch)) {
4757 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4758 goto onError;
4759 }
4760 surrogate = 0;
4761 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004762 /* '-' is absorbed; other terminating
4763 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004764 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004766 }
4767 }
4768 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004769 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004770 s++; /* consume '+' */
4771 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004772 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004773 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004774 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004775 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004776 else if (s < e && !IS_BASE64(*s)) {
4777 s++;
4778 errmsg = "ill-formed sequence";
4779 goto utf7Error;
4780 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004782 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004783 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004784 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004785 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004786 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004787 }
4788 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004789 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004790 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004791 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004792 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004793 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004794 else {
4795 startinpos = s-starts;
4796 s++;
4797 errmsg = "unexpected special character";
4798 goto utf7Error;
4799 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004800 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004803 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004804 errors, &errorHandler,
4805 "utf7", errmsg,
4806 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004807 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004809 }
4810
Antoine Pitrou244651a2009-05-04 18:56:13 +00004811 /* end of string */
4812
4813 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4814 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004815 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004816 if (surrogate ||
4817 (base64bits >= 6) ||
4818 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004819 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004820 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004821 errors, &errorHandler,
4822 "utf7", "unterminated shift sequence",
4823 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004824 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004825 goto onError;
4826 if (s < e)
4827 goto restart;
4828 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004829 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004830
4831 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004832 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004833 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004834 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004835 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004836 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004837 writer.kind, writer.data, shiftOutStart);
4838 Py_XDECREF(errorHandler);
4839 Py_XDECREF(exc);
4840 _PyUnicodeWriter_Dealloc(&writer);
4841 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004842 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004843 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004844 }
4845 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004846 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004847 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004848 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004849
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 Py_XDECREF(errorHandler);
4851 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004852 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004853
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004855 Py_XDECREF(errorHandler);
4856 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004857 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004858 return NULL;
4859}
4860
4861
Alexander Belopolsky40018472011-02-26 01:02:56 +00004862PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004863_PyUnicode_EncodeUTF7(PyObject *str,
4864 int base64SetO,
4865 int base64WhiteSpace,
4866 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004867{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004868 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004869 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004870 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004871 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004872 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004873 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004874 unsigned int base64bits = 0;
4875 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004876 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004877 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004878
Benjamin Petersonbac79492012-01-14 13:34:47 -05004879 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004880 return NULL;
4881 kind = PyUnicode_KIND(str);
4882 data = PyUnicode_DATA(str);
4883 len = PyUnicode_GET_LENGTH(str);
4884
4885 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004886 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004887
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004888 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004889 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004890 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004891 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004892 if (v == NULL)
4893 return NULL;
4894
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004895 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004896 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004897 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004898
Antoine Pitrou244651a2009-05-04 18:56:13 +00004899 if (inShift) {
4900 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4901 /* shifting out */
4902 if (base64bits) { /* output remaining bits */
4903 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4904 base64buffer = 0;
4905 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004906 }
4907 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004908 /* Characters not in the BASE64 set implicitly unshift the sequence
4909 so no '-' is required, except if the character is itself a '-' */
4910 if (IS_BASE64(ch) || ch == '-') {
4911 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004912 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004913 *out++ = (char) ch;
4914 }
4915 else {
4916 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004917 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004918 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004919 else { /* not in a shift sequence */
4920 if (ch == '+') {
4921 *out++ = '+';
4922 *out++ = '-';
4923 }
4924 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4925 *out++ = (char) ch;
4926 }
4927 else {
4928 *out++ = '+';
4929 inShift = 1;
4930 goto encode_char;
4931 }
4932 }
4933 continue;
4934encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004935 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004936 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004937
Antoine Pitrou244651a2009-05-04 18:56:13 +00004938 /* code first surrogate */
4939 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004940 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004941 while (base64bits >= 6) {
4942 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4943 base64bits -= 6;
4944 }
4945 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004946 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004947 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004948 base64bits += 16;
4949 base64buffer = (base64buffer << 16) | ch;
4950 while (base64bits >= 6) {
4951 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4952 base64bits -= 6;
4953 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004954 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004955 if (base64bits)
4956 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4957 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004958 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004959 if (_PyBytes_Resize(&v, out - start) < 0)
4960 return NULL;
4961 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004962}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004963PyObject *
4964PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4965 Py_ssize_t size,
4966 int base64SetO,
4967 int base64WhiteSpace,
4968 const char *errors)
4969{
4970 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004971 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004972 if (tmp == NULL)
4973 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004974 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004975 base64WhiteSpace, errors);
4976 Py_DECREF(tmp);
4977 return result;
4978}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004979
Antoine Pitrou244651a2009-05-04 18:56:13 +00004980#undef IS_BASE64
4981#undef FROM_BASE64
4982#undef TO_BASE64
4983#undef DECODE_DIRECT
4984#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004985
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986/* --- UTF-8 Codec -------------------------------------------------------- */
4987
Alexander Belopolsky40018472011-02-26 01:02:56 +00004988PyObject *
4989PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004990 Py_ssize_t size,
4991 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992{
Walter Dörwald69652032004-09-07 20:24:22 +00004993 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4994}
4995
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004996#include "stringlib/asciilib.h"
4997#include "stringlib/codecs.h"
4998#include "stringlib/undef.h"
4999
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005000#include "stringlib/ucs1lib.h"
5001#include "stringlib/codecs.h"
5002#include "stringlib/undef.h"
5003
5004#include "stringlib/ucs2lib.h"
5005#include "stringlib/codecs.h"
5006#include "stringlib/undef.h"
5007
5008#include "stringlib/ucs4lib.h"
5009#include "stringlib/codecs.h"
5010#include "stringlib/undef.h"
5011
Antoine Pitrouab868312009-01-10 15:40:25 +00005012/* Mask to quickly check whether a C 'long' contains a
5013 non-ASCII, UTF8-encoded char. */
5014#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02005015# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00005016#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02005017# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00005018#else
5019# error C 'long' size should be either 4 or 8!
5020#endif
5021
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005022static Py_ssize_t
5023ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005024{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005025 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005026 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005027
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005028 /*
5029 * Issue #17237: m68k is a bit different from most architectures in
5030 * that objects do not use "natural alignment" - for example, int and
5031 * long are only aligned at 2-byte boundaries. Therefore the assert()
5032 * won't work; also, tests have shown that skipping the "optimised
5033 * version" will even speed up m68k.
5034 */
5035#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005036#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005037 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
5038 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005039 /* Fast path, see in STRINGLIB(utf8_decode) for
5040 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005041 /* Help allocation */
5042 const char *_p = p;
5043 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005044 while (_p < aligned_end) {
5045 unsigned long value = *(const unsigned long *) _p;
5046 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005048 *((unsigned long *)q) = value;
5049 _p += SIZEOF_LONG;
5050 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005051 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005052 p = _p;
5053 while (p < end) {
5054 if ((unsigned char)*p & 0x80)
5055 break;
5056 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005058 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005060#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005061#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005062 while (p < end) {
5063 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5064 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005065 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005066 /* Help allocation */
5067 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005068 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06005069 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005070 if (value & ASCII_CHAR_MASK)
5071 break;
5072 _p += SIZEOF_LONG;
5073 }
5074 p = _p;
5075 if (_p == end)
5076 break;
5077 }
5078 if ((unsigned char)*p & 0x80)
5079 break;
5080 ++p;
5081 }
5082 memcpy(dest, start, p - start);
5083 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084}
Antoine Pitrouab868312009-01-10 15:40:25 +00005085
Victor Stinner709d23d2019-05-02 14:56:30 -04005086static PyObject *
5087unicode_decode_utf8(const char *s, Py_ssize_t size,
5088 _Py_error_handler error_handler, const char *errors,
5089 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005090{
Victor Stinner785938e2011-12-11 20:09:03 +01005091 if (size == 0) {
5092 if (consumed)
5093 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005094 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005095 }
5096
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005097 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5098 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005099 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005100 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005101 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005102 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005103 }
5104
Inada Naoki770847a2019-06-24 12:30:24 +09005105 const char *starts = s;
5106 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005107
Inada Naoki770847a2019-06-24 12:30:24 +09005108 // fast path: try ASCII string.
5109 PyObject *u = PyUnicode_New(size, 127);
5110 if (u == NULL) {
5111 return NULL;
5112 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005113 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005114 if (s == end) {
5115 return u;
5116 }
5117
5118 // Use _PyUnicodeWriter after fast path is failed.
5119 _PyUnicodeWriter writer;
5120 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5121 writer.pos = s - starts;
5122
5123 Py_ssize_t startinpos, endinpos;
5124 const char *errmsg = "";
5125 PyObject *error_handler_obj = NULL;
5126 PyObject *exc = NULL;
5127
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005128 while (s < end) {
5129 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005130 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005131
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005132 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005133 if (PyUnicode_IS_ASCII(writer.buffer))
5134 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005135 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005136 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005137 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005138 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005139 } else {
5140 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005141 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005142 }
5143
5144 switch (ch) {
5145 case 0:
5146 if (s == end || consumed)
5147 goto End;
5148 errmsg = "unexpected end of data";
5149 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005150 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005151 break;
5152 case 1:
5153 errmsg = "invalid start byte";
5154 startinpos = s - starts;
5155 endinpos = startinpos + 1;
5156 break;
5157 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005158 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5159 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5160 {
5161 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005162 goto End;
5163 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005164 /* fall through */
5165 case 3:
5166 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005167 errmsg = "invalid continuation byte";
5168 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005169 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005170 break;
5171 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005172 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005173 goto onError;
5174 continue;
5175 }
5176
Victor Stinner1d65d912015-10-05 13:43:50 +02005177 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005178 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005179
5180 switch (error_handler) {
5181 case _Py_ERROR_IGNORE:
5182 s += (endinpos - startinpos);
5183 break;
5184
5185 case _Py_ERROR_REPLACE:
5186 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5187 goto onError;
5188 s += (endinpos - startinpos);
5189 break;
5190
5191 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005192 {
5193 Py_ssize_t i;
5194
Victor Stinner1d65d912015-10-05 13:43:50 +02005195 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5196 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005197 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005198 ch = (Py_UCS4)(unsigned char)(starts[i]);
5199 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5200 ch + 0xdc00);
5201 writer.pos++;
5202 }
5203 s += (endinpos - startinpos);
5204 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005205 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005206
5207 default:
5208 if (unicode_decode_call_errorhandler_writer(
5209 errors, &error_handler_obj,
5210 "utf-8", errmsg,
5211 &starts, &end, &startinpos, &endinpos, &exc, &s,
5212 &writer))
5213 goto onError;
5214 }
Victor Stinner785938e2011-12-11 20:09:03 +01005215 }
5216
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005217End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005218 if (consumed)
5219 *consumed = s - starts;
5220
Victor Stinner1d65d912015-10-05 13:43:50 +02005221 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005222 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005223 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005224
5225onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005226 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005227 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005228 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005229 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005230}
5231
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005232
Victor Stinner709d23d2019-05-02 14:56:30 -04005233PyObject *
5234PyUnicode_DecodeUTF8Stateful(const char *s,
5235 Py_ssize_t size,
5236 const char *errors,
5237 Py_ssize_t *consumed)
5238{
5239 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5240}
5241
5242
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005243/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5244 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005245
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005246 On success, write a pointer to a newly allocated wide character string into
5247 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5248 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005249
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005250 On memory allocation failure, return -1.
5251
5252 On decoding error (if surrogateescape is zero), return -2. If wlen is
5253 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5254 is not NULL, write the decoding error message into *reason. */
5255int
5256_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005257 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005258{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005259 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005260 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005261 wchar_t *unicode;
5262 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005263
Victor Stinner3d4226a2018-08-29 22:21:32 +02005264 int surrogateescape = 0;
5265 int surrogatepass = 0;
5266 switch (errors)
5267 {
5268 case _Py_ERROR_STRICT:
5269 break;
5270 case _Py_ERROR_SURROGATEESCAPE:
5271 surrogateescape = 1;
5272 break;
5273 case _Py_ERROR_SURROGATEPASS:
5274 surrogatepass = 1;
5275 break;
5276 default:
5277 return -3;
5278 }
5279
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005280 /* Note: size will always be longer than the resulting Unicode
5281 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005282 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005283 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005284 }
5285
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005286 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005287 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005288 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005289 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005290
5291 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005292 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005293 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005294 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005295 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005296#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005297 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005298#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005299 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005300#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005301 if (ch > 0xFF) {
5302#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005303 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005304#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005305 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005306 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005307 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5308 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5309#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005310 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005311 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005312 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005313 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005314 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005315
5316 if (surrogateescape) {
5317 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5318 }
5319 else {
5320 /* Is it a valid three-byte code? */
5321 if (surrogatepass
5322 && (e - s) >= 3
5323 && (s[0] & 0xf0) == 0xe0
5324 && (s[1] & 0xc0) == 0x80
5325 && (s[2] & 0xc0) == 0x80)
5326 {
5327 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5328 s += 3;
5329 unicode[outpos++] = ch;
5330 }
5331 else {
5332 PyMem_RawFree(unicode );
5333 if (reason != NULL) {
5334 switch (ch) {
5335 case 0:
5336 *reason = "unexpected end of data";
5337 break;
5338 case 1:
5339 *reason = "invalid start byte";
5340 break;
5341 /* 2, 3, 4 */
5342 default:
5343 *reason = "invalid continuation byte";
5344 break;
5345 }
5346 }
5347 if (wlen != NULL) {
5348 *wlen = s - orig_s;
5349 }
5350 return -2;
5351 }
5352 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005353 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005354 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005355 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005356 if (wlen) {
5357 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005358 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005359 *wstr = unicode;
5360 return 0;
5361}
5362
Victor Stinner5f9cf232019-03-19 01:46:25 +01005363
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005364wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005365_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5366 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005367{
5368 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005369 int res = _Py_DecodeUTF8Ex(arg, arglen,
5370 &wstr, wlen,
5371 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005372 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005373 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5374 assert(res != -3);
5375 if (wlen) {
5376 *wlen = (size_t)res;
5377 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005378 return NULL;
5379 }
5380 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005381}
5382
Antoine Pitrouab868312009-01-10 15:40:25 +00005383
Victor Stinnere47e6982017-12-21 15:45:16 +01005384/* UTF-8 encoder using the surrogateescape error handler .
5385
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005386 On success, return 0 and write the newly allocated character string (use
5387 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005388
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005389 On encoding failure, return -2 and write the position of the invalid
5390 surrogate character into *error_pos (if error_pos is set) and the decoding
5391 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005392
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005393 On memory allocation failure, return -1. */
5394int
5395_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005396 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005397{
5398 const Py_ssize_t max_char_size = 4;
5399 Py_ssize_t len = wcslen(text);
5400
5401 assert(len >= 0);
5402
Victor Stinner3d4226a2018-08-29 22:21:32 +02005403 int surrogateescape = 0;
5404 int surrogatepass = 0;
5405 switch (errors)
5406 {
5407 case _Py_ERROR_STRICT:
5408 break;
5409 case _Py_ERROR_SURROGATEESCAPE:
5410 surrogateescape = 1;
5411 break;
5412 case _Py_ERROR_SURROGATEPASS:
5413 surrogatepass = 1;
5414 break;
5415 default:
5416 return -3;
5417 }
5418
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005419 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5420 return -1;
5421 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005422 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005423 if (raw_malloc) {
5424 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005425 }
5426 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005427 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005428 }
5429 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005430 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005431 }
5432
5433 char *p = bytes;
5434 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005435 for (i = 0; i < len; ) {
5436 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005437 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005438 i++;
5439#if Py_UNICODE_SIZE == 2
5440 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5441 && i < len
5442 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5443 {
5444 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5445 i++;
5446 }
5447#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005448
5449 if (ch < 0x80) {
5450 /* Encode ASCII */
5451 *p++ = (char) ch;
5452
5453 }
5454 else if (ch < 0x0800) {
5455 /* Encode Latin-1 */
5456 *p++ = (char)(0xc0 | (ch >> 6));
5457 *p++ = (char)(0x80 | (ch & 0x3f));
5458 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005459 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005460 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005461 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005462 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005463 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005464 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005465 if (reason != NULL) {
5466 *reason = "encoding error";
5467 }
5468 if (raw_malloc) {
5469 PyMem_RawFree(bytes);
5470 }
5471 else {
5472 PyMem_Free(bytes);
5473 }
5474 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005475 }
5476 *p++ = (char)(ch & 0xff);
5477 }
5478 else if (ch < 0x10000) {
5479 *p++ = (char)(0xe0 | (ch >> 12));
5480 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5481 *p++ = (char)(0x80 | (ch & 0x3f));
5482 }
5483 else { /* ch >= 0x10000 */
5484 assert(ch <= MAX_UNICODE);
5485 /* Encode UCS4 Unicode ordinals */
5486 *p++ = (char)(0xf0 | (ch >> 18));
5487 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5488 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5489 *p++ = (char)(0x80 | (ch & 0x3f));
5490 }
5491 }
5492 *p++ = '\0';
5493
5494 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005495 char *bytes2;
5496 if (raw_malloc) {
5497 bytes2 = PyMem_RawRealloc(bytes, final_size);
5498 }
5499 else {
5500 bytes2 = PyMem_Realloc(bytes, final_size);
5501 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005502 if (bytes2 == NULL) {
5503 if (error_pos != NULL) {
5504 *error_pos = (size_t)-1;
5505 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005506 if (raw_malloc) {
5507 PyMem_RawFree(bytes);
5508 }
5509 else {
5510 PyMem_Free(bytes);
5511 }
5512 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005513 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005514 *str = bytes2;
5515 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005516}
5517
5518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005519/* Primary internal function which creates utf8 encoded bytes objects.
5520
5521 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005522 and allocate exactly as much space needed at the end. Else allocate the
5523 maximum possible needed (4 result bytes per Unicode character), and return
5524 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005525*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005526static PyObject *
5527unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5528 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005530 if (!PyUnicode_Check(unicode)) {
5531 PyErr_BadArgument();
5532 return NULL;
5533 }
5534
5535 if (PyUnicode_READY(unicode) == -1)
5536 return NULL;
5537
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005538 if (PyUnicode_UTF8(unicode))
5539 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5540 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005541
Inada Naoki02a4d572020-02-27 13:48:59 +09005542 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005543 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005544 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5545
5546 _PyBytesWriter writer;
5547 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005548
Benjamin Petersonead6b532011-12-20 17:23:42 -06005549 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005550 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005551 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005552 case PyUnicode_1BYTE_KIND:
5553 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5554 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005555 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5556 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005557 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005558 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5559 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005560 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005561 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5562 break;
Tim Peters602f7402002-04-27 18:03:26 +00005563 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005564
5565 if (end == NULL) {
5566 _PyBytesWriter_Dealloc(&writer);
5567 return NULL;
5568 }
5569 return _PyBytesWriter_Finish(&writer, end);
5570}
5571
5572static int
5573unicode_fill_utf8(PyObject *unicode)
5574{
5575 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5576 assert(!PyUnicode_IS_ASCII(unicode));
5577
5578 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005579 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005580 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5581
5582 _PyBytesWriter writer;
5583 char *end;
5584
5585 switch (kind) {
5586 default:
5587 Py_UNREACHABLE();
5588 case PyUnicode_1BYTE_KIND:
5589 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5590 _Py_ERROR_STRICT, NULL);
5591 break;
5592 case PyUnicode_2BYTE_KIND:
5593 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5594 _Py_ERROR_STRICT, NULL);
5595 break;
5596 case PyUnicode_4BYTE_KIND:
5597 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5598 _Py_ERROR_STRICT, NULL);
5599 break;
5600 }
5601 if (end == NULL) {
5602 _PyBytesWriter_Dealloc(&writer);
5603 return -1;
5604 }
5605
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005606 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005607 PyBytes_AS_STRING(writer.buffer);
5608 Py_ssize_t len = end - start;
5609
5610 char *cache = PyObject_MALLOC(len + 1);
5611 if (cache == NULL) {
5612 _PyBytesWriter_Dealloc(&writer);
5613 PyErr_NoMemory();
5614 return -1;
5615 }
5616 _PyUnicode_UTF8(unicode) = cache;
5617 _PyUnicode_UTF8_LENGTH(unicode) = len;
5618 memcpy(cache, start, len);
5619 cache[len] = '\0';
5620 _PyBytesWriter_Dealloc(&writer);
5621 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622}
5623
Alexander Belopolsky40018472011-02-26 01:02:56 +00005624PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005625_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5626{
5627 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5628}
5629
5630
5631PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005632PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5633 Py_ssize_t size,
5634 const char *errors)
5635{
5636 PyObject *v, *unicode;
5637
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005638 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005639 if (unicode == NULL)
5640 return NULL;
5641 v = _PyUnicode_AsUTF8String(unicode, errors);
5642 Py_DECREF(unicode);
5643 return v;
5644}
5645
5646PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005647PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005649 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650}
5651
Walter Dörwald41980ca2007-08-16 21:55:45 +00005652/* --- UTF-32 Codec ------------------------------------------------------- */
5653
5654PyObject *
5655PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 Py_ssize_t size,
5657 const char *errors,
5658 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005659{
5660 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5661}
5662
5663PyObject *
5664PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 Py_ssize_t size,
5666 const char *errors,
5667 int *byteorder,
5668 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005669{
5670 const char *starts = s;
5671 Py_ssize_t startinpos;
5672 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005673 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005674 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005675 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005676 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005677 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005678 PyObject *errorHandler = NULL;
5679 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005680
Andy Lestere6be9b52020-02-11 20:28:35 -06005681 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005682 e = q + size;
5683
5684 if (byteorder)
5685 bo = *byteorder;
5686
5687 /* Check for BOM marks (U+FEFF) in the input and adjust current
5688 byte order setting accordingly. In native mode, the leading BOM
5689 mark is skipped, in all other modes, it is copied to the output
5690 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005691 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005692 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005693 if (bom == 0x0000FEFF) {
5694 bo = -1;
5695 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005697 else if (bom == 0xFFFE0000) {
5698 bo = 1;
5699 q += 4;
5700 }
5701 if (byteorder)
5702 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005703 }
5704
Victor Stinnere64322e2012-10-30 23:12:47 +01005705 if (q == e) {
5706 if (consumed)
5707 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005708 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005709 }
5710
Victor Stinnere64322e2012-10-30 23:12:47 +01005711#ifdef WORDS_BIGENDIAN
5712 le = bo < 0;
5713#else
5714 le = bo <= 0;
5715#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005716 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005717
Victor Stinner8f674cc2013-04-17 23:02:17 +02005718 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005719 writer.min_length = (e - q + 3) / 4;
5720 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005721 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005722
Victor Stinnere64322e2012-10-30 23:12:47 +01005723 while (1) {
5724 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005725 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005726
Victor Stinnere64322e2012-10-30 23:12:47 +01005727 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005728 enum PyUnicode_Kind kind = writer.kind;
5729 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005730 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005731 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005732 if (le) {
5733 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005734 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005735 if (ch > maxch)
5736 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005737 if (kind != PyUnicode_1BYTE_KIND &&
5738 Py_UNICODE_IS_SURROGATE(ch))
5739 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005740 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005741 q += 4;
5742 } while (q <= last);
5743 }
5744 else {
5745 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005746 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005747 if (ch > maxch)
5748 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005749 if (kind != PyUnicode_1BYTE_KIND &&
5750 Py_UNICODE_IS_SURROGATE(ch))
5751 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005752 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005753 q += 4;
5754 } while (q <= last);
5755 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005756 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005757 }
5758
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005759 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005760 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005761 startinpos = ((const char *)q) - starts;
5762 endinpos = startinpos + 4;
5763 }
5764 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005765 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005767 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005769 startinpos = ((const char *)q) - starts;
5770 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005772 else {
5773 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005774 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005775 goto onError;
5776 q += 4;
5777 continue;
5778 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005779 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005780 startinpos = ((const char *)q) - starts;
5781 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005783
5784 /* The remaining input chars are ignored if the callback
5785 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005786 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005788 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005790 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005792 }
5793
Walter Dörwald41980ca2007-08-16 21:55:45 +00005794 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005796
Walter Dörwald41980ca2007-08-16 21:55:45 +00005797 Py_XDECREF(errorHandler);
5798 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005799 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005800
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005802 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005803 Py_XDECREF(errorHandler);
5804 Py_XDECREF(exc);
5805 return NULL;
5806}
5807
5808PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005809_PyUnicode_EncodeUTF32(PyObject *str,
5810 const char *errors,
5811 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005812{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005813 enum PyUnicode_Kind kind;
5814 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005815 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005816 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005817 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005818#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005819 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005820#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005821 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005822#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005823 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005824 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005825 PyObject *errorHandler = NULL;
5826 PyObject *exc = NULL;
5827 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005828
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005829 if (!PyUnicode_Check(str)) {
5830 PyErr_BadArgument();
5831 return NULL;
5832 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005833 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005834 return NULL;
5835 kind = PyUnicode_KIND(str);
5836 data = PyUnicode_DATA(str);
5837 len = PyUnicode_GET_LENGTH(str);
5838
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005839 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005840 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005841 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005842 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005843 if (v == NULL)
5844 return NULL;
5845
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005846 /* output buffer is 4-bytes aligned */
5847 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005848 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005849 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005850 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005851 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005852 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005853
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005854 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005855 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005856 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005857 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005858 else
5859 encoding = "utf-32";
5860
5861 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005862 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5863 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005864 }
5865
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005866 pos = 0;
5867 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005868 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005869
5870 if (kind == PyUnicode_2BYTE_KIND) {
5871 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5872 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005873 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005874 else {
5875 assert(kind == PyUnicode_4BYTE_KIND);
5876 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5877 &out, native_ordering);
5878 }
5879 if (pos == len)
5880 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005881
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005882 rep = unicode_encode_call_errorhandler(
5883 errors, &errorHandler,
5884 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005885 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005886 if (!rep)
5887 goto error;
5888
5889 if (PyBytes_Check(rep)) {
5890 repsize = PyBytes_GET_SIZE(rep);
5891 if (repsize & 3) {
5892 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005893 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005894 "surrogates not allowed");
5895 goto error;
5896 }
5897 moreunits = repsize / 4;
5898 }
5899 else {
5900 assert(PyUnicode_Check(rep));
5901 if (PyUnicode_READY(rep) < 0)
5902 goto error;
5903 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5904 if (!PyUnicode_IS_ASCII(rep)) {
5905 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005906 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005907 "surrogates not allowed");
5908 goto error;
5909 }
5910 }
5911
5912 /* four bytes are reserved for each surrogate */
5913 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005914 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005915 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005916 /* integer overflow */
5917 PyErr_NoMemory();
5918 goto error;
5919 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005920 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005921 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005922 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005923 }
5924
5925 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005926 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005927 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005928 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005929 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005930 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5931 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005932 }
5933
5934 Py_CLEAR(rep);
5935 }
5936
5937 /* Cut back to size actually needed. This is necessary for, for example,
5938 encoding of a string containing isolated surrogates and the 'ignore'
5939 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005940 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005941 if (nsize != PyBytes_GET_SIZE(v))
5942 _PyBytes_Resize(&v, nsize);
5943 Py_XDECREF(errorHandler);
5944 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005945 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005946 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005947 error:
5948 Py_XDECREF(rep);
5949 Py_XDECREF(errorHandler);
5950 Py_XDECREF(exc);
5951 Py_XDECREF(v);
5952 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005953}
5954
Alexander Belopolsky40018472011-02-26 01:02:56 +00005955PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005956PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5957 Py_ssize_t size,
5958 const char *errors,
5959 int byteorder)
5960{
5961 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005962 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005963 if (tmp == NULL)
5964 return NULL;
5965 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5966 Py_DECREF(tmp);
5967 return result;
5968}
5969
5970PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005971PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005972{
Victor Stinnerb960b342011-11-20 19:12:52 +01005973 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005974}
5975
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976/* --- UTF-16 Codec ------------------------------------------------------- */
5977
Tim Peters772747b2001-08-09 22:21:55 +00005978PyObject *
5979PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 Py_ssize_t size,
5981 const char *errors,
5982 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983{
Walter Dörwald69652032004-09-07 20:24:22 +00005984 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5985}
5986
5987PyObject *
5988PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 Py_ssize_t size,
5990 const char *errors,
5991 int *byteorder,
5992 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005993{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005995 Py_ssize_t startinpos;
5996 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005997 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005998 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005999 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006000 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00006001 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002 PyObject *errorHandler = NULL;
6003 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006004 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005
Andy Lestere6be9b52020-02-11 20:28:35 -06006006 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006007 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008
6009 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00006010 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006012 /* Check for BOM marks (U+FEFF) in the input and adjust current
6013 byte order setting accordingly. In native mode, the leading BOM
6014 mark is skipped, in all other modes, it is copied to the output
6015 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006016 if (bo == 0 && size >= 2) {
6017 const Py_UCS4 bom = (q[1] << 8) | q[0];
6018 if (bom == 0xFEFF) {
6019 q += 2;
6020 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006022 else if (bom == 0xFFFE) {
6023 q += 2;
6024 bo = 1;
6025 }
6026 if (byteorder)
6027 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029
Antoine Pitrou63065d72012-05-15 23:48:04 +02006030 if (q == e) {
6031 if (consumed)
6032 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006033 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006034 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006035
Christian Heimes743e0cd2012-10-17 23:52:17 +02006036#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006037 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006038 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006039#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006040 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006041 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006042#endif
Tim Peters772747b2001-08-09 22:21:55 +00006043
Antoine Pitrou63065d72012-05-15 23:48:04 +02006044 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006045 character count normally. Error handler will take care of
6046 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006047 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006048 writer.min_length = (e - q + 1) / 2;
6049 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006050 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006051
Antoine Pitrou63065d72012-05-15 23:48:04 +02006052 while (1) {
6053 Py_UCS4 ch = 0;
6054 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006055 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006056 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006057 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006058 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006059 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006060 native_ordering);
6061 else
6062 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006063 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006064 native_ordering);
6065 } else if (kind == PyUnicode_2BYTE_KIND) {
6066 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006067 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006068 native_ordering);
6069 } else {
6070 assert(kind == PyUnicode_4BYTE_KIND);
6071 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006072 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006073 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006074 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006075 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006076
Antoine Pitrou63065d72012-05-15 23:48:04 +02006077 switch (ch)
6078 {
6079 case 0:
6080 /* remaining byte at the end? (size should be even) */
6081 if (q == e || consumed)
6082 goto End;
6083 errmsg = "truncated data";
6084 startinpos = ((const char *)q) - starts;
6085 endinpos = ((const char *)e) - starts;
6086 break;
6087 /* The remaining input chars are ignored if the callback
6088 chooses to skip the input */
6089 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006090 q -= 2;
6091 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006092 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006093 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006094 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006095 endinpos = ((const char *)e) - starts;
6096 break;
6097 case 2:
6098 errmsg = "illegal encoding";
6099 startinpos = ((const char *)q) - 2 - starts;
6100 endinpos = startinpos + 2;
6101 break;
6102 case 3:
6103 errmsg = "illegal UTF-16 surrogate";
6104 startinpos = ((const char *)q) - 4 - starts;
6105 endinpos = startinpos + 2;
6106 break;
6107 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006108 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006109 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 continue;
6111 }
6112
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006113 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006114 errors,
6115 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006116 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006117 &starts,
6118 (const char **)&e,
6119 &startinpos,
6120 &endinpos,
6121 &exc,
6122 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006123 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 }
6126
Antoine Pitrou63065d72012-05-15 23:48:04 +02006127End:
Walter Dörwald69652032004-09-07 20:24:22 +00006128 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006130
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 Py_XDECREF(errorHandler);
6132 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006133 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006136 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006137 Py_XDECREF(errorHandler);
6138 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 return NULL;
6140}
6141
Tim Peters772747b2001-08-09 22:21:55 +00006142PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006143_PyUnicode_EncodeUTF16(PyObject *str,
6144 const char *errors,
6145 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006147 enum PyUnicode_Kind kind;
6148 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006149 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006150 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006151 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006152 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006153#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006154 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006155#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006156 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006157#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006158 const char *encoding;
6159 Py_ssize_t nsize, pos;
6160 PyObject *errorHandler = NULL;
6161 PyObject *exc = NULL;
6162 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006163
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006164 if (!PyUnicode_Check(str)) {
6165 PyErr_BadArgument();
6166 return NULL;
6167 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006168 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 return NULL;
6170 kind = PyUnicode_KIND(str);
6171 data = PyUnicode_DATA(str);
6172 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006173
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006175 if (kind == PyUnicode_4BYTE_KIND) {
6176 const Py_UCS4 *in = (const Py_UCS4 *)data;
6177 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006178 while (in < end) {
6179 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006181 }
6182 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006183 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006184 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006186 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006187 nsize = len + pairs + (byteorder == 0);
6188 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006189 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006191 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006193 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006194 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006195 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006196 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006197 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006198 }
6199 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006200 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006201 }
Tim Peters772747b2001-08-09 22:21:55 +00006202
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006203 if (kind == PyUnicode_1BYTE_KIND) {
6204 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6205 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006206 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006207
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006208 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006209 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006210 }
6211 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006212 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006213 }
6214 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006215 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006216 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006217
6218 pos = 0;
6219 while (pos < len) {
6220 Py_ssize_t repsize, moreunits;
6221
6222 if (kind == PyUnicode_2BYTE_KIND) {
6223 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6224 &out, native_ordering);
6225 }
6226 else {
6227 assert(kind == PyUnicode_4BYTE_KIND);
6228 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6229 &out, native_ordering);
6230 }
6231 if (pos == len)
6232 break;
6233
6234 rep = unicode_encode_call_errorhandler(
6235 errors, &errorHandler,
6236 encoding, "surrogates not allowed",
6237 str, &exc, pos, pos + 1, &pos);
6238 if (!rep)
6239 goto error;
6240
6241 if (PyBytes_Check(rep)) {
6242 repsize = PyBytes_GET_SIZE(rep);
6243 if (repsize & 1) {
6244 raise_encode_exception(&exc, encoding,
6245 str, pos - 1, pos,
6246 "surrogates not allowed");
6247 goto error;
6248 }
6249 moreunits = repsize / 2;
6250 }
6251 else {
6252 assert(PyUnicode_Check(rep));
6253 if (PyUnicode_READY(rep) < 0)
6254 goto error;
6255 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6256 if (!PyUnicode_IS_ASCII(rep)) {
6257 raise_encode_exception(&exc, encoding,
6258 str, pos - 1, pos,
6259 "surrogates not allowed");
6260 goto error;
6261 }
6262 }
6263
6264 /* two bytes are reserved for each surrogate */
6265 if (moreunits > 1) {
6266 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006267 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006268 /* integer overflow */
6269 PyErr_NoMemory();
6270 goto error;
6271 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006272 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006273 goto error;
6274 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6275 }
6276
6277 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006278 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006279 out += moreunits;
6280 } else /* rep is unicode */ {
6281 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6282 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6283 &out, native_ordering);
6284 }
6285
6286 Py_CLEAR(rep);
6287 }
6288
6289 /* Cut back to size actually needed. This is necessary for, for example,
6290 encoding of a string containing isolated surrogates and the 'ignore' handler
6291 is used. */
6292 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6293 if (nsize != PyBytes_GET_SIZE(v))
6294 _PyBytes_Resize(&v, nsize);
6295 Py_XDECREF(errorHandler);
6296 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006297 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006298 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006299 error:
6300 Py_XDECREF(rep);
6301 Py_XDECREF(errorHandler);
6302 Py_XDECREF(exc);
6303 Py_XDECREF(v);
6304 return NULL;
6305#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306}
6307
Alexander Belopolsky40018472011-02-26 01:02:56 +00006308PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006309PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6310 Py_ssize_t size,
6311 const char *errors,
6312 int byteorder)
6313{
6314 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006315 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006316 if (tmp == NULL)
6317 return NULL;
6318 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6319 Py_DECREF(tmp);
6320 return result;
6321}
6322
6323PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006324PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006326 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327}
6328
6329/* --- Unicode Escape Codec ----------------------------------------------- */
6330
/* Name-lookup C API used for \N{...} escapes.  Starts out NULL and is
   filled in by _PyUnicode_DecodeUnicodeEscape() via PyCapsule_Import() the
   first time a \N escape is decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006332
/* Decode `size` bytes starting at `s` as Unicode-Escape text.

   s      -- input bytes.
   size   -- number of input bytes; 0 yields the empty string.
   errors -- error handler name, passed to
             unicode_decode_call_errorhandler_writer() for malformed
             escapes.
   first_invalid_escape -- output parameter (dereferenced unconditionally).
             Set to NULL on entry, then to the address of the character
             following the backslash of the first unrecognised escape
             sequence, if any.

   Returns the result of _PyUnicodeWriter_Finish(), or NULL with an
   exception set on error. */
PyObject *
_PyUnicode_DecodeUnicodeEscape(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               const char **first_invalid_escape)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

/* Store an ASCII character directly; the asserts document that the caller
   guarantees both the value range and the remaining capacity. */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

/* Store any character: written in place when it fits the writer's current
   maxchar, otherwise through _PyUnicodeWriter_WriteCharInline(); jumps to
   onError if that call fails. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* Index of the backslash itself, for error reporting. */
        startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto error;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* backslash-newline produces no output; the remaining cases
               here are the single-character escapes */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes: one to three octal digits */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* Consume up to `count` hex digits; `count` reaches 0 only if
               all of them were present. */
            for (ch = 0; count && s < end; ++s, --count) {
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    break;
                }
            }
            if (count) {
                /* Too few hex digits: report the "truncated" message. */
                goto error;
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            if (ucnhash_CAPI == NULL) {
                /* load the unicode data module */
                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                PyUnicodeData_CAPSULE_NAME, 1);
                if (ucnhash_CAPI == NULL) {
                    PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                        );
                    goto onError;
                }
            }

            message = "malformed \\N character escape";
            if (s < end && *s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                namelen = s - start;
                /* Require a non-empty name and an actual closing brace. */
                if (namelen && s < end) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    if (namelen <= INT_MAX &&
                        ucnhash_CAPI->getcode(NULL, start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            goto error;

        default:
            /* Unrecognised escape: remember the first one for the caller
               and copy the backslash and the character through unchanged. */
            if (*first_invalid_escape == NULL) {
                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

      error:
        /* Hand input range [startinpos, endinpos) to the error handler.
           min_length is set to the remaining input length plus what has
           already been written. */
        endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6557
Eric V. Smith42454af2016-10-31 09:22:08 -04006558PyObject *
6559PyUnicode_DecodeUnicodeEscape(const char *s,
6560 Py_ssize_t size,
6561 const char *errors)
6562{
6563 const char *first_invalid_escape;
6564 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6565 &first_invalid_escape);
6566 if (result == NULL)
6567 return NULL;
6568 if (first_invalid_escape != NULL) {
6569 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6570 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006571 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006572 Py_DECREF(result);
6573 return NULL;
6574 }
6575 }
6576 return result;
6577}
6578
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006579/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580
Alexander Belopolsky40018472011-02-26 01:02:56 +00006581PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006582PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006584 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006585 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006587 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006588 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006589 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590
Ezio Melottie7f90372012-10-05 03:33:31 +03006591 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006592 escape.
6593
Ezio Melottie7f90372012-10-05 03:33:31 +03006594 For UCS1 strings it's '\xxx', 4 bytes per source character.
6595 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6596 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006597 */
6598
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006599 if (!PyUnicode_Check(unicode)) {
6600 PyErr_BadArgument();
6601 return NULL;
6602 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006603 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006604 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006605 }
Victor Stinner358af132015-10-12 22:36:57 +02006606
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006607 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006608 if (len == 0) {
6609 return PyBytes_FromStringAndSize(NULL, 0);
6610 }
6611
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006612 kind = PyUnicode_KIND(unicode);
6613 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006614 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6615 bytes, and 1 byte characters 4. */
6616 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006617 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006618 return PyErr_NoMemory();
6619 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006620 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006621 if (repr == NULL) {
6622 return NULL;
6623 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006624
Victor Stinner62ec3312016-09-06 17:04:34 -07006625 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006626 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006627 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006628
Victor Stinner62ec3312016-09-06 17:04:34 -07006629 /* U+0000-U+00ff range */
6630 if (ch < 0x100) {
6631 if (ch >= ' ' && ch < 127) {
6632 if (ch != '\\') {
6633 /* Copy printable US ASCII as-is */
6634 *p++ = (char) ch;
6635 }
6636 /* Escape backslashes */
6637 else {
6638 *p++ = '\\';
6639 *p++ = '\\';
6640 }
6641 }
Victor Stinner358af132015-10-12 22:36:57 +02006642
Victor Stinner62ec3312016-09-06 17:04:34 -07006643 /* Map special whitespace to '\t', \n', '\r' */
6644 else if (ch == '\t') {
6645 *p++ = '\\';
6646 *p++ = 't';
6647 }
6648 else if (ch == '\n') {
6649 *p++ = '\\';
6650 *p++ = 'n';
6651 }
6652 else if (ch == '\r') {
6653 *p++ = '\\';
6654 *p++ = 'r';
6655 }
6656
6657 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6658 else {
6659 *p++ = '\\';
6660 *p++ = 'x';
6661 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6662 *p++ = Py_hexdigits[ch & 0x000F];
6663 }
Tim Petersced69f82003-09-16 20:30:58 +00006664 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006665 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006666 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 *p++ = '\\';
6668 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006669 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6670 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6671 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6672 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006674 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6675 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006676
Victor Stinner62ec3312016-09-06 17:04:34 -07006677 /* Make sure that the first two digits are zero */
6678 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006679 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006680 *p++ = 'U';
6681 *p++ = '0';
6682 *p++ = '0';
6683 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6684 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6685 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6686 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6687 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6688 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691
Victor Stinner62ec3312016-09-06 17:04:34 -07006692 assert(p - PyBytes_AS_STRING(repr) > 0);
6693 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6694 return NULL;
6695 }
6696 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697}
6698
Alexander Belopolsky40018472011-02-26 01:02:56 +00006699PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006700PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6701 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006703 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006704 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006705 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006707 }
6708
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006709 result = PyUnicode_AsUnicodeEscapeString(tmp);
6710 Py_DECREF(tmp);
6711 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712}
6713
6714/* --- Raw Unicode Escape Codec ------------------------------------------- */
6715
Alexander Belopolsky40018472011-02-26 01:02:56 +00006716PyObject *
6717PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006718 Py_ssize_t size,
6719 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006722 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724 PyObject *errorHandler = NULL;
6725 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006726
Victor Stinner62ec3312016-09-06 17:04:34 -07006727 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006728 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006729 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006730
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 /* Escaped strings will always be longer than the resulting
6732 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733 length after conversion to the true value. (But decoding error
6734 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006735 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006736 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006737 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6738 goto onError;
6739 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006740
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 end = s + size;
6742 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006743 unsigned char c = (unsigned char) *s++;
6744 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006745 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006746 Py_ssize_t startinpos;
6747 Py_ssize_t endinpos;
6748 const char *message;
6749
6750#define WRITE_CHAR(ch) \
6751 do { \
6752 if (ch <= writer.maxchar) { \
6753 assert(writer.pos < writer.size); \
6754 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6755 } \
6756 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6757 goto onError; \
6758 } \
6759 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006762 if (c != '\\' || s >= end) {
6763 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006765 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006766
Victor Stinner62ec3312016-09-06 17:04:34 -07006767 c = (unsigned char) *s++;
6768 if (c == 'u') {
6769 count = 4;
6770 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006772 else if (c == 'U') {
6773 count = 8;
6774 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006775 }
6776 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006777 assert(writer.pos < writer.size);
6778 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6779 WRITE_CHAR(c);
6780 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006781 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006782 startinpos = s - starts - 2;
6783
6784 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6785 for (ch = 0; count && s < end; ++s, --count) {
6786 c = (unsigned char)*s;
6787 ch <<= 4;
6788 if (c >= '0' && c <= '9') {
6789 ch += c - '0';
6790 }
6791 else if (c >= 'a' && c <= 'f') {
6792 ch += c - ('a' - 10);
6793 }
6794 else if (c >= 'A' && c <= 'F') {
6795 ch += c - ('A' - 10);
6796 }
6797 else {
6798 break;
6799 }
6800 }
6801 if (!count) {
6802 if (ch <= MAX_UNICODE) {
6803 WRITE_CHAR(ch);
6804 continue;
6805 }
6806 message = "\\Uxxxxxxxx out of range";
6807 }
6808
6809 endinpos = s-starts;
6810 writer.min_length = end - s + writer.pos;
6811 if (unicode_decode_call_errorhandler_writer(
6812 errors, &errorHandler,
6813 "rawunicodeescape", message,
6814 &starts, &end, &startinpos, &endinpos, &exc, &s,
6815 &writer)) {
6816 goto onError;
6817 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006818 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006819
6820#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822 Py_XDECREF(errorHandler);
6823 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006824 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006825
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006827 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006828 Py_XDECREF(errorHandler);
6829 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006831
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832}
6833
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006834
Alexander Belopolsky40018472011-02-26 01:02:56 +00006835PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006836PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837{
Victor Stinner62ec3312016-09-06 17:04:34 -07006838 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006840 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006841 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006842 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006843 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006845 if (!PyUnicode_Check(unicode)) {
6846 PyErr_BadArgument();
6847 return NULL;
6848 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006849 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006850 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006851 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006852 kind = PyUnicode_KIND(unicode);
6853 data = PyUnicode_DATA(unicode);
6854 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006855 if (kind == PyUnicode_1BYTE_KIND) {
6856 return PyBytes_FromStringAndSize(data, len);
6857 }
Victor Stinner0e368262011-11-10 20:12:49 +01006858
Victor Stinner62ec3312016-09-06 17:04:34 -07006859 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6860 bytes, and 1 byte characters 4. */
6861 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006862
Victor Stinner62ec3312016-09-06 17:04:34 -07006863 if (len > PY_SSIZE_T_MAX / expandsize) {
6864 return PyErr_NoMemory();
6865 }
6866 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6867 if (repr == NULL) {
6868 return NULL;
6869 }
6870 if (len == 0) {
6871 return repr;
6872 }
6873
6874 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006875 for (pos = 0; pos < len; pos++) {
6876 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006877
Victor Stinner62ec3312016-09-06 17:04:34 -07006878 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6879 if (ch < 0x100) {
6880 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006881 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006882 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006883 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 *p++ = '\\';
6885 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006886 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6887 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6888 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6889 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006891 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6892 else {
6893 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6894 *p++ = '\\';
6895 *p++ = 'U';
6896 *p++ = '0';
6897 *p++ = '0';
6898 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6899 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6900 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6901 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6902 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6903 *p++ = Py_hexdigits[ch & 15];
6904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006906
Victor Stinner62ec3312016-09-06 17:04:34 -07006907 assert(p > PyBytes_AS_STRING(repr));
6908 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6909 return NULL;
6910 }
6911 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912}
6913
Alexander Belopolsky40018472011-02-26 01:02:56 +00006914PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006915PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6916 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006918 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006919 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006920 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006921 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006922 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6923 Py_DECREF(tmp);
6924 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925}
6926
6927/* --- Latin-1 Codec ------------------------------------------------------ */
6928
Alexander Belopolsky40018472011-02-26 01:02:56 +00006929PyObject *
6930PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006931 Py_ssize_t size,
6932 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006935 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936}
6937
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006938/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006939static void
6940make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006941 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006942 PyObject *unicode,
6943 Py_ssize_t startpos, Py_ssize_t endpos,
6944 const char *reason)
6945{
6946 if (*exceptionObject == NULL) {
6947 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006948 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006949 encoding, unicode, startpos, endpos, reason);
6950 }
6951 else {
6952 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6953 goto onError;
6954 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6955 goto onError;
6956 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6957 goto onError;
6958 return;
6959 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006960 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006961 }
6962}
6963
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006964/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006965static void
6966raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006967 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006968 PyObject *unicode,
6969 Py_ssize_t startpos, Py_ssize_t endpos,
6970 const char *reason)
6971{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006972 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006973 encoding, unicode, startpos, endpos, reason);
6974 if (*exceptionObject != NULL)
6975 PyCodec_StrictErrors(*exceptionObject);
6976}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006977
6978/* error handling callback helper:
6979 build arguments, call the callback and check the arguments,
6980 put the result into newpos and return the replacement string, which
6981 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006982static PyObject *
6983unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006984 PyObject **errorHandler,
6985 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006986 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006987 Py_ssize_t startpos, Py_ssize_t endpos,
6988 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006989{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006990 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006991 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006992 PyObject *restuple;
6993 PyObject *resunicode;
6994
6995 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006996 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006997 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006998 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006999 }
7000
Benjamin Petersonbac79492012-01-14 13:34:47 -05007001 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007002 return NULL;
7003 len = PyUnicode_GET_LENGTH(unicode);
7004
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007005 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007006 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007007 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007009
Petr Viktorinffd97532020-02-11 17:46:57 +01007010 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007011 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007014 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 Py_DECREF(restuple);
7016 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007017 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007018 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 &resunicode, newpos)) {
7020 Py_DECREF(restuple);
7021 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007022 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007023 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7024 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7025 Py_DECREF(restuple);
7026 return NULL;
7027 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007028 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007029 *newpos = len + *newpos;
7030 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007031 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 Py_DECREF(restuple);
7033 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007034 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007035 Py_INCREF(resunicode);
7036 Py_DECREF(restuple);
7037 return resunicode;
7038}
7039
Alexander Belopolsky40018472011-02-26 01:02:56 +00007040static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007041unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007042 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007043 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007044{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007045 /* input state */
7046 Py_ssize_t pos=0, size;
7047 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007048 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007049 /* pointer into the output */
7050 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007051 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7052 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007053 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007054 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007055 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007056 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007057 /* output object */
7058 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007059
Benjamin Petersonbac79492012-01-14 13:34:47 -05007060 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007061 return NULL;
7062 size = PyUnicode_GET_LENGTH(unicode);
7063 kind = PyUnicode_KIND(unicode);
7064 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007065 /* allocate enough for a simple encoding without
7066 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007067 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007068 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007069
7070 _PyBytesWriter_Init(&writer);
7071 str = _PyBytesWriter_Alloc(&writer, size);
7072 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007073 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007074
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007075 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007076 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007077
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007079 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007080 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007081 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007082 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007083 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007085 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007087 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007088 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007090
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007091 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007093
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007094 /* Only overallocate the buffer if it's not the last write */
7095 writer.overallocate = (collend < size);
7096
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007098 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007099 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007100
7101 switch (error_handler) {
7102 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007103 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007105
7106 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007107 memset(str, '?', collend - collstart);
7108 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007109 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007110 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007111 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007112 break;
Victor Stinner50149202015-09-22 00:26:54 +02007113
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007114 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007115 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007116 writer.min_size -= (collend - collstart);
7117 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007118 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007119 if (str == NULL)
7120 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007121 pos = collend;
7122 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007123
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007124 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007125 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007126 writer.min_size -= (collend - collstart);
7127 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007128 unicode, collstart, collend);
7129 if (str == NULL)
7130 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007131 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 break;
Victor Stinner50149202015-09-22 00:26:54 +02007133
Victor Stinnerc3713e92015-09-29 12:32:13 +02007134 case _Py_ERROR_SURROGATEESCAPE:
7135 for (i = collstart; i < collend; ++i) {
7136 ch = PyUnicode_READ(kind, data, i);
7137 if (ch < 0xdc80 || 0xdcff < ch) {
7138 /* Not a UTF-8b surrogate */
7139 break;
7140 }
7141 *str++ = (char)(ch - 0xdc00);
7142 ++pos;
7143 }
7144 if (i >= collend)
7145 break;
7146 collstart = pos;
7147 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007148 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007149
Benjamin Peterson29060642009-01-31 22:14:21 +00007150 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007151 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7152 encoding, reason, unicode, &exc,
7153 collstart, collend, &newpos);
7154 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007155 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007156
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007157 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007158 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007159
Victor Stinner6bd525b2015-10-09 13:10:05 +02007160 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007161 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007162 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007163 PyBytes_AS_STRING(rep),
7164 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007165 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007166 else {
7167 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007168
Victor Stinner6bd525b2015-10-09 13:10:05 +02007169 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007171
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007172 if (limit == 256 ?
7173 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7174 !PyUnicode_IS_ASCII(rep))
7175 {
7176 /* Not all characters are smaller than limit */
7177 raise_encode_exception(&exc, encoding, unicode,
7178 collstart, collend, reason);
7179 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007180 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007181 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7182 str = _PyBytesWriter_WriteBytes(&writer, str,
7183 PyUnicode_DATA(rep),
7184 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007185 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007186 if (str == NULL)
7187 goto onError;
7188
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007189 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007190 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007191 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007192
7193 /* If overallocation was disabled, ensure that it was the last
7194 write. Otherwise, we missed an optimization */
7195 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007196 }
7197 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007198
Victor Stinner50149202015-09-22 00:26:54 +02007199 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007200 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007201 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007202
7203 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007204 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007205 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007206 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007207 Py_XDECREF(exc);
7208 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007209}
7210
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007211/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007212PyObject *
7213PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007214 Py_ssize_t size,
7215 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007217 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007218 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007219 if (unicode == NULL)
7220 return NULL;
7221 result = unicode_encode_ucs1(unicode, errors, 256);
7222 Py_DECREF(unicode);
7223 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224}
7225
Alexander Belopolsky40018472011-02-26 01:02:56 +00007226PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007227_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228{
7229 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 PyErr_BadArgument();
7231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007233 if (PyUnicode_READY(unicode) == -1)
7234 return NULL;
7235 /* Fast path: if it is a one-byte string, construct
7236 bytes object directly. */
7237 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7238 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7239 PyUnicode_GET_LENGTH(unicode));
7240 /* Non-Latin-1 characters present. Defer to above function to
7241 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007242 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007243}
7244
7245PyObject*
7246PyUnicode_AsLatin1String(PyObject *unicode)
7247{
7248 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249}
7250
7251/* --- 7-bit ASCII Codec -------------------------------------------------- */
7252
Alexander Belopolsky40018472011-02-26 01:02:56 +00007253PyObject *
7254PyUnicode_DecodeASCII(const char *s,
7255 Py_ssize_t size,
7256 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007258 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007259 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007260 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007261 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007262 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007263
Guido van Rossumd57fd912000-03-10 22:53:23 +00007264 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007265 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007266
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007268 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007269 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007271
Inada Naoki770847a2019-06-24 12:30:24 +09007272 // Shortcut for simple case
7273 PyObject *u = PyUnicode_New(size, 127);
7274 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007275 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007276 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007277 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007278 if (outpos == size) {
7279 return u;
7280 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007281
Inada Naoki770847a2019-06-24 12:30:24 +09007282 _PyUnicodeWriter writer;
7283 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007284 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007285
Inada Naoki770847a2019-06-24 12:30:24 +09007286 s += outpos;
7287 int kind = writer.kind;
7288 void *data = writer.data;
7289 Py_ssize_t startinpos, endinpos;
7290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007291 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007292 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007294 PyUnicode_WRITE(kind, data, writer.pos, c);
7295 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007296 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007297 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007299
7300 /* byte outsize range 0x00..0x7f: call the error handler */
7301
7302 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007303 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007304
7305 switch (error_handler)
7306 {
7307 case _Py_ERROR_REPLACE:
7308 case _Py_ERROR_SURROGATEESCAPE:
7309 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007310 but we may switch to UCS2 at the first write */
7311 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7312 goto onError;
7313 kind = writer.kind;
7314 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007315
7316 if (error_handler == _Py_ERROR_REPLACE)
7317 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7318 else
7319 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7320 writer.pos++;
7321 ++s;
7322 break;
7323
7324 case _Py_ERROR_IGNORE:
7325 ++s;
7326 break;
7327
7328 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007329 startinpos = s-starts;
7330 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007331 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007332 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007333 "ascii", "ordinal not in range(128)",
7334 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007335 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007337 kind = writer.kind;
7338 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007340 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007341 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007342 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007343 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007344
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007346 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007347 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007348 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 return NULL;
7350}
7351
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007352/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007353PyObject *
7354PyUnicode_EncodeASCII(const Py_UNICODE *p,
7355 Py_ssize_t size,
7356 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007358 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007359 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007360 if (unicode == NULL)
7361 return NULL;
7362 result = unicode_encode_ucs1(unicode, errors, 128);
7363 Py_DECREF(unicode);
7364 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365}
7366
Alexander Belopolsky40018472011-02-26 01:02:56 +00007367PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007368_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369{
7370 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 PyErr_BadArgument();
7372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007374 if (PyUnicode_READY(unicode) == -1)
7375 return NULL;
7376 /* Fast path: if it is an ASCII-only string, construct bytes object
7377 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007378 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007379 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7380 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007381 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007382}
7383
7384PyObject *
7385PyUnicode_AsASCIIString(PyObject *unicode)
7386{
7387 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388}
7389
Steve Dowercc16be82016-09-08 10:35:16 -07007390#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007391
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007392/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007393
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007394#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007395#define NEED_RETRY
7396#endif
7397
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7402#define DECODING_CHUNK_SIZE (INT_MAX/4)
7403
Victor Stinner3a50e702011-10-18 21:21:00 +02007404#ifndef WC_ERR_INVALID_CHARS
7405# define WC_ERR_INVALID_CHARS 0x0080
7406#endif
7407
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007408static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007409code_page_name(UINT code_page, PyObject **obj)
7410{
7411 *obj = NULL;
7412 if (code_page == CP_ACP)
7413 return "mbcs";
7414 if (code_page == CP_UTF7)
7415 return "CP_UTF7";
7416 if (code_page == CP_UTF8)
7417 return "CP_UTF8";
7418
7419 *obj = PyBytes_FromFormat("cp%u", code_page);
7420 if (*obj == NULL)
7421 return NULL;
7422 return PyBytes_AS_STRING(*obj);
7423}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007424
Victor Stinner3a50e702011-10-18 21:21:00 +02007425static DWORD
7426decode_code_page_flags(UINT code_page)
7427{
7428 if (code_page == CP_UTF7) {
7429 /* The CP_UTF7 decoder only supports flags=0 */
7430 return 0;
7431 }
7432 else
7433 return MB_ERR_INVALID_CHARS;
7434}
7435
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007436/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 * Decode a byte string from a Windows code page into unicode object in strict
7438 * mode.
7439 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007440 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7441 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007442 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007443static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007444decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007445 wchar_t **buf,
7446 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 const char *in,
7448 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007449{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007450 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007451 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007453
7454 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007456 while ((outsize = MultiByteToWideChar(code_page, flags,
7457 in, insize, NULL, 0)) <= 0)
7458 {
7459 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7460 goto error;
7461 }
7462 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7463 flags = 0;
7464 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007465
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007466 /* Extend a wchar_t* buffer */
7467 Py_ssize_t n = *bufsize; /* Get the current length */
7468 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7469 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007470 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007471 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007472
7473 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7475 if (outsize <= 0)
7476 goto error;
7477 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007478
Victor Stinner3a50e702011-10-18 21:21:00 +02007479error:
7480 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7481 return -2;
7482 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007483 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007484}
7485
Victor Stinner3a50e702011-10-18 21:21:00 +02007486/*
7487 * Decode a byte string from a code page into unicode object with an error
7488 * handler.
7489 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007490 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 * UnicodeDecodeError exception and returns -1 on error.
7492 */
7493static int
7494decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007495 wchar_t **buf,
7496 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007497 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007498 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007499{
7500 const char *startin = in;
7501 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007502 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 /* Ideally, we should get reason from FormatMessage. This is the Windows
7504 2000 English version of the message. */
7505 const char *reason = "No mapping for the Unicode character exists "
7506 "in the target code page.";
7507 /* each step cannot decode more than 1 character, but a character can be
7508 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007509 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007510 int insize;
7511 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007512 PyObject *errorHandler = NULL;
7513 PyObject *exc = NULL;
7514 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007515 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 DWORD err;
7517 int ret = -1;
7518
7519 assert(size > 0);
7520
7521 encoding = code_page_name(code_page, &encoding_obj);
7522 if (encoding == NULL)
7523 return -1;
7524
Victor Stinner7d00cc12014-03-17 23:08:06 +01007525 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7527 UnicodeDecodeError. */
7528 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7529 if (exc != NULL) {
7530 PyCodec_StrictErrors(exc);
7531 Py_CLEAR(exc);
7532 }
7533 goto error;
7534 }
7535
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007536 /* Extend a wchar_t* buffer */
7537 Py_ssize_t n = *bufsize; /* Get the current length */
7538 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7539 PyErr_NoMemory();
7540 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007542 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7543 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007545 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007546
7547 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007548 while (in < endin)
7549 {
7550 /* Decode a character */
7551 insize = 1;
7552 do
7553 {
7554 outsize = MultiByteToWideChar(code_page, flags,
7555 in, insize,
7556 buffer, Py_ARRAY_LENGTH(buffer));
7557 if (outsize > 0)
7558 break;
7559 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007560 if (err == ERROR_INVALID_FLAGS && flags) {
7561 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7562 flags = 0;
7563 continue;
7564 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007565 if (err != ERROR_NO_UNICODE_TRANSLATION
7566 && err != ERROR_INSUFFICIENT_BUFFER)
7567 {
7568 PyErr_SetFromWindowsErr(0);
7569 goto error;
7570 }
7571 insize++;
7572 }
7573 /* 4=maximum length of a UTF-8 sequence */
7574 while (insize <= 4 && (in + insize) <= endin);
7575
7576 if (outsize <= 0) {
7577 Py_ssize_t startinpos, endinpos, outpos;
7578
Victor Stinner7d00cc12014-03-17 23:08:06 +01007579 /* last character in partial decode? */
7580 if (in + insize >= endin && !final)
7581 break;
7582
Victor Stinner3a50e702011-10-18 21:21:00 +02007583 startinpos = in - startin;
7584 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007585 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007586 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007587 errors, &errorHandler,
7588 encoding, reason,
7589 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007590 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 {
7592 goto error;
7593 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007594 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007595 }
7596 else {
7597 in += insize;
7598 memcpy(out, buffer, outsize * sizeof(wchar_t));
7599 out += outsize;
7600 }
7601 }
7602
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007603 /* Shrink the buffer */
7604 assert(out - *buf <= *bufsize);
7605 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007606 /* (in - startin) <= size and size is an int */
7607 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007608
7609error:
7610 Py_XDECREF(encoding_obj);
7611 Py_XDECREF(errorHandler);
7612 Py_XDECREF(exc);
7613 return ret;
7614}
7615
Victor Stinner3a50e702011-10-18 21:21:00 +02007616static PyObject *
7617decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007618 const char *s, Py_ssize_t size,
7619 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007620{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007621 wchar_t *buf = NULL;
7622 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007623 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007624
Victor Stinner3a50e702011-10-18 21:21:00 +02007625 if (code_page < 0) {
7626 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7627 return NULL;
7628 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007629 if (size < 0) {
7630 PyErr_BadInternalCall();
7631 return NULL;
7632 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007633
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007634 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007636
Victor Stinner76a31a62011-11-04 00:05:13 +01007637 do
7638 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007639#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007640 if (size > DECODING_CHUNK_SIZE) {
7641 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007642 final = 0;
7643 done = 0;
7644 }
7645 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007646#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007647 {
7648 chunk_size = (int)size;
7649 final = (consumed == NULL);
7650 done = 1;
7651 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007652
Victor Stinner76a31a62011-11-04 00:05:13 +01007653 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007654 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007655 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007656 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007657 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007658
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007659 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007660 s, chunk_size);
7661 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007662 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007663 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007664 errors, final);
7665 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007666
7667 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007668 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007669 return NULL;
7670 }
7671
7672 if (consumed)
7673 *consumed += converted;
7674
7675 s += converted;
7676 size -= converted;
7677 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007678
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007679 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7680 PyMem_Free(buf);
7681 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007682}
7683
Alexander Belopolsky40018472011-02-26 01:02:56 +00007684PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007685PyUnicode_DecodeCodePageStateful(int code_page,
7686 const char *s,
7687 Py_ssize_t size,
7688 const char *errors,
7689 Py_ssize_t *consumed)
7690{
7691 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7692}
7693
7694PyObject *
7695PyUnicode_DecodeMBCSStateful(const char *s,
7696 Py_ssize_t size,
7697 const char *errors,
7698 Py_ssize_t *consumed)
7699{
7700 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7701}
7702
7703PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007704PyUnicode_DecodeMBCS(const char *s,
7705 Py_ssize_t size,
7706 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007707{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007708 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7709}
7710
Victor Stinner3a50e702011-10-18 21:21:00 +02007711static DWORD
7712encode_code_page_flags(UINT code_page, const char *errors)
7713{
7714 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007715 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007716 }
7717 else if (code_page == CP_UTF7) {
7718 /* CP_UTF7 only supports flags=0 */
7719 return 0;
7720 }
7721 else {
7722 if (errors != NULL && strcmp(errors, "replace") == 0)
7723 return 0;
7724 else
7725 return WC_NO_BEST_FIT_CHARS;
7726 }
7727}
7728
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007729/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007730 * Encode a Unicode string to a Windows code page into a byte string in strict
7731 * mode.
7732 *
7733 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007734 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007735 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007736static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007737encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007738 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007739 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007740{
Victor Stinner554f3f02010-06-16 23:33:54 +00007741 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007742 BOOL *pusedDefaultChar = &usedDefaultChar;
7743 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007744 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007745 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007746 const DWORD flags = encode_code_page_flags(code_page, NULL);
7747 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007748 /* Create a substring so that we can get the UTF-16 representation
7749 of just the slice under consideration. */
7750 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007751
Martin v. Löwis3d325192011-11-04 18:23:06 +01007752 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007753
Victor Stinner3a50e702011-10-18 21:21:00 +02007754 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007755 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007756 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007757 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007758
Victor Stinner2fc507f2011-11-04 20:06:39 +01007759 substring = PyUnicode_Substring(unicode, offset, offset+len);
7760 if (substring == NULL)
7761 return -1;
7762 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7763 if (p == NULL) {
7764 Py_DECREF(substring);
7765 return -1;
7766 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007767 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007768
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007769 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007770 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007771 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007772 NULL, 0,
7773 NULL, pusedDefaultChar);
7774 if (outsize <= 0)
7775 goto error;
7776 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007777 if (pusedDefaultChar && *pusedDefaultChar) {
7778 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007779 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007780 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007781
Victor Stinner3a50e702011-10-18 21:21:00 +02007782 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007784 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007785 if (*outbytes == NULL) {
7786 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007788 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007789 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007790 }
7791 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007793 const Py_ssize_t n = PyBytes_Size(*outbytes);
7794 if (outsize > PY_SSIZE_T_MAX - n) {
7795 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007796 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007798 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007799 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7800 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007801 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007802 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007803 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007804 }
7805
7806 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007807 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007808 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007809 out, outsize,
7810 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007811 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007812 if (outsize <= 0)
7813 goto error;
7814 if (pusedDefaultChar && *pusedDefaultChar)
7815 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007816 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007817
Victor Stinner3a50e702011-10-18 21:21:00 +02007818error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007819 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007820 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7821 return -2;
7822 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007823 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007824}
7825
Victor Stinner3a50e702011-10-18 21:21:00 +02007826/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007827 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007828 * error handler.
7829 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007830 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007831 * -1 on other error.
7832 */
7833static int
7834encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007835 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007836 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007837{
Victor Stinner3a50e702011-10-18 21:21:00 +02007838 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007839 Py_ssize_t pos = unicode_offset;
7840 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007841 /* Ideally, we should get reason from FormatMessage. This is the Windows
7842 2000 English version of the message. */
7843 const char *reason = "invalid character";
7844 /* 4=maximum length of a UTF-8 sequence */
7845 char buffer[4];
7846 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7847 Py_ssize_t outsize;
7848 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007849 PyObject *errorHandler = NULL;
7850 PyObject *exc = NULL;
7851 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007852 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007853 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007854 PyObject *rep;
7855 int ret = -1;
7856
7857 assert(insize > 0);
7858
7859 encoding = code_page_name(code_page, &encoding_obj);
7860 if (encoding == NULL)
7861 return -1;
7862
7863 if (errors == NULL || strcmp(errors, "strict") == 0) {
7864 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7865 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007866 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007867 if (exc != NULL) {
7868 PyCodec_StrictErrors(exc);
7869 Py_DECREF(exc);
7870 }
7871 Py_XDECREF(encoding_obj);
7872 return -1;
7873 }
7874
7875 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7876 pusedDefaultChar = &usedDefaultChar;
7877 else
7878 pusedDefaultChar = NULL;
7879
7880 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7881 PyErr_NoMemory();
7882 goto error;
7883 }
7884 outsize = insize * Py_ARRAY_LENGTH(buffer);
7885
7886 if (*outbytes == NULL) {
7887 /* Create string object */
7888 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7889 if (*outbytes == NULL)
7890 goto error;
7891 out = PyBytes_AS_STRING(*outbytes);
7892 }
7893 else {
7894 /* Extend string object */
7895 Py_ssize_t n = PyBytes_Size(*outbytes);
7896 if (n > PY_SSIZE_T_MAX - outsize) {
7897 PyErr_NoMemory();
7898 goto error;
7899 }
7900 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7901 goto error;
7902 out = PyBytes_AS_STRING(*outbytes) + n;
7903 }
7904
7905 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007906 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007907 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007908 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7909 wchar_t chars[2];
7910 int charsize;
7911 if (ch < 0x10000) {
7912 chars[0] = (wchar_t)ch;
7913 charsize = 1;
7914 }
7915 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007916 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7917 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007918 charsize = 2;
7919 }
7920
Victor Stinner3a50e702011-10-18 21:21:00 +02007921 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007922 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007923 buffer, Py_ARRAY_LENGTH(buffer),
7924 NULL, pusedDefaultChar);
7925 if (outsize > 0) {
7926 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7927 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007928 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007929 memcpy(out, buffer, outsize);
7930 out += outsize;
7931 continue;
7932 }
7933 }
7934 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7935 PyErr_SetFromWindowsErr(0);
7936 goto error;
7937 }
7938
Victor Stinner3a50e702011-10-18 21:21:00 +02007939 rep = unicode_encode_call_errorhandler(
7940 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007941 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007942 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007943 if (rep == NULL)
7944 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007945 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007946
7947 if (PyBytes_Check(rep)) {
7948 outsize = PyBytes_GET_SIZE(rep);
7949 if (outsize != 1) {
7950 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7951 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7952 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7953 Py_DECREF(rep);
7954 goto error;
7955 }
7956 out = PyBytes_AS_STRING(*outbytes) + offset;
7957 }
7958 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7959 out += outsize;
7960 }
7961 else {
7962 Py_ssize_t i;
7963 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007964 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007965
Benjamin Petersonbac79492012-01-14 13:34:47 -05007966 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007967 Py_DECREF(rep);
7968 goto error;
7969 }
7970
7971 outsize = PyUnicode_GET_LENGTH(rep);
7972 if (outsize != 1) {
7973 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7974 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7975 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7976 Py_DECREF(rep);
7977 goto error;
7978 }
7979 out = PyBytes_AS_STRING(*outbytes) + offset;
7980 }
7981 kind = PyUnicode_KIND(rep);
7982 data = PyUnicode_DATA(rep);
7983 for (i=0; i < outsize; i++) {
7984 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7985 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007986 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007987 encoding, unicode,
7988 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007989 "unable to encode error handler result to ASCII");
7990 Py_DECREF(rep);
7991 goto error;
7992 }
7993 *out = (unsigned char)ch;
7994 out++;
7995 }
7996 }
7997 Py_DECREF(rep);
7998 }
7999 /* write a NUL byte */
8000 *out = 0;
8001 outsize = out - PyBytes_AS_STRING(*outbytes);
8002 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8003 if (_PyBytes_Resize(outbytes, outsize) < 0)
8004 goto error;
8005 ret = 0;
8006
8007error:
8008 Py_XDECREF(encoding_obj);
8009 Py_XDECREF(errorHandler);
8010 Py_XDECREF(exc);
8011 return ret;
8012}
8013
Victor Stinner3a50e702011-10-18 21:21:00 +02008014static PyObject *
8015encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008016 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008017 const char *errors)
8018{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008019 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008020 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008021 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008022 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008023
Victor Stinner29dacf22015-01-26 16:41:32 +01008024 if (!PyUnicode_Check(unicode)) {
8025 PyErr_BadArgument();
8026 return NULL;
8027 }
8028
Benjamin Petersonbac79492012-01-14 13:34:47 -05008029 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008030 return NULL;
8031 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008032
Victor Stinner3a50e702011-10-18 21:21:00 +02008033 if (code_page < 0) {
8034 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8035 return NULL;
8036 }
8037
Martin v. Löwis3d325192011-11-04 18:23:06 +01008038 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008039 return PyBytes_FromStringAndSize(NULL, 0);
8040
Victor Stinner7581cef2011-11-03 22:32:33 +01008041 offset = 0;
8042 do
8043 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008044#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008045 if (len > DECODING_CHUNK_SIZE) {
8046 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008047 done = 0;
8048 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008049 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008050#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008051 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008052 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008053 done = 1;
8054 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008055
Victor Stinner76a31a62011-11-04 00:05:13 +01008056 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008057 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008058 errors);
8059 if (ret == -2)
8060 ret = encode_code_page_errors(code_page, &outbytes,
8061 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008062 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008063 if (ret < 0) {
8064 Py_XDECREF(outbytes);
8065 return NULL;
8066 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008067
Victor Stinner7581cef2011-11-03 22:32:33 +01008068 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008069 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008070 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008071
Victor Stinner3a50e702011-10-18 21:21:00 +02008072 return outbytes;
8073}
8074
8075PyObject *
8076PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8077 Py_ssize_t size,
8078 const char *errors)
8079{
Victor Stinner7581cef2011-11-03 22:32:33 +01008080 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008081 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008082 if (unicode == NULL)
8083 return NULL;
8084 res = encode_code_page(CP_ACP, unicode, errors);
8085 Py_DECREF(unicode);
8086 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008087}
8088
8089PyObject *
8090PyUnicode_EncodeCodePage(int code_page,
8091 PyObject *unicode,
8092 const char *errors)
8093{
Victor Stinner7581cef2011-11-03 22:32:33 +01008094 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008095}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008096
Alexander Belopolsky40018472011-02-26 01:02:56 +00008097PyObject *
8098PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008099{
Victor Stinner7581cef2011-11-03 22:32:33 +01008100 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008101}
8102
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008103#undef NEED_RETRY
8104
Steve Dowercc16be82016-09-08 10:35:16 -07008105#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008106
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107/* --- Character Mapping Codec -------------------------------------------- */
8108
Victor Stinnerfb161b12013-04-18 01:44:27 +02008109static int
8110charmap_decode_string(const char *s,
8111 Py_ssize_t size,
8112 PyObject *mapping,
8113 const char *errors,
8114 _PyUnicodeWriter *writer)
8115{
8116 const char *starts = s;
8117 const char *e;
8118 Py_ssize_t startinpos, endinpos;
8119 PyObject *errorHandler = NULL, *exc = NULL;
8120 Py_ssize_t maplen;
8121 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008122 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008123 Py_UCS4 x;
8124 unsigned char ch;
8125
8126 if (PyUnicode_READY(mapping) == -1)
8127 return -1;
8128
8129 maplen = PyUnicode_GET_LENGTH(mapping);
8130 mapdata = PyUnicode_DATA(mapping);
8131 mapkind = PyUnicode_KIND(mapping);
8132
8133 e = s + size;
8134
8135 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8136 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8137 * is disabled in encoding aliases, latin1 is preferred because
8138 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008139 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008140 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8141 Py_UCS4 maxchar = writer->maxchar;
8142
8143 assert (writer->kind == PyUnicode_1BYTE_KIND);
8144 while (s < e) {
8145 ch = *s;
8146 x = mapdata_ucs1[ch];
8147 if (x > maxchar) {
8148 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8149 goto onError;
8150 maxchar = writer->maxchar;
8151 outdata = (Py_UCS1 *)writer->data;
8152 }
8153 outdata[writer->pos] = x;
8154 writer->pos++;
8155 ++s;
8156 }
8157 return 0;
8158 }
8159
8160 while (s < e) {
8161 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8162 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008163 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008164 if (outkind == PyUnicode_1BYTE_KIND) {
8165 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8166 Py_UCS4 maxchar = writer->maxchar;
8167 while (s < e) {
8168 ch = *s;
8169 x = mapdata_ucs2[ch];
8170 if (x > maxchar)
8171 goto Error;
8172 outdata[writer->pos] = x;
8173 writer->pos++;
8174 ++s;
8175 }
8176 break;
8177 }
8178 else if (outkind == PyUnicode_2BYTE_KIND) {
8179 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8180 while (s < e) {
8181 ch = *s;
8182 x = mapdata_ucs2[ch];
8183 if (x == 0xFFFE)
8184 goto Error;
8185 outdata[writer->pos] = x;
8186 writer->pos++;
8187 ++s;
8188 }
8189 break;
8190 }
8191 }
8192 ch = *s;
8193
8194 if (ch < maplen)
8195 x = PyUnicode_READ(mapkind, mapdata, ch);
8196 else
8197 x = 0xfffe; /* invalid value */
8198Error:
8199 if (x == 0xfffe)
8200 {
8201 /* undefined mapping */
8202 startinpos = s-starts;
8203 endinpos = startinpos+1;
8204 if (unicode_decode_call_errorhandler_writer(
8205 errors, &errorHandler,
8206 "charmap", "character maps to <undefined>",
8207 &starts, &e, &startinpos, &endinpos, &exc, &s,
8208 writer)) {
8209 goto onError;
8210 }
8211 continue;
8212 }
8213
8214 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8215 goto onError;
8216 ++s;
8217 }
8218 Py_XDECREF(errorHandler);
8219 Py_XDECREF(exc);
8220 return 0;
8221
8222onError:
8223 Py_XDECREF(errorHandler);
8224 Py_XDECREF(exc);
8225 return -1;
8226}
8227
8228static int
8229charmap_decode_mapping(const char *s,
8230 Py_ssize_t size,
8231 PyObject *mapping,
8232 const char *errors,
8233 _PyUnicodeWriter *writer)
8234{
8235 const char *starts = s;
8236 const char *e;
8237 Py_ssize_t startinpos, endinpos;
8238 PyObject *errorHandler = NULL, *exc = NULL;
8239 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008240 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008241
8242 e = s + size;
8243
8244 while (s < e) {
8245 ch = *s;
8246
8247 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8248 key = PyLong_FromLong((long)ch);
8249 if (key == NULL)
8250 goto onError;
8251
8252 item = PyObject_GetItem(mapping, key);
8253 Py_DECREF(key);
8254 if (item == NULL) {
8255 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8256 /* No mapping found means: mapping is undefined. */
8257 PyErr_Clear();
8258 goto Undefined;
8259 } else
8260 goto onError;
8261 }
8262
8263 /* Apply mapping */
8264 if (item == Py_None)
8265 goto Undefined;
8266 if (PyLong_Check(item)) {
8267 long value = PyLong_AS_LONG(item);
8268 if (value == 0xFFFE)
8269 goto Undefined;
8270 if (value < 0 || value > MAX_UNICODE) {
8271 PyErr_Format(PyExc_TypeError,
8272 "character mapping must be in range(0x%lx)",
8273 (unsigned long)MAX_UNICODE + 1);
8274 goto onError;
8275 }
8276
8277 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8278 goto onError;
8279 }
8280 else if (PyUnicode_Check(item)) {
8281 if (PyUnicode_READY(item) == -1)
8282 goto onError;
8283 if (PyUnicode_GET_LENGTH(item) == 1) {
8284 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8285 if (value == 0xFFFE)
8286 goto Undefined;
8287 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8288 goto onError;
8289 }
8290 else {
8291 writer->overallocate = 1;
8292 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8293 goto onError;
8294 }
8295 }
8296 else {
8297 /* wrong return value */
8298 PyErr_SetString(PyExc_TypeError,
8299 "character mapping must return integer, None or str");
8300 goto onError;
8301 }
8302 Py_CLEAR(item);
8303 ++s;
8304 continue;
8305
8306Undefined:
8307 /* undefined mapping */
8308 Py_CLEAR(item);
8309 startinpos = s-starts;
8310 endinpos = startinpos+1;
8311 if (unicode_decode_call_errorhandler_writer(
8312 errors, &errorHandler,
8313 "charmap", "character maps to <undefined>",
8314 &starts, &e, &startinpos, &endinpos, &exc, &s,
8315 writer)) {
8316 goto onError;
8317 }
8318 }
8319 Py_XDECREF(errorHandler);
8320 Py_XDECREF(exc);
8321 return 0;
8322
8323onError:
8324 Py_XDECREF(item);
8325 Py_XDECREF(errorHandler);
8326 Py_XDECREF(exc);
8327 return -1;
8328}
8329
Alexander Belopolsky40018472011-02-26 01:02:56 +00008330PyObject *
8331PyUnicode_DecodeCharmap(const char *s,
8332 Py_ssize_t size,
8333 PyObject *mapping,
8334 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008336 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008337
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 /* Default to Latin-1 */
8339 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008343 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008344 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008345 writer.min_length = size;
8346 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008348
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008349 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008350 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8351 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008352 }
8353 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008354 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8355 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008357 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008358
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008360 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361 return NULL;
8362}
8363
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008364/* Charmap encoding: the lookup table */
8365
Alexander Belopolsky40018472011-02-26 01:02:56 +00008366struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 PyObject_HEAD
8368 unsigned char level1[32];
8369 int count2, count3;
8370 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008371};
8372
8373static PyObject*
8374encoding_map_size(PyObject *obj, PyObject* args)
8375{
8376 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008377 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008379}
8380
8381static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008382 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 PyDoc_STR("Return the size (in bytes) of this object") },
8384 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008385};
8386
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008387static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008388 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 "EncodingMap", /*tp_name*/
8390 sizeof(struct encoding_map), /*tp_basicsize*/
8391 0, /*tp_itemsize*/
8392 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008393 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008394 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 0, /*tp_getattr*/
8396 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008397 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 0, /*tp_repr*/
8399 0, /*tp_as_number*/
8400 0, /*tp_as_sequence*/
8401 0, /*tp_as_mapping*/
8402 0, /*tp_hash*/
8403 0, /*tp_call*/
8404 0, /*tp_str*/
8405 0, /*tp_getattro*/
8406 0, /*tp_setattro*/
8407 0, /*tp_as_buffer*/
8408 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8409 0, /*tp_doc*/
8410 0, /*tp_traverse*/
8411 0, /*tp_clear*/
8412 0, /*tp_richcompare*/
8413 0, /*tp_weaklistoffset*/
8414 0, /*tp_iter*/
8415 0, /*tp_iternext*/
8416 encoding_map_methods, /*tp_methods*/
8417 0, /*tp_members*/
8418 0, /*tp_getset*/
8419 0, /*tp_base*/
8420 0, /*tp_dict*/
8421 0, /*tp_descr_get*/
8422 0, /*tp_descr_set*/
8423 0, /*tp_dictoffset*/
8424 0, /*tp_init*/
8425 0, /*tp_alloc*/
8426 0, /*tp_new*/
8427 0, /*tp_free*/
8428 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008429};
8430
8431PyObject*
8432PyUnicode_BuildEncodingMap(PyObject* string)
8433{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008434 PyObject *result;
8435 struct encoding_map *mresult;
8436 int i;
8437 int need_dict = 0;
8438 unsigned char level1[32];
8439 unsigned char level2[512];
8440 unsigned char *mlevel1, *mlevel2, *mlevel3;
8441 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008442 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008443 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008444 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008445 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008446
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008447 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008448 PyErr_BadArgument();
8449 return NULL;
8450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451 kind = PyUnicode_KIND(string);
8452 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008453 length = PyUnicode_GET_LENGTH(string);
8454 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008455 memset(level1, 0xFF, sizeof level1);
8456 memset(level2, 0xFF, sizeof level2);
8457
8458 /* If there isn't a one-to-one mapping of NULL to \0,
8459 or if there are non-BMP characters, we need to use
8460 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008462 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008463 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008464 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465 ch = PyUnicode_READ(kind, data, i);
8466 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008467 need_dict = 1;
8468 break;
8469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008471 /* unmapped character */
8472 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 l1 = ch >> 11;
8474 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008475 if (level1[l1] == 0xFF)
8476 level1[l1] = count2++;
8477 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008478 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008479 }
8480
8481 if (count2 >= 0xFF || count3 >= 0xFF)
8482 need_dict = 1;
8483
8484 if (need_dict) {
8485 PyObject *result = PyDict_New();
8486 PyObject *key, *value;
8487 if (!result)
8488 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008489 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008491 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008492 if (!key || !value)
8493 goto failed1;
8494 if (PyDict_SetItem(result, key, value) == -1)
8495 goto failed1;
8496 Py_DECREF(key);
8497 Py_DECREF(value);
8498 }
8499 return result;
8500 failed1:
8501 Py_XDECREF(key);
8502 Py_XDECREF(value);
8503 Py_DECREF(result);
8504 return NULL;
8505 }
8506
8507 /* Create a three-level trie */
8508 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8509 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008510 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008511 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008512 }
8513
8514 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008515 mresult = (struct encoding_map*)result;
8516 mresult->count2 = count2;
8517 mresult->count3 = count3;
8518 mlevel1 = mresult->level1;
8519 mlevel2 = mresult->level23;
8520 mlevel3 = mresult->level23 + 16*count2;
8521 memcpy(mlevel1, level1, 32);
8522 memset(mlevel2, 0xFF, 16*count2);
8523 memset(mlevel3, 0, 128*count3);
8524 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008525 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008526 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008527 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8528 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008529 /* unmapped character */
8530 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008531 o1 = ch>>11;
8532 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008533 i2 = 16*mlevel1[o1] + o2;
8534 if (mlevel2[i2] == 0xFF)
8535 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008536 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008537 i3 = 128*mlevel2[i2] + o3;
8538 mlevel3[i3] = i;
8539 }
8540 return result;
8541}
8542
8543static int
Victor Stinner22168992011-11-20 17:09:18 +01008544encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008545{
8546 struct encoding_map *map = (struct encoding_map*)mapping;
8547 int l1 = c>>11;
8548 int l2 = (c>>7) & 0xF;
8549 int l3 = c & 0x7F;
8550 int i;
8551
Victor Stinner22168992011-11-20 17:09:18 +01008552 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008554 if (c == 0)
8555 return 0;
8556 /* level 1*/
8557 i = map->level1[l1];
8558 if (i == 0xFF) {
8559 return -1;
8560 }
8561 /* level 2*/
8562 i = map->level23[16*i+l2];
8563 if (i == 0xFF) {
8564 return -1;
8565 }
8566 /* level 3 */
8567 i = map->level23[16*map->count2 + 128*i + l3];
8568 if (i == 0) {
8569 return -1;
8570 }
8571 return i;
8572}
8573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574/* Lookup the character ch in the mapping. If the character
8575 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008576 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008577static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008578charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579{
Christian Heimes217cfd12007-12-02 14:31:20 +00008580 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008581 PyObject *x;
8582
8583 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008585 x = PyObject_GetItem(mapping, w);
8586 Py_DECREF(w);
8587 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8589 /* No mapping found means: mapping is undefined. */
8590 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008591 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 } else
8593 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008595 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008597 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 long value = PyLong_AS_LONG(x);
8599 if (value < 0 || value > 255) {
8600 PyErr_SetString(PyExc_TypeError,
8601 "character mapping must be in range(256)");
8602 Py_DECREF(x);
8603 return NULL;
8604 }
8605 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008607 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 /* wrong return value */
8611 PyErr_Format(PyExc_TypeError,
8612 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008613 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 Py_DECREF(x);
8615 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 }
8617}
8618
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008619static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008620charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008621{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008622 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8623 /* exponentially overallocate to minimize reallocations */
8624 if (requiredsize < 2*outsize)
8625 requiredsize = 2*outsize;
8626 if (_PyBytes_Resize(outobj, requiredsize))
8627 return -1;
8628 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008629}
8630
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008635 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008636 space is available. Return a new reference to the object that
8637 was put in the output buffer, or Py_None, if the mapping was undefined
8638 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008639 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008640static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008641charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008642 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008644 PyObject *rep;
8645 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008646 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647
Andy Lesterdffe4c02020-03-04 07:15:20 -06008648 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008649 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008651 if (res == -1)
8652 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 if (outsize<requiredsize)
8654 if (charmapencode_resize(outobj, outpos, requiredsize))
8655 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008656 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 outstart[(*outpos)++] = (char)res;
8658 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008659 }
8660
8661 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008664 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 Py_DECREF(rep);
8666 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008667 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 if (PyLong_Check(rep)) {
8669 Py_ssize_t requiredsize = *outpos+1;
8670 if (outsize<requiredsize)
8671 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8672 Py_DECREF(rep);
8673 return enc_EXCEPTION;
8674 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008675 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008677 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 else {
8679 const char *repchars = PyBytes_AS_STRING(rep);
8680 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8681 Py_ssize_t requiredsize = *outpos+repsize;
8682 if (outsize<requiredsize)
8683 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8684 Py_DECREF(rep);
8685 return enc_EXCEPTION;
8686 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008687 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 memcpy(outstart + *outpos, repchars, repsize);
8689 *outpos += repsize;
8690 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008692 Py_DECREF(rep);
8693 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694}
8695
8696/* handle an error in PyUnicode_EncodeCharmap
8697 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008698static int
8699charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008700 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008702 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008703 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008704{
8705 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008706 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008707 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008708 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008709 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008710 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008712 Py_ssize_t collstartpos = *inpos;
8713 Py_ssize_t collendpos = *inpos+1;
8714 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008715 const char *encoding = "charmap";
8716 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008717 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008718 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008719 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008720
Benjamin Petersonbac79492012-01-14 13:34:47 -05008721 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008722 return -1;
8723 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724 /* find all unencodable characters */
8725 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008726 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008727 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008728 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008729 val = encoding_map_lookup(ch, mapping);
8730 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 break;
8732 ++collendpos;
8733 continue;
8734 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008735
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008736 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8737 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 if (rep==NULL)
8739 return -1;
8740 else if (rep!=Py_None) {
8741 Py_DECREF(rep);
8742 break;
8743 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008744 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746 }
8747 /* cache callback name lookup
8748 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008749 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008750 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008751
8752 switch (*error_handler) {
8753 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008754 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008755 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008756
8757 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008758 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 x = charmapencode_output('?', mapping, res, respos);
8760 if (x==enc_EXCEPTION) {
8761 return -1;
8762 }
8763 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008764 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 return -1;
8766 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008767 }
8768 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008769 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008770 *inpos = collendpos;
8771 break;
Victor Stinner50149202015-09-22 00:26:54 +02008772
8773 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008774 /* generate replacement (temporarily (mis)uses p) */
8775 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 char buffer[2+29+1+1];
8777 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008778 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 for (cp = buffer; *cp; ++cp) {
8780 x = charmapencode_output(*cp, mapping, res, respos);
8781 if (x==enc_EXCEPTION)
8782 return -1;
8783 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008784 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 return -1;
8786 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008787 }
8788 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008789 *inpos = collendpos;
8790 break;
Victor Stinner50149202015-09-22 00:26:54 +02008791
Benjamin Peterson14339b62009-01-31 16:36:08 +00008792 default:
Victor Stinner50149202015-09-22 00:26:54 +02008793 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008794 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008796 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008798 if (PyBytes_Check(repunicode)) {
8799 /* Directly copy bytes result to output. */
8800 Py_ssize_t outsize = PyBytes_Size(*res);
8801 Py_ssize_t requiredsize;
8802 repsize = PyBytes_Size(repunicode);
8803 requiredsize = *respos + repsize;
8804 if (requiredsize > outsize)
8805 /* Make room for all additional bytes. */
8806 if (charmapencode_resize(res, respos, requiredsize)) {
8807 Py_DECREF(repunicode);
8808 return -1;
8809 }
8810 memcpy(PyBytes_AsString(*res) + *respos,
8811 PyBytes_AsString(repunicode), repsize);
8812 *respos += repsize;
8813 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008814 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008815 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008816 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008817 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008818 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008819 Py_DECREF(repunicode);
8820 return -1;
8821 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008822 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008823 data = PyUnicode_DATA(repunicode);
8824 kind = PyUnicode_KIND(repunicode);
8825 for (index = 0; index < repsize; index++) {
8826 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8827 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008828 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008829 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 return -1;
8831 }
8832 else if (x==enc_FAILED) {
8833 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008834 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008835 return -1;
8836 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008837 }
8838 *inpos = newpos;
8839 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008840 }
8841 return 0;
8842}
8843
Alexander Belopolsky40018472011-02-26 01:02:56 +00008844PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008845_PyUnicode_EncodeCharmap(PyObject *unicode,
8846 PyObject *mapping,
8847 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008849 /* output object */
8850 PyObject *res = NULL;
8851 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008852 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008853 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008854 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008855 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008856 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008857 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008858 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008859 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008860 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861
Benjamin Petersonbac79492012-01-14 13:34:47 -05008862 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008863 return NULL;
8864 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008865 data = PyUnicode_DATA(unicode);
8866 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008867
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868 /* Default to Latin-1 */
8869 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008870 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008872 /* allocate enough for a simple encoding without
8873 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008874 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008875 if (res == NULL)
8876 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008877 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008880 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008881 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008883 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 if (x==enc_EXCEPTION) /* error */
8885 goto onError;
8886 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008887 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008889 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 &res, &respos)) {
8891 goto onError;
8892 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008893 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 else
8895 /* done with this character => adjust input position */
8896 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008899 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008900 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008901 if (_PyBytes_Resize(&res, respos) < 0)
8902 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008903
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008904 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008905 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008906 return res;
8907
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008909 Py_XDECREF(res);
8910 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008911 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912 return NULL;
8913}
8914
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008915/* Deprecated */
8916PyObject *
8917PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8918 Py_ssize_t size,
8919 PyObject *mapping,
8920 const char *errors)
8921{
8922 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008923 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008924 if (unicode == NULL)
8925 return NULL;
8926 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8927 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008928 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008929}
8930
Alexander Belopolsky40018472011-02-26 01:02:56 +00008931PyObject *
8932PyUnicode_AsCharmapString(PyObject *unicode,
8933 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934{
8935 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008936 PyErr_BadArgument();
8937 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008939 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940}
8941
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008942/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008943static void
8944make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008946 Py_ssize_t startpos, Py_ssize_t endpos,
8947 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008949 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 *exceptionObject = _PyUnicodeTranslateError_Create(
8951 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952 }
8953 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8955 goto onError;
8956 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8957 goto onError;
8958 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8959 goto onError;
8960 return;
8961 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008962 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963 }
8964}
8965
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008966/* error handling callback helper:
8967 build arguments, call the callback and check the arguments,
8968 put the result into newpos and return the replacement string, which
8969 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008970static PyObject *
8971unicode_translate_call_errorhandler(const char *errors,
8972 PyObject **errorHandler,
8973 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008975 Py_ssize_t startpos, Py_ssize_t endpos,
8976 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008977{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008978 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008979
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008980 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008981 PyObject *restuple;
8982 PyObject *resunicode;
8983
8984 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008985 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008986 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008988 }
8989
8990 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008992 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008993 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008994
Petr Viktorinffd97532020-02-11 17:46:57 +01008995 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008996 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008998 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008999 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 Py_DECREF(restuple);
9001 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009002 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009003 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 &resunicode, &i_newpos)) {
9005 Py_DECREF(restuple);
9006 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009007 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009008 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009010 else
9011 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009012 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009013 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 Py_DECREF(restuple);
9015 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009016 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009017 Py_INCREF(resunicode);
9018 Py_DECREF(restuple);
9019 return resunicode;
9020}
9021
9022/* Lookup the character ch in the mapping and put the result in result,
9023 which must be decrefed by the caller.
9024 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009025static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009027{
Christian Heimes217cfd12007-12-02 14:31:20 +00009028 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009029 PyObject *x;
9030
9031 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009033 x = PyObject_GetItem(mapping, w);
9034 Py_DECREF(w);
9035 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9037 /* No mapping found means: use 1:1 mapping. */
9038 PyErr_Clear();
9039 *result = NULL;
9040 return 0;
9041 } else
9042 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009043 }
9044 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009045 *result = x;
9046 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009047 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009048 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009049 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009050 if (value < 0 || value > MAX_UNICODE) {
9051 PyErr_Format(PyExc_ValueError,
9052 "character mapping must be in range(0x%x)",
9053 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009054 Py_DECREF(x);
9055 return -1;
9056 }
9057 *result = x;
9058 return 0;
9059 }
9060 else if (PyUnicode_Check(x)) {
9061 *result = x;
9062 return 0;
9063 }
9064 else {
9065 /* wrong return value */
9066 PyErr_SetString(PyExc_TypeError,
9067 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009068 Py_DECREF(x);
9069 return -1;
9070 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009071}
Victor Stinner1194ea02014-04-04 19:37:40 +02009072
9073/* lookup the character, write the result into the writer.
9074 Return 1 if the result was written into the writer, return 0 if the mapping
9075 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009076static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009077charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9078 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009079{
Victor Stinner1194ea02014-04-04 19:37:40 +02009080 PyObject *item;
9081
9082 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009083 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009084
9085 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009087 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009090 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009091 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009092
9093 if (item == Py_None) {
9094 Py_DECREF(item);
9095 return 0;
9096 }
9097
9098 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009099 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9100 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9101 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009102 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9103 Py_DECREF(item);
9104 return -1;
9105 }
9106 Py_DECREF(item);
9107 return 1;
9108 }
9109
9110 if (!PyUnicode_Check(item)) {
9111 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009112 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009113 }
9114
9115 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9116 Py_DECREF(item);
9117 return -1;
9118 }
9119
9120 Py_DECREF(item);
9121 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009122}
9123
Victor Stinner89a76ab2014-04-05 11:44:04 +02009124static int
9125unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9126 Py_UCS1 *translate)
9127{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009128 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009129 int ret = 0;
9130
Victor Stinner89a76ab2014-04-05 11:44:04 +02009131 if (charmaptranslate_lookup(ch, mapping, &item)) {
9132 return -1;
9133 }
9134
9135 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009136 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009137 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009138 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009139 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009140 /* not found => default to 1:1 mapping */
9141 translate[ch] = ch;
9142 return 1;
9143 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009144 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009145 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009146 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9147 used it */
9148 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009149 /* invalid character or character outside ASCII:
9150 skip the fast translate */
9151 goto exit;
9152 }
9153 translate[ch] = (Py_UCS1)replace;
9154 }
9155 else if (PyUnicode_Check(item)) {
9156 Py_UCS4 replace;
9157
9158 if (PyUnicode_READY(item) == -1) {
9159 Py_DECREF(item);
9160 return -1;
9161 }
9162 if (PyUnicode_GET_LENGTH(item) != 1)
9163 goto exit;
9164
9165 replace = PyUnicode_READ_CHAR(item, 0);
9166 if (replace > 127)
9167 goto exit;
9168 translate[ch] = (Py_UCS1)replace;
9169 }
9170 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009171 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009172 goto exit;
9173 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009174 ret = 1;
9175
Benjamin Peterson1365de72014-04-07 20:15:41 -04009176 exit:
9177 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009178 return ret;
9179}
9180
9181/* Fast path for ascii => ascii translation. Return 1 if the whole string
9182 was translated into writer, return 0 if the input string was partially
9183 translated into writer, raise an exception and return -1 on error. */
9184static int
9185unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009186 _PyUnicodeWriter *writer, int ignore,
9187 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009188{
Victor Stinner872b2912014-04-05 14:27:07 +02009189 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009190 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009191 const Py_UCS1 *in, *end;
9192 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009193 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009194
Victor Stinner89a76ab2014-04-05 11:44:04 +02009195 len = PyUnicode_GET_LENGTH(input);
9196
Victor Stinner872b2912014-04-05 14:27:07 +02009197 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009198
9199 in = PyUnicode_1BYTE_DATA(input);
9200 end = in + len;
9201
9202 assert(PyUnicode_IS_ASCII(writer->buffer));
9203 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9204 out = PyUnicode_1BYTE_DATA(writer->buffer);
9205
Victor Stinner872b2912014-04-05 14:27:07 +02009206 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009207 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009208 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009209 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009210 int translate = unicode_fast_translate_lookup(mapping, ch,
9211 ascii_table);
9212 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009213 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009214 if (translate == 0)
9215 goto exit;
9216 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009217 }
Victor Stinner872b2912014-04-05 14:27:07 +02009218 if (ch2 == 0xfe) {
9219 if (ignore)
9220 continue;
9221 goto exit;
9222 }
9223 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009224 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009225 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009226 }
Victor Stinner872b2912014-04-05 14:27:07 +02009227 res = 1;
9228
9229exit:
9230 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009231 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009232 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009233}
9234
Victor Stinner3222da22015-10-01 22:07:32 +02009235static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236_PyUnicode_TranslateCharmap(PyObject *input,
9237 PyObject *mapping,
9238 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009241 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 Py_ssize_t size, i;
9243 int kind;
9244 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009245 _PyUnicodeWriter writer;
9246 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009247 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009248 PyObject *errorHandler = NULL;
9249 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009250 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009251 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009252
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009254 PyErr_BadArgument();
9255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 if (PyUnicode_READY(input) == -1)
9259 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009260 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 kind = PyUnicode_KIND(input);
9262 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009264 if (size == 0)
9265 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009267 /* allocate enough for a simple 1:1 translation without
9268 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009269 _PyUnicodeWriter_Init(&writer);
9270 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009271 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009272
Victor Stinner872b2912014-04-05 14:27:07 +02009273 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9274
Victor Stinner33798672016-03-01 21:59:58 +01009275 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009276 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009277 if (PyUnicode_IS_ASCII(input)) {
9278 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9279 if (res < 0) {
9280 _PyUnicodeWriter_Dealloc(&writer);
9281 return NULL;
9282 }
9283 if (res == 1)
9284 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009285 }
Victor Stinner33798672016-03-01 21:59:58 +01009286 else {
9287 i = 0;
9288 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009291 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009292 int translate;
9293 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9294 Py_ssize_t newpos;
9295 /* startpos for collecting untranslatable chars */
9296 Py_ssize_t collstart;
9297 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009298 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299
Victor Stinner1194ea02014-04-04 19:37:40 +02009300 ch = PyUnicode_READ(kind, data, i);
9301 translate = charmaptranslate_output(ch, mapping, &writer);
9302 if (translate < 0)
9303 goto onError;
9304
9305 if (translate != 0) {
9306 /* it worked => adjust input pointer */
9307 ++i;
9308 continue;
9309 }
9310
9311 /* untranslatable character */
9312 collstart = i;
9313 collend = i+1;
9314
9315 /* find all untranslatable characters */
9316 while (collend < size) {
9317 PyObject *x;
9318 ch = PyUnicode_READ(kind, data, collend);
9319 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009320 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009321 Py_XDECREF(x);
9322 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009323 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009324 ++collend;
9325 }
9326
9327 if (ignore) {
9328 i = collend;
9329 }
9330 else {
9331 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9332 reason, input, &exc,
9333 collstart, collend, &newpos);
9334 if (repunicode == NULL)
9335 goto onError;
9336 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009337 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009338 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009339 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009340 Py_DECREF(repunicode);
9341 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009342 }
9343 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009344 Py_XDECREF(exc);
9345 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009346 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347
Benjamin Peterson29060642009-01-31 22:14:21 +00009348 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009349 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009350 Py_XDECREF(exc);
9351 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352 return NULL;
9353}
9354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355/* Deprecated. Use PyUnicode_Translate instead. */
9356PyObject *
9357PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9358 Py_ssize_t size,
9359 PyObject *mapping,
9360 const char *errors)
9361{
Christian Heimes5f520f42012-09-11 14:03:25 +02009362 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009363 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 if (!unicode)
9365 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009366 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9367 Py_DECREF(unicode);
9368 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369}
9370
Alexander Belopolsky40018472011-02-26 01:02:56 +00009371PyObject *
9372PyUnicode_Translate(PyObject *str,
9373 PyObject *mapping,
9374 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009376 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009377 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009378 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009379}
Tim Petersced69f82003-09-16 20:30:58 +00009380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381PyObject *
9382_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9383{
9384 if (!PyUnicode_Check(unicode)) {
9385 PyErr_BadInternalCall();
9386 return NULL;
9387 }
9388 if (PyUnicode_READY(unicode) == -1)
9389 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009390 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 /* If the string is already ASCII, just return the same string */
9392 Py_INCREF(unicode);
9393 return unicode;
9394 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009395
9396 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9397 PyObject *result = PyUnicode_New(len, 127);
9398 if (result == NULL) {
9399 return NULL;
9400 }
9401
9402 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9403 int kind = PyUnicode_KIND(unicode);
9404 const void *data = PyUnicode_DATA(unicode);
9405 Py_ssize_t i;
9406 for (i = 0; i < len; ++i) {
9407 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9408 if (ch < 127) {
9409 out[i] = ch;
9410 }
9411 else if (Py_UNICODE_ISSPACE(ch)) {
9412 out[i] = ' ';
9413 }
9414 else {
9415 int decimal = Py_UNICODE_TODECIMAL(ch);
9416 if (decimal < 0) {
9417 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009418 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009419 _PyUnicode_LENGTH(result) = i + 1;
9420 break;
9421 }
9422 out[i] = '0' + decimal;
9423 }
9424 }
9425
INADA Naoki16dfca42018-07-14 12:06:43 +09009426 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009427 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009428}
9429
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009430PyObject *
9431PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9432 Py_ssize_t length)
9433{
Victor Stinnerf0124502011-11-21 23:12:56 +01009434 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009435 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009436 Py_UCS4 maxchar;
9437 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009438 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009439
Victor Stinner99d7ad02012-02-22 13:37:39 +01009440 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009441 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009442 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009443 if (ch > 127) {
9444 int decimal = Py_UNICODE_TODECIMAL(ch);
9445 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009446 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009447 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009448 }
9449 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009450
9451 /* Copy to a new string */
9452 decimal = PyUnicode_New(length, maxchar);
9453 if (decimal == NULL)
9454 return decimal;
9455 kind = PyUnicode_KIND(decimal);
9456 data = PyUnicode_DATA(decimal);
9457 /* Iterate over code points */
9458 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009459 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009460 if (ch > 127) {
9461 int decimal = Py_UNICODE_TODECIMAL(ch);
9462 if (decimal >= 0)
9463 ch = '0' + decimal;
9464 }
9465 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009467 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009468}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009469/* --- Decimal Encoder ---------------------------------------------------- */
9470
Alexander Belopolsky40018472011-02-26 01:02:56 +00009471int
9472PyUnicode_EncodeDecimal(Py_UNICODE *s,
9473 Py_ssize_t length,
9474 char *output,
9475 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009476{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009477 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009478 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009479 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009480 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009481
9482 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009483 PyErr_BadArgument();
9484 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009485 }
9486
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009487 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009488 if (unicode == NULL)
9489 return -1;
9490
Victor Stinner42bf7752011-11-21 22:52:58 +01009491 kind = PyUnicode_KIND(unicode);
9492 data = PyUnicode_DATA(unicode);
9493
Victor Stinnerb84d7232011-11-22 01:50:07 +01009494 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009495 PyObject *exc;
9496 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009497 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009498 Py_ssize_t startpos;
9499
9500 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009501
Benjamin Peterson29060642009-01-31 22:14:21 +00009502 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009503 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009504 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009506 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009507 decimal = Py_UNICODE_TODECIMAL(ch);
9508 if (decimal >= 0) {
9509 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009510 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009511 continue;
9512 }
9513 if (0 < ch && ch < 256) {
9514 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009515 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 continue;
9517 }
Victor Stinner6345be92011-11-25 20:09:01 +01009518
Victor Stinner42bf7752011-11-21 22:52:58 +01009519 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009520 exc = NULL;
9521 raise_encode_exception(&exc, "decimal", unicode,
9522 startpos, startpos+1,
9523 "invalid decimal Unicode string");
9524 Py_XDECREF(exc);
9525 Py_DECREF(unicode);
9526 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009527 }
9528 /* 0-terminate the output string */
9529 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009530 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009531 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009532}
9533
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534/* --- Helpers ------------------------------------------------------------ */
9535
/* Helper macro to fix up start/end slice values in place.
   `end` is clipped to `len`; a negative `start` or `end` is taken relative
   to `len` and clipped to 0 if it is still negative afterwards.
   `start` and `end` must be modifiable lvalues; all three arguments are
   evaluated more than once.
   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and can be used safely in unbraced if/else bodies; the
   invocation must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009552any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009554 Py_ssize_t end,
9555 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009557 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009558 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 Py_ssize_t len1, len2, result;
9560
9561 kind1 = PyUnicode_KIND(s1);
9562 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009563 if (kind1 < kind2)
9564 return -1;
9565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 len1 = PyUnicode_GET_LENGTH(s1);
9567 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009568 ADJUST_INDICES(start, end, len1);
9569 if (end - start < len2)
9570 return -1;
9571
9572 buf1 = PyUnicode_DATA(s1);
9573 buf2 = PyUnicode_DATA(s2);
9574 if (len2 == 1) {
9575 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9576 result = findchar((const char *)buf1 + kind1*start,
9577 kind1, end - start, ch, direction);
9578 if (result == -1)
9579 return -1;
9580 else
9581 return start + result;
9582 }
9583
9584 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009585 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009586 if (!buf2)
9587 return -2;
9588 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009589
Victor Stinner794d5672011-10-10 03:21:36 +02009590 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009591 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009592 case PyUnicode_1BYTE_KIND:
9593 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9594 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9595 else
9596 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9597 break;
9598 case PyUnicode_2BYTE_KIND:
9599 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9600 break;
9601 case PyUnicode_4BYTE_KIND:
9602 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9603 break;
9604 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009605 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009606 }
9607 }
9608 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009609 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009610 case PyUnicode_1BYTE_KIND:
9611 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9612 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9613 else
9614 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9615 break;
9616 case PyUnicode_2BYTE_KIND:
9617 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9618 break;
9619 case PyUnicode_4BYTE_KIND:
9620 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9621 break;
9622 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009623 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 }
9626
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009627 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009628 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009629 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630
9631 return result;
9632}
9633
Victor Stinner59423e32018-11-26 13:40:01 +01009634/* _PyUnicode_InsertThousandsGrouping() helper functions */
9635#include "stringlib/localeutil.h"
9636
9637/**
9638 * InsertThousandsGrouping:
9639 * @writer: Unicode writer.
9640 * @n_buffer: Number of characters in @buffer.
9641 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9642 * @d_pos: Start of digits string.
9643 * @n_digits: The number of digits in the string, in which we want
9644 * to put the grouping chars.
9645 * @min_width: The minimum width of the digits in the output string.
9646 * Output will be zero-padded on the left to fill.
9647 * @grouping: see definition in localeconv().
9648 * @thousands_sep: see definition in localeconv().
9649 *
9650 * There are 2 modes: counting and filling. If @writer is NULL,
9651 * we are in counting mode, else filling mode.
9652 * If counting, the required buffer size is returned.
9653 * If filling, we know the buffer will be large enough, so we don't
9654 * need to pass in the buffer size.
9655 * Inserts thousand grouping characters (as defined by grouping and
9656 * thousands_sep) into @writer.
9657 *
9658 * Return value: -1 on error, number of characters otherwise.
9659 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009661_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009662 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009663 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009664 PyObject *digits,
9665 Py_ssize_t d_pos,
9666 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009667 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009668 const char *grouping,
9669 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009670 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671{
Xtreak3f7983a2019-01-07 20:39:14 +05309672 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009673 if (writer) {
9674 assert(digits != NULL);
9675 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009676 }
9677 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009678 assert(digits == NULL);
9679 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009680 }
Victor Stinner59423e32018-11-26 13:40:01 +01009681 assert(0 <= d_pos);
9682 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009683 assert(grouping != NULL);
9684
9685 if (digits != NULL) {
9686 if (PyUnicode_READY(digits) == -1) {
9687 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009688 }
Victor Stinner59423e32018-11-26 13:40:01 +01009689 }
9690 if (PyUnicode_READY(thousands_sep) == -1) {
9691 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009692 }
9693
Victor Stinner59423e32018-11-26 13:40:01 +01009694 Py_ssize_t count = 0;
9695 Py_ssize_t n_zeros;
9696 int loop_broken = 0;
9697 int use_separator = 0; /* First time through, don't append the
9698 separator. They only go between
9699 groups. */
9700 Py_ssize_t buffer_pos;
9701 Py_ssize_t digits_pos;
9702 Py_ssize_t len;
9703 Py_ssize_t n_chars;
9704 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9705 be looked at */
9706 /* A generator that returns all of the grouping widths, until it
9707 returns 0. */
9708 GroupGenerator groupgen;
9709 GroupGenerator_init(&groupgen, grouping);
9710 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9711
9712 /* if digits are not grouped, thousands separator
9713 should be an empty string */
9714 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9715
9716 digits_pos = d_pos + n_digits;
9717 if (writer) {
9718 buffer_pos = writer->pos + n_buffer;
9719 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9720 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 }
Victor Stinner59423e32018-11-26 13:40:01 +01009722 else {
9723 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009724 }
Victor Stinner59423e32018-11-26 13:40:01 +01009725
9726 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009727 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009728 }
Victor Stinner59423e32018-11-26 13:40:01 +01009729
9730 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9731 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9732 n_zeros = Py_MAX(0, len - remaining);
9733 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9734
9735 /* Use n_zero zero's and n_chars chars */
9736
9737 /* Count only, don't do anything. */
9738 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9739
9740 /* Copy into the writer. */
9741 InsertThousandsGrouping_fill(writer, &buffer_pos,
9742 digits, &digits_pos,
9743 n_chars, n_zeros,
9744 use_separator ? thousands_sep : NULL,
9745 thousands_sep_len, maxchar);
9746
9747 /* Use a separator next time. */
9748 use_separator = 1;
9749
9750 remaining -= n_chars;
9751 min_width -= len;
9752
9753 if (remaining <= 0 && min_width <= 0) {
9754 loop_broken = 1;
9755 break;
9756 }
9757 min_width -= thousands_sep_len;
9758 }
9759 if (!loop_broken) {
9760 /* We left the loop without using a break statement. */
9761
9762 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9763 n_zeros = Py_MAX(0, len - remaining);
9764 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9765
9766 /* Use n_zero zero's and n_chars chars */
9767 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9768
9769 /* Copy into the writer. */
9770 InsertThousandsGrouping_fill(writer, &buffer_pos,
9771 digits, &digits_pos,
9772 n_chars, n_zeros,
9773 use_separator ? thousands_sep : NULL,
9774 thousands_sep_len, maxchar);
9775 }
9776 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777}
9778
9779
Alexander Belopolsky40018472011-02-26 01:02:56 +00009780Py_ssize_t
9781PyUnicode_Count(PyObject *str,
9782 PyObject *substr,
9783 Py_ssize_t start,
9784 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009786 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009787 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009788 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009790
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009791 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009792 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009793
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009794 kind1 = PyUnicode_KIND(str);
9795 kind2 = PyUnicode_KIND(substr);
9796 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009797 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009798
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009799 len1 = PyUnicode_GET_LENGTH(str);
9800 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009802 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009803 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009804
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009805 buf1 = PyUnicode_DATA(str);
9806 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009807 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009808 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009809 if (!buf2)
9810 goto onError;
9811 }
9812
9813 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009815 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009816 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009817 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009818 buf2, len2, PY_SSIZE_T_MAX
9819 );
9820 else
9821 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009822 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009823 buf2, len2, PY_SSIZE_T_MAX
9824 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 break;
9826 case PyUnicode_2BYTE_KIND:
9827 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009828 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 buf2, len2, PY_SSIZE_T_MAX
9830 );
9831 break;
9832 case PyUnicode_4BYTE_KIND:
9833 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009834 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 buf2, len2, PY_SSIZE_T_MAX
9836 );
9837 break;
9838 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009839 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009841
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009842 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009843 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009844 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009848 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9849 if (kind2 != kind1)
9850 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852}
9853
Alexander Belopolsky40018472011-02-26 01:02:56 +00009854Py_ssize_t
9855PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009856 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009857 Py_ssize_t start,
9858 Py_ssize_t end,
9859 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009861 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009862 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009863
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009864 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865}
9866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867Py_ssize_t
9868PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9869 Py_ssize_t start, Py_ssize_t end,
9870 int direction)
9871{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009873 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009874 if (PyUnicode_READY(str) == -1)
9875 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009876 len = PyUnicode_GET_LENGTH(str);
9877 ADJUST_INDICES(start, end, len);
9878 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009879 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009881 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9882 kind, end-start, ch, direction);
9883 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009885 else
9886 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887}
9888
Alexander Belopolsky40018472011-02-26 01:02:56 +00009889static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009890tailmatch(PyObject *self,
9891 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009892 Py_ssize_t start,
9893 Py_ssize_t end,
9894 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 int kind_self;
9897 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009898 const void *data_self;
9899 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 Py_ssize_t offset;
9901 Py_ssize_t i;
9902 Py_ssize_t end_sub;
9903
9904 if (PyUnicode_READY(self) == -1 ||
9905 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009906 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9909 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009911 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009913 if (PyUnicode_GET_LENGTH(substring) == 0)
9914 return 1;
9915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 kind_self = PyUnicode_KIND(self);
9917 data_self = PyUnicode_DATA(self);
9918 kind_sub = PyUnicode_KIND(substring);
9919 data_sub = PyUnicode_DATA(substring);
9920 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9921
9922 if (direction > 0)
9923 offset = end;
9924 else
9925 offset = start;
9926
9927 if (PyUnicode_READ(kind_self, data_self, offset) ==
9928 PyUnicode_READ(kind_sub, data_sub, 0) &&
9929 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9930 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9931 /* If both are of the same kind, memcmp is sufficient */
9932 if (kind_self == kind_sub) {
9933 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009934 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 data_sub,
9936 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009937 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009939 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 else {
9941 /* We do not need to compare 0 and len(substring)-1 because
9942 the if statement above ensured already that they are equal
9943 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 for (i = 1; i < end_sub; ++i) {
9945 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9946 PyUnicode_READ(kind_sub, data_sub, i))
9947 return 0;
9948 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009949 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951 }
9952
9953 return 0;
9954}
9955
Alexander Belopolsky40018472011-02-26 01:02:56 +00009956Py_ssize_t
9957PyUnicode_Tailmatch(PyObject *str,
9958 PyObject *substr,
9959 Py_ssize_t start,
9960 Py_ssize_t end,
9961 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009962{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009963 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009964 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009965
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009966 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009967}
9968
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009969static PyObject *
9970ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009971{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009972 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009973 const char *data = PyUnicode_DATA(self);
9974 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009975 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009976
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009977 res = PyUnicode_New(len, 127);
9978 if (res == NULL)
9979 return NULL;
9980 resdata = PyUnicode_DATA(res);
9981 if (lower)
9982 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009984 _Py_bytes_upper(resdata, data, len);
9985 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986}
9987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009989handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009991 Py_ssize_t j;
9992 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009993 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009994 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009995
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009996 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9997
9998 where ! is a negation and \p{xxx} is a character with property xxx.
9999 */
10000 for (j = i - 1; j >= 0; j--) {
10001 c = PyUnicode_READ(kind, data, j);
10002 if (!_PyUnicode_IsCaseIgnorable(c))
10003 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010005 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10006 if (final_sigma) {
10007 for (j = i + 1; j < length; j++) {
10008 c = PyUnicode_READ(kind, data, j);
10009 if (!_PyUnicode_IsCaseIgnorable(c))
10010 break;
10011 }
10012 final_sigma = j == length || !_PyUnicode_IsCased(c);
10013 }
10014 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015}
10016
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010017static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010018lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010019 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010021 /* Obscure special case. */
10022 if (c == 0x3A3) {
10023 mapped[0] = handle_capital_sigma(kind, data, length, i);
10024 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010026 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027}
10028
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010029static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010030do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010032 Py_ssize_t i, k = 0;
10033 int n_res, j;
10034 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010035
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010036 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010037 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010038 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010039 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010040 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010042 for (i = 1; i < length; i++) {
10043 c = PyUnicode_READ(kind, data, i);
10044 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10045 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010046 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010047 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010048 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010049 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010050 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010051}
10052
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010053static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010054do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010055 Py_ssize_t i, k = 0;
10056
10057 for (i = 0; i < length; i++) {
10058 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10059 int n_res, j;
10060 if (Py_UNICODE_ISUPPER(c)) {
10061 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10062 }
10063 else if (Py_UNICODE_ISLOWER(c)) {
10064 n_res = _PyUnicode_ToUpperFull(c, mapped);
10065 }
10066 else {
10067 n_res = 1;
10068 mapped[0] = c;
10069 }
10070 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010071 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010072 res[k++] = mapped[j];
10073 }
10074 }
10075 return k;
10076}
10077
10078static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010079do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010080 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010082 Py_ssize_t i, k = 0;
10083
10084 for (i = 0; i < length; i++) {
10085 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10086 int n_res, j;
10087 if (lower)
10088 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10089 else
10090 n_res = _PyUnicode_ToUpperFull(c, mapped);
10091 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010092 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010093 res[k++] = mapped[j];
10094 }
10095 }
10096 return k;
10097}
10098
10099static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010100do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010101{
10102 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10103}
10104
10105static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010106do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010107{
10108 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10109}
10110
Benjamin Petersone51757f2012-01-12 21:10:29 -050010111static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010112do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010113{
10114 Py_ssize_t i, k = 0;
10115
10116 for (i = 0; i < length; i++) {
10117 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10118 Py_UCS4 mapped[3];
10119 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10120 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010121 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010122 res[k++] = mapped[j];
10123 }
10124 }
10125 return k;
10126}
10127
10128static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010129do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010130{
10131 Py_ssize_t i, k = 0;
10132 int previous_is_cased;
10133
10134 previous_is_cased = 0;
10135 for (i = 0; i < length; i++) {
10136 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10137 Py_UCS4 mapped[3];
10138 int n_res, j;
10139
10140 if (previous_is_cased)
10141 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10142 else
10143 n_res = _PyUnicode_ToTitleFull(c, mapped);
10144
10145 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010146 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010147 res[k++] = mapped[j];
10148 }
10149
10150 previous_is_cased = _PyUnicode_IsCased(c);
10151 }
10152 return k;
10153}
10154
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010155static PyObject *
10156case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010157 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010158{
10159 PyObject *res = NULL;
10160 Py_ssize_t length, newlength = 0;
10161 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010162 const void *data;
10163 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010164 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10165
Benjamin Petersoneea48462012-01-16 14:28:50 -050010166 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010167
10168 kind = PyUnicode_KIND(self);
10169 data = PyUnicode_DATA(self);
10170 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010171 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010172 PyErr_SetString(PyExc_OverflowError, "string is too long");
10173 return NULL;
10174 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010175 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010176 if (tmp == NULL)
10177 return PyErr_NoMemory();
10178 newlength = perform(kind, data, length, tmp, &maxchar);
10179 res = PyUnicode_New(newlength, maxchar);
10180 if (res == NULL)
10181 goto leave;
10182 tmpend = tmp + newlength;
10183 outdata = PyUnicode_DATA(res);
10184 outkind = PyUnicode_KIND(res);
10185 switch (outkind) {
10186 case PyUnicode_1BYTE_KIND:
10187 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10188 break;
10189 case PyUnicode_2BYTE_KIND:
10190 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10191 break;
10192 case PyUnicode_4BYTE_KIND:
10193 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10194 break;
10195 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010196 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010197 }
10198 leave:
10199 PyMem_FREE(tmp);
10200 return res;
10201}
10202
Tim Peters8ce9f162004-08-27 01:49:32 +000010203PyObject *
10204PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010206 PyObject *res;
10207 PyObject *fseq;
10208 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010209 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010211 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010212 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010213 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010214 }
10215
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010216 /* NOTE: the following code can't call back into Python code,
10217 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010218 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010219
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010220 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010221 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010222 res = _PyUnicode_JoinArray(separator, items, seqlen);
10223 Py_DECREF(fseq);
10224 return res;
10225}
10226
10227PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010228_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010229{
10230 PyObject *res = NULL; /* the result */
10231 PyObject *sep = NULL;
10232 Py_ssize_t seplen;
10233 PyObject *item;
10234 Py_ssize_t sz, i, res_offset;
10235 Py_UCS4 maxchar;
10236 Py_UCS4 item_maxchar;
10237 int use_memcpy;
10238 unsigned char *res_data = NULL, *sep_data = NULL;
10239 PyObject *last_obj;
10240 unsigned int kind = 0;
10241
Tim Peters05eba1f2004-08-27 21:32:02 +000010242 /* If empty sequence, return u"". */
10243 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010244 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010245 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010246
Tim Peters05eba1f2004-08-27 21:32:02 +000010247 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010248 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010249 if (seqlen == 1) {
10250 if (PyUnicode_CheckExact(items[0])) {
10251 res = items[0];
10252 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010253 return res;
10254 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010255 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010256 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010257 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010258 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010259 /* Set up sep and seplen */
10260 if (separator == NULL) {
10261 /* fall back to a blank space separator */
10262 sep = PyUnicode_FromOrdinal(' ');
10263 if (!sep)
10264 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010265 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010266 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010267 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010268 else {
10269 if (!PyUnicode_Check(separator)) {
10270 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010271 "separator: expected str instance,"
10272 " %.80s found",
10273 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010274 goto onError;
10275 }
10276 if (PyUnicode_READY(separator))
10277 goto onError;
10278 sep = separator;
10279 seplen = PyUnicode_GET_LENGTH(separator);
10280 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10281 /* inc refcount to keep this code path symmetric with the
10282 above case of a blank separator */
10283 Py_INCREF(sep);
10284 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010285 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010286 }
10287
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010288 /* There are at least two things to join, or else we have a subclass
10289 * of str in the sequence.
10290 * Do a pre-pass to figure out the total amount of space we'll
10291 * need (sz), and see whether all argument are strings.
10292 */
10293 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010294#ifdef Py_DEBUG
10295 use_memcpy = 0;
10296#else
10297 use_memcpy = 1;
10298#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010299 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010300 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010301 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010302 if (!PyUnicode_Check(item)) {
10303 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010304 "sequence item %zd: expected str instance,"
10305 " %.80s found",
10306 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010307 goto onError;
10308 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 if (PyUnicode_READY(item) == -1)
10310 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010311 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010313 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010314 if (i != 0) {
10315 add_sz += seplen;
10316 }
10317 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010318 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010319 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010320 goto onError;
10321 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010322 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010323 if (use_memcpy && last_obj != NULL) {
10324 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10325 use_memcpy = 0;
10326 }
10327 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010328 }
Tim Petersced69f82003-09-16 20:30:58 +000010329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010331 if (res == NULL)
10332 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010333
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010334 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010335#ifdef Py_DEBUG
10336 use_memcpy = 0;
10337#else
10338 if (use_memcpy) {
10339 res_data = PyUnicode_1BYTE_DATA(res);
10340 kind = PyUnicode_KIND(res);
10341 if (seplen != 0)
10342 sep_data = PyUnicode_1BYTE_DATA(sep);
10343 }
10344#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010345 if (use_memcpy) {
10346 for (i = 0; i < seqlen; ++i) {
10347 Py_ssize_t itemlen;
10348 item = items[i];
10349
10350 /* Copy item, and maybe the separator. */
10351 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010352 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010353 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010354 kind * seplen);
10355 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010356 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010357
10358 itemlen = PyUnicode_GET_LENGTH(item);
10359 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010360 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010361 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010362 kind * itemlen);
10363 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010364 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010365 }
10366 assert(res_data == PyUnicode_1BYTE_DATA(res)
10367 + kind * PyUnicode_GET_LENGTH(res));
10368 }
10369 else {
10370 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10371 Py_ssize_t itemlen;
10372 item = items[i];
10373
10374 /* Copy item, and maybe the separator. */
10375 if (i && seplen != 0) {
10376 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10377 res_offset += seplen;
10378 }
10379
10380 itemlen = PyUnicode_GET_LENGTH(item);
10381 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010382 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010383 res_offset += itemlen;
10384 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010385 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010386 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010387 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010390 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392
Benjamin Peterson29060642009-01-31 22:14:21 +000010393 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010395 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396 return NULL;
10397}
10398
Victor Stinnerd3f08822012-05-29 12:57:52 +020010399void
10400_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10401 Py_UCS4 fill_char)
10402{
10403 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010404 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010405 assert(PyUnicode_IS_READY(unicode));
10406 assert(unicode_modifiable(unicode));
10407 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10408 assert(start >= 0);
10409 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010410 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010411}
10412
Victor Stinner3fe55312012-01-04 00:33:50 +010010413Py_ssize_t
10414PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10415 Py_UCS4 fill_char)
10416{
10417 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010418
10419 if (!PyUnicode_Check(unicode)) {
10420 PyErr_BadInternalCall();
10421 return -1;
10422 }
10423 if (PyUnicode_READY(unicode) == -1)
10424 return -1;
10425 if (unicode_check_modifiable(unicode))
10426 return -1;
10427
Victor Stinnerd3f08822012-05-29 12:57:52 +020010428 if (start < 0) {
10429 PyErr_SetString(PyExc_IndexError, "string index out of range");
10430 return -1;
10431 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010432 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10433 PyErr_SetString(PyExc_ValueError,
10434 "fill character is bigger than "
10435 "the string maximum character");
10436 return -1;
10437 }
10438
10439 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10440 length = Py_MIN(maxlen, length);
10441 if (length <= 0)
10442 return 0;
10443
Victor Stinnerd3f08822012-05-29 12:57:52 +020010444 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010445 return length;
10446}
10447
Victor Stinner9310abb2011-10-05 00:59:23 +020010448static PyObject *
10449pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010450 Py_ssize_t left,
10451 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010453{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 PyObject *u;
10455 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010456 int kind;
10457 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010458
10459 if (left < 0)
10460 left = 0;
10461 if (right < 0)
10462 right = 0;
10463
Victor Stinnerc4b49542011-12-11 22:44:26 +010010464 if (left == 0 && right == 0)
10465 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10468 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010469 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10470 return NULL;
10471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010473 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010475 if (!u)
10476 return NULL;
10477
10478 kind = PyUnicode_KIND(u);
10479 data = PyUnicode_DATA(u);
10480 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010481 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010482 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010483 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010484 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010485 assert(_PyUnicode_CheckConsistency(u, 1));
10486 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487}
10488
Alexander Belopolsky40018472011-02-26 01:02:56 +000010489PyObject *
10490PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010494 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496
Benjamin Petersonead6b532011-12-20 17:23:42 -060010497 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010499 if (PyUnicode_IS_ASCII(string))
10500 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010501 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010502 PyUnicode_GET_LENGTH(string), keepends);
10503 else
10504 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010505 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010506 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 break;
10508 case PyUnicode_2BYTE_KIND:
10509 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010510 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 PyUnicode_GET_LENGTH(string), keepends);
10512 break;
10513 case PyUnicode_4BYTE_KIND:
10514 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010515 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 PyUnicode_GET_LENGTH(string), keepends);
10517 break;
10518 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010519 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522}
10523
Alexander Belopolsky40018472011-02-26 01:02:56 +000010524static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010525split(PyObject *self,
10526 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010527 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010529 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010530 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 Py_ssize_t len1, len2;
10532 PyObject* out;
10533
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010535 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 if (PyUnicode_READY(self) == -1)
10538 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010541 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010543 if (PyUnicode_IS_ASCII(self))
10544 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010545 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010546 PyUnicode_GET_LENGTH(self), maxcount
10547 );
10548 else
10549 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010550 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010551 PyUnicode_GET_LENGTH(self), maxcount
10552 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 case PyUnicode_2BYTE_KIND:
10554 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010555 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 PyUnicode_GET_LENGTH(self), maxcount
10557 );
10558 case PyUnicode_4BYTE_KIND:
10559 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010560 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 PyUnicode_GET_LENGTH(self), maxcount
10562 );
10563 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010564 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 }
10566
10567 if (PyUnicode_READY(substring) == -1)
10568 return NULL;
10569
10570 kind1 = PyUnicode_KIND(self);
10571 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 len1 = PyUnicode_GET_LENGTH(self);
10573 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010574 if (kind1 < kind2 || len1 < len2) {
10575 out = PyList_New(1);
10576 if (out == NULL)
10577 return NULL;
10578 Py_INCREF(self);
10579 PyList_SET_ITEM(out, 0, self);
10580 return out;
10581 }
10582 buf1 = PyUnicode_DATA(self);
10583 buf2 = PyUnicode_DATA(substring);
10584 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010585 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010586 if (!buf2)
10587 return NULL;
10588 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010590 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010592 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10593 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010594 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010595 else
10596 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010597 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 break;
10599 case PyUnicode_2BYTE_KIND:
10600 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010601 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 break;
10603 case PyUnicode_4BYTE_KIND:
10604 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010605 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 break;
10607 default:
10608 out = NULL;
10609 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010610 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010611 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010612 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614}
10615
Alexander Belopolsky40018472011-02-26 01:02:56 +000010616static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010617rsplit(PyObject *self,
10618 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010619 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010620{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010621 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010622 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 Py_ssize_t len1, len2;
10624 PyObject* out;
10625
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010626 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010627 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 if (PyUnicode_READY(self) == -1)
10630 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010633 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010635 if (PyUnicode_IS_ASCII(self))
10636 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010637 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010638 PyUnicode_GET_LENGTH(self), maxcount
10639 );
10640 else
10641 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010642 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010643 PyUnicode_GET_LENGTH(self), maxcount
10644 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 case PyUnicode_2BYTE_KIND:
10646 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010647 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 PyUnicode_GET_LENGTH(self), maxcount
10649 );
10650 case PyUnicode_4BYTE_KIND:
10651 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010652 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 PyUnicode_GET_LENGTH(self), maxcount
10654 );
10655 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010656 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 }
10658
10659 if (PyUnicode_READY(substring) == -1)
10660 return NULL;
10661
10662 kind1 = PyUnicode_KIND(self);
10663 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 len1 = PyUnicode_GET_LENGTH(self);
10665 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010666 if (kind1 < kind2 || len1 < len2) {
10667 out = PyList_New(1);
10668 if (out == NULL)
10669 return NULL;
10670 Py_INCREF(self);
10671 PyList_SET_ITEM(out, 0, self);
10672 return out;
10673 }
10674 buf1 = PyUnicode_DATA(self);
10675 buf2 = PyUnicode_DATA(substring);
10676 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010677 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010678 if (!buf2)
10679 return NULL;
10680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010682 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010684 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10685 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010686 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010687 else
10688 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010689 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 break;
10691 case PyUnicode_2BYTE_KIND:
10692 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010693 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 break;
10695 case PyUnicode_4BYTE_KIND:
10696 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010697 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 break;
10699 default:
10700 out = NULL;
10701 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010702 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010703 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010704 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 return out;
10706}
10707
10708static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010709anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10710 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010711{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010712 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010714 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10715 return asciilib_find(buf1, len1, buf2, len2, offset);
10716 else
10717 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 case PyUnicode_2BYTE_KIND:
10719 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10720 case PyUnicode_4BYTE_KIND:
10721 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10722 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010723 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724}
10725
10726static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010727anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10728 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010730 switch (kind) {
10731 case PyUnicode_1BYTE_KIND:
10732 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10733 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10734 else
10735 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10736 case PyUnicode_2BYTE_KIND:
10737 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10738 case PyUnicode_4BYTE_KIND:
10739 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10740 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010741 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010742}
10743
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010744static void
10745replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10746 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10747{
10748 int kind = PyUnicode_KIND(u);
10749 void *data = PyUnicode_DATA(u);
10750 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10751 if (kind == PyUnicode_1BYTE_KIND) {
10752 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10753 (Py_UCS1 *)data + len,
10754 u1, u2, maxcount);
10755 }
10756 else if (kind == PyUnicode_2BYTE_KIND) {
10757 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10758 (Py_UCS2 *)data + len,
10759 u1, u2, maxcount);
10760 }
10761 else {
10762 assert(kind == PyUnicode_4BYTE_KIND);
10763 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10764 (Py_UCS4 *)data + len,
10765 u1, u2, maxcount);
10766 }
10767}
10768
Alexander Belopolsky40018472011-02-26 01:02:56 +000010769static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770replace(PyObject *self, PyObject *str1,
10771 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010774 const char *sbuf = PyUnicode_DATA(self);
10775 const void *buf1 = PyUnicode_DATA(str1);
10776 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 int srelease = 0, release1 = 0, release2 = 0;
10778 int skind = PyUnicode_KIND(self);
10779 int kind1 = PyUnicode_KIND(str1);
10780 int kind2 = PyUnicode_KIND(str2);
10781 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10782 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10783 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010784 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010785 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010787 if (slen < len1)
10788 goto nothing;
10789
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010791 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010792 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010793 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010794
Victor Stinner59de0ee2011-10-07 10:01:28 +020010795 if (str1 == str2)
10796 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797
Victor Stinner49a0a212011-10-12 23:46:10 +020010798 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010799 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10800 if (maxchar < maxchar_str1)
10801 /* substring too wide to be present */
10802 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010803 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10804 /* Replacing str1 with str2 may cause a maxchar reduction in the
10805 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010806 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010807 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010810 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010812 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010814 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010815 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010816 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010817
Victor Stinner69ed0f42013-04-09 21:48:24 +020010818 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010819 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010820 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010821 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010822 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010824 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010826
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010827 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10828 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010829 }
10830 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 int rkind = skind;
10832 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010833 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 if (kind1 < rkind) {
10836 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010837 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 if (!buf1) goto error;
10839 release1 = 1;
10840 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010841 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010842 if (i < 0)
10843 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 if (rkind > kind2) {
10845 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010846 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 if (!buf2) goto error;
10848 release2 = 1;
10849 }
10850 else if (rkind < kind2) {
10851 /* widen self and buf1 */
10852 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010853 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010854 assert(buf1 != PyUnicode_DATA(str1));
10855 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010856 buf1 = PyUnicode_DATA(str1);
10857 release1 = 0;
10858 }
10859 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860 if (!sbuf) goto error;
10861 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010862 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 if (!buf1) goto error;
10864 release1 = 1;
10865 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010866 u = PyUnicode_New(slen, maxchar);
10867 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010869 assert(PyUnicode_KIND(u) == rkind);
10870 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010871
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010872 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010873 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010874 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010876 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010878
10879 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010880 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010881 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010882 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010883 if (i == -1)
10884 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010885 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010887 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010888 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010891 }
10892 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010894 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895 int rkind = skind;
10896 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010898 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010899 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010900 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010901 if (!buf1) goto error;
10902 release1 = 1;
10903 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010904 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010905 if (n == 0)
10906 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010908 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010909 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 if (!buf2) goto error;
10911 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010913 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010914 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010915 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010916 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917 if (!sbuf) goto error;
10918 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010919 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010920 assert(buf1 != PyUnicode_DATA(str1));
10921 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010922 buf1 = PyUnicode_DATA(str1);
10923 release1 = 0;
10924 }
10925 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010926 if (!buf1) goto error;
10927 release1 = 1;
10928 }
10929 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10930 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010931 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 PyErr_SetString(PyExc_OverflowError,
10933 "replace string is too long");
10934 goto error;
10935 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010936 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010937 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020010938 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020010939 goto done;
10940 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010941 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 PyErr_SetString(PyExc_OverflowError,
10943 "replace string is too long");
10944 goto error;
10945 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010946 u = PyUnicode_New(new_size, maxchar);
10947 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010949 assert(PyUnicode_KIND(u) == rkind);
10950 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 ires = i = 0;
10952 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010953 while (n-- > 0) {
10954 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010955 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010956 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010957 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010958 if (j == -1)
10959 break;
10960 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010961 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010962 memcpy(res + rkind * ires,
10963 sbuf + rkind * i,
10964 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010966 }
10967 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010969 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010971 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010973 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010975 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010977 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010978 memcpy(res + rkind * ires,
10979 sbuf + rkind * i,
10980 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010981 }
10982 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010983 /* interleave */
10984 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010985 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010987 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010989 if (--n <= 0)
10990 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010991 memcpy(res + rkind * ires,
10992 sbuf + rkind * i,
10993 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 ires++;
10995 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010996 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010997 memcpy(res + rkind * ires,
10998 sbuf + rkind * i,
10999 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011000 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011001 }
11002
11003 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011004 unicode_adjust_maxchar(&u);
11005 if (u == NULL)
11006 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011008
11009 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011010 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11011 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11012 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011014 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011016 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011018 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011019 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011021
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011023 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011024 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11025 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11026 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011028 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011030 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011032 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011033 return unicode_result_unchanged(self);
11034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011036 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11037 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11038 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11039 if (srelease)
11040 PyMem_FREE((void *)sbuf);
11041 if (release1)
11042 PyMem_FREE((void *)buf1);
11043 if (release2)
11044 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046}
11047
11048/* --- Unicode Object Methods --------------------------------------------- */
11049
INADA Naoki3ae20562017-01-16 20:41:20 +090011050/*[clinic input]
11051str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052
INADA Naoki3ae20562017-01-16 20:41:20 +090011053Return a version of the string where each word is titlecased.
11054
11055More specifically, words start with uppercased characters and all remaining
11056cased characters have lower case.
11057[clinic start generated code]*/
11058
11059static PyObject *
11060unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011061/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011063 if (PyUnicode_READY(self) == -1)
11064 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011065 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066}
11067
INADA Naoki3ae20562017-01-16 20:41:20 +090011068/*[clinic input]
11069str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070
INADA Naoki3ae20562017-01-16 20:41:20 +090011071Return a capitalized version of the string.
11072
11073More specifically, make the first character have upper case and the rest lower
11074case.
11075[clinic start generated code]*/
11076
11077static PyObject *
11078unicode_capitalize_impl(PyObject *self)
11079/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011081 if (PyUnicode_READY(self) == -1)
11082 return NULL;
11083 if (PyUnicode_GET_LENGTH(self) == 0)
11084 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011085 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011086}
11087
INADA Naoki3ae20562017-01-16 20:41:20 +090011088/*[clinic input]
11089str.casefold as unicode_casefold
11090
11091Return a version of the string suitable for caseless comparisons.
11092[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011093
11094static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011095unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011096/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011097{
11098 if (PyUnicode_READY(self) == -1)
11099 return NULL;
11100 if (PyUnicode_IS_ASCII(self))
11101 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011102 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011103}
11104
11105
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011106/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011107
11108static int
11109convert_uc(PyObject *obj, void *addr)
11110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011112
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011113 if (!PyUnicode_Check(obj)) {
11114 PyErr_Format(PyExc_TypeError,
11115 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011116 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011117 return 0;
11118 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011119 if (PyUnicode_READY(obj) < 0)
11120 return 0;
11121 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011122 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011123 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011124 return 0;
11125 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011126 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011127 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011128}
11129
INADA Naoki3ae20562017-01-16 20:41:20 +090011130/*[clinic input]
11131str.center as unicode_center
11132
11133 width: Py_ssize_t
11134 fillchar: Py_UCS4 = ' '
11135 /
11136
11137Return a centered string of length width.
11138
11139Padding is done using the specified fill character (default is a space).
11140[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141
11142static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011143unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11144/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011146 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147
Benjamin Petersonbac79492012-01-14 13:34:47 -050011148 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149 return NULL;
11150
Victor Stinnerc4b49542011-12-11 22:44:26 +010011151 if (PyUnicode_GET_LENGTH(self) >= width)
11152 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153
Victor Stinnerc4b49542011-12-11 22:44:26 +010011154 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155 left = marg / 2 + (marg & width & 1);
11156
Victor Stinner9310abb2011-10-05 00:59:23 +020011157 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158}
11159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160/* This function assumes that str1 and str2 are readied by the caller. */
11161
Marc-André Lemburge5034372000-08-08 08:04:29 +000011162static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011163unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011164{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011165#define COMPARE(TYPE1, TYPE2) \
11166 do { \
11167 TYPE1* p1 = (TYPE1 *)data1; \
11168 TYPE2* p2 = (TYPE2 *)data2; \
11169 TYPE1* end = p1 + len; \
11170 Py_UCS4 c1, c2; \
11171 for (; p1 != end; p1++, p2++) { \
11172 c1 = *p1; \
11173 c2 = *p2; \
11174 if (c1 != c2) \
11175 return (c1 < c2) ? -1 : 1; \
11176 } \
11177 } \
11178 while (0)
11179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011181 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011182 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 kind1 = PyUnicode_KIND(str1);
11185 kind2 = PyUnicode_KIND(str2);
11186 data1 = PyUnicode_DATA(str1);
11187 data2 = PyUnicode_DATA(str2);
11188 len1 = PyUnicode_GET_LENGTH(str1);
11189 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011190 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011191
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011192 switch(kind1) {
11193 case PyUnicode_1BYTE_KIND:
11194 {
11195 switch(kind2) {
11196 case PyUnicode_1BYTE_KIND:
11197 {
11198 int cmp = memcmp(data1, data2, len);
11199 /* normalize result of memcmp() into the range [-1; 1] */
11200 if (cmp < 0)
11201 return -1;
11202 if (cmp > 0)
11203 return 1;
11204 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011205 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011206 case PyUnicode_2BYTE_KIND:
11207 COMPARE(Py_UCS1, Py_UCS2);
11208 break;
11209 case PyUnicode_4BYTE_KIND:
11210 COMPARE(Py_UCS1, Py_UCS4);
11211 break;
11212 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011213 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011214 }
11215 break;
11216 }
11217 case PyUnicode_2BYTE_KIND:
11218 {
11219 switch(kind2) {
11220 case PyUnicode_1BYTE_KIND:
11221 COMPARE(Py_UCS2, Py_UCS1);
11222 break;
11223 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011224 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011225 COMPARE(Py_UCS2, Py_UCS2);
11226 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011227 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011228 case PyUnicode_4BYTE_KIND:
11229 COMPARE(Py_UCS2, Py_UCS4);
11230 break;
11231 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011232 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011233 }
11234 break;
11235 }
11236 case PyUnicode_4BYTE_KIND:
11237 {
11238 switch(kind2) {
11239 case PyUnicode_1BYTE_KIND:
11240 COMPARE(Py_UCS4, Py_UCS1);
11241 break;
11242 case PyUnicode_2BYTE_KIND:
11243 COMPARE(Py_UCS4, Py_UCS2);
11244 break;
11245 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011246 {
11247#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11248 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11249 /* normalize result of wmemcmp() into the range [-1; 1] */
11250 if (cmp < 0)
11251 return -1;
11252 if (cmp > 0)
11253 return 1;
11254#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011255 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011256#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011257 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011258 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011259 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011260 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011261 }
11262 break;
11263 }
11264 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011265 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011266 }
11267
Victor Stinner770e19e2012-10-04 22:59:45 +020011268 if (len1 == len2)
11269 return 0;
11270 if (len1 < len2)
11271 return -1;
11272 else
11273 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011274
11275#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011276}
11277
Benjamin Peterson621b4302016-09-09 13:54:34 -070011278static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011279unicode_compare_eq(PyObject *str1, PyObject *str2)
11280{
11281 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011282 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011283 Py_ssize_t len;
11284 int cmp;
11285
Victor Stinnere5567ad2012-10-23 02:48:49 +020011286 len = PyUnicode_GET_LENGTH(str1);
11287 if (PyUnicode_GET_LENGTH(str2) != len)
11288 return 0;
11289 kind = PyUnicode_KIND(str1);
11290 if (PyUnicode_KIND(str2) != kind)
11291 return 0;
11292 data1 = PyUnicode_DATA(str1);
11293 data2 = PyUnicode_DATA(str2);
11294
11295 cmp = memcmp(data1, data2, len * kind);
11296 return (cmp == 0);
11297}
11298
11299
Alexander Belopolsky40018472011-02-26 01:02:56 +000011300int
11301PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11304 if (PyUnicode_READY(left) == -1 ||
11305 PyUnicode_READY(right) == -1)
11306 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011307
11308 /* a string is equal to itself */
11309 if (left == right)
11310 return 0;
11311
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011312 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011314 PyErr_Format(PyExc_TypeError,
11315 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011316 Py_TYPE(left)->tp_name,
11317 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318 return -1;
11319}
11320
Martin v. Löwis5b222132007-06-10 09:51:05 +000011321int
11322PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11323{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 Py_ssize_t i;
11325 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011327 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328
Victor Stinner910337b2011-10-03 03:20:16 +020011329 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011330 if (!PyUnicode_IS_READY(uni)) {
11331 const wchar_t *ws = _PyUnicode_WSTR(uni);
11332 /* Compare Unicode string and source character set string */
11333 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11334 if (chr != ustr[i])
11335 return (chr < ustr[i]) ? -1 : 1;
11336 }
11337 /* This check keeps Python strings that end in '\0' from comparing equal
11338 to C strings identical up to that point. */
11339 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11340 return 1; /* uni is longer */
11341 if (ustr[i])
11342 return -1; /* str is longer */
11343 return 0;
11344 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011346 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011347 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011348 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011349 size_t len, len2 = strlen(str);
11350 int cmp;
11351
11352 len = Py_MIN(len1, len2);
11353 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011354 if (cmp != 0) {
11355 if (cmp < 0)
11356 return -1;
11357 else
11358 return 1;
11359 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011360 if (len1 > len2)
11361 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011362 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011363 return -1; /* str is longer */
11364 return 0;
11365 }
11366 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011367 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011368 /* Compare Unicode string and source character set string */
11369 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011370 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011371 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11372 /* This check keeps Python strings that end in '\0' from comparing equal
11373 to C strings identical up to that point. */
11374 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11375 return 1; /* uni is longer */
11376 if (str[i])
11377 return -1; /* str is longer */
11378 return 0;
11379 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011380}
11381
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011382static int
11383non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11384{
11385 size_t i, len;
11386 const wchar_t *p;
11387 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11388 if (strlen(str) != len)
11389 return 0;
11390 p = _PyUnicode_WSTR(unicode);
11391 assert(p);
11392 for (i = 0; i < len; i++) {
11393 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011394 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011395 return 0;
11396 }
11397 return 1;
11398}
11399
11400int
11401_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11402{
11403 size_t len;
11404 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011405 assert(str);
11406#ifndef NDEBUG
11407 for (const char *p = str; *p; p++) {
11408 assert((unsigned char)*p < 128);
11409 }
11410#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011411 if (PyUnicode_READY(unicode) == -1) {
11412 /* Memory error or bad data */
11413 PyErr_Clear();
11414 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11415 }
11416 if (!PyUnicode_IS_ASCII(unicode))
11417 return 0;
11418 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11419 return strlen(str) == len &&
11420 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11421}
11422
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011423int
11424_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11425{
11426 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011427
11428 assert(_PyUnicode_CHECK(left));
11429 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011430#ifndef NDEBUG
11431 for (const char *p = right->string; *p; p++) {
11432 assert((unsigned char)*p < 128);
11433 }
11434#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011435
11436 if (PyUnicode_READY(left) == -1) {
11437 /* memory error or bad data */
11438 PyErr_Clear();
11439 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11440 }
11441
11442 if (!PyUnicode_IS_ASCII(left))
11443 return 0;
11444
11445 right_uni = _PyUnicode_FromId(right); /* borrowed */
11446 if (right_uni == NULL) {
11447 /* memory error or bad data */
11448 PyErr_Clear();
11449 return _PyUnicode_EqualToASCIIString(left, right->string);
11450 }
11451
11452 if (left == right_uni)
11453 return 1;
11454
11455 if (PyUnicode_CHECK_INTERNED(left))
11456 return 0;
11457
Victor Stinner607b1022020-05-05 18:50:30 +020011458#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011459 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011460 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011461 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11462 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011463#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011464
11465 return unicode_compare_eq(left, right_uni);
11466}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011467
Alexander Belopolsky40018472011-02-26 01:02:56 +000011468PyObject *
11469PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011470{
11471 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011472
Victor Stinnere5567ad2012-10-23 02:48:49 +020011473 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11474 Py_RETURN_NOTIMPLEMENTED;
11475
11476 if (PyUnicode_READY(left) == -1 ||
11477 PyUnicode_READY(right) == -1)
11478 return NULL;
11479
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011480 if (left == right) {
11481 switch (op) {
11482 case Py_EQ:
11483 case Py_LE:
11484 case Py_GE:
11485 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011486 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011487 case Py_NE:
11488 case Py_LT:
11489 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011490 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011491 default:
11492 PyErr_BadArgument();
11493 return NULL;
11494 }
11495 }
11496 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011497 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011498 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011499 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011500 }
11501 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011502 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011503 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011504 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011505}
11506
Alexander Belopolsky40018472011-02-26 01:02:56 +000011507int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011508_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11509{
11510 return unicode_eq(aa, bb);
11511}
11512
11513int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011514PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011515{
Victor Stinner77282cb2013-04-14 19:22:47 +020011516 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011517 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011519 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011520
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011521 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011523 "'in <string>' requires string as left operand, not %.100s",
11524 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011525 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011526 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011527 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011528 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011529 if (ensure_unicode(str) < 0)
11530 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011533 kind2 = PyUnicode_KIND(substr);
11534 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011535 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011537 len2 = PyUnicode_GET_LENGTH(substr);
11538 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011539 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011540 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011541 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011542 if (len2 == 1) {
11543 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11544 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011545 return result;
11546 }
11547 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011548 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011549 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011550 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011551 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552
Victor Stinner77282cb2013-04-14 19:22:47 +020011553 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 case PyUnicode_1BYTE_KIND:
11555 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11556 break;
11557 case PyUnicode_2BYTE_KIND:
11558 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11559 break;
11560 case PyUnicode_4BYTE_KIND:
11561 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11562 break;
11563 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011564 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011566
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011567 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011568 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011569 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570
Guido van Rossum403d68b2000-03-13 15:55:09 +000011571 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011572}
11573
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574/* Concat to string or Unicode object giving a new Unicode object. */
11575
Alexander Belopolsky40018472011-02-26 01:02:56 +000011576PyObject *
11577PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011579 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011580 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011581 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011583 if (ensure_unicode(left) < 0)
11584 return NULL;
11585
11586 if (!PyUnicode_Check(right)) {
11587 PyErr_Format(PyExc_TypeError,
11588 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011589 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011590 return NULL;
11591 }
11592 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011593 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594
11595 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011596 PyObject *empty = unicode_get_empty(); // Borrowed reference
11597 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011598 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011599 }
11600 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011601 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011604 left_len = PyUnicode_GET_LENGTH(left);
11605 right_len = PyUnicode_GET_LENGTH(right);
11606 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011607 PyErr_SetString(PyExc_OverflowError,
11608 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011609 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011610 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011611 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011612
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011613 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11614 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011615 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011618 result = PyUnicode_New(new_len, maxchar);
11619 if (result == NULL)
11620 return NULL;
11621 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11622 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11623 assert(_PyUnicode_CheckConsistency(result, 1));
11624 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625}
11626
Walter Dörwald1ab83302007-05-18 17:15:44 +000011627void
Victor Stinner23e56682011-10-03 03:54:37 +020011628PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011629{
Victor Stinner23e56682011-10-03 03:54:37 +020011630 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011631 Py_UCS4 maxchar, maxchar2;
11632 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011633
11634 if (p_left == NULL) {
11635 if (!PyErr_Occurred())
11636 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011637 return;
11638 }
Victor Stinner23e56682011-10-03 03:54:37 +020011639 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011640 if (right == NULL || left == NULL
11641 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011642 if (!PyErr_Occurred())
11643 PyErr_BadInternalCall();
11644 goto error;
11645 }
11646
Benjamin Petersonbac79492012-01-14 13:34:47 -050011647 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011648 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011649 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011650 goto error;
11651
Victor Stinner488fa492011-12-12 00:01:39 +010011652 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011653 PyObject *empty = unicode_get_empty(); // Borrowed reference
11654 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011655 Py_DECREF(left);
11656 Py_INCREF(right);
11657 *p_left = right;
11658 return;
11659 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011660 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011661 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011662 }
Victor Stinner488fa492011-12-12 00:01:39 +010011663
11664 left_len = PyUnicode_GET_LENGTH(left);
11665 right_len = PyUnicode_GET_LENGTH(right);
11666 if (left_len > PY_SSIZE_T_MAX - right_len) {
11667 PyErr_SetString(PyExc_OverflowError,
11668 "strings are too large to concat");
11669 goto error;
11670 }
11671 new_len = left_len + right_len;
11672
11673 if (unicode_modifiable(left)
11674 && PyUnicode_CheckExact(right)
11675 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011676 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11677 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011678 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011679 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011680 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11681 {
11682 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011683 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011684 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011685
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011686 /* copy 'right' into the newly allocated area of 'left' */
11687 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011688 }
Victor Stinner488fa492011-12-12 00:01:39 +010011689 else {
11690 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11691 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011692 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011693
Victor Stinner488fa492011-12-12 00:01:39 +010011694 /* Concat the two Unicode strings */
11695 res = PyUnicode_New(new_len, maxchar);
11696 if (res == NULL)
11697 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011698 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11699 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011700 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011701 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011702 }
11703 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011704 return;
11705
11706error:
Victor Stinner488fa492011-12-12 00:01:39 +010011707 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011708}
11709
11710void
11711PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11712{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011713 PyUnicode_Append(pleft, right);
11714 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011715}
11716
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011717/*
11718Wraps stringlib_parse_args_finds() and additionally ensures that the
11719first argument is a unicode object.
11720*/
11721
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011722static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011723parse_args_finds_unicode(const char * function_name, PyObject *args,
11724 PyObject **substring,
11725 Py_ssize_t *start, Py_ssize_t *end)
11726{
11727 if(stringlib_parse_args_finds(function_name, args, substring,
11728 start, end)) {
11729 if (ensure_unicode(*substring) < 0)
11730 return 0;
11731 return 1;
11732 }
11733 return 0;
11734}
11735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011736PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011737 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011739Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011740string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011741interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742
11743static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011744unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011746 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011747 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011748 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011750 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011751 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011754 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 kind1 = PyUnicode_KIND(self);
11758 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011759 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011760 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 len1 = PyUnicode_GET_LENGTH(self);
11763 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011765 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011766 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011767
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011768 buf1 = PyUnicode_DATA(self);
11769 buf2 = PyUnicode_DATA(substring);
11770 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011771 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011772 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011773 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011774 }
11775 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 case PyUnicode_1BYTE_KIND:
11777 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011778 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 buf2, len2, PY_SSIZE_T_MAX
11780 );
11781 break;
11782 case PyUnicode_2BYTE_KIND:
11783 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011784 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 buf2, len2, PY_SSIZE_T_MAX
11786 );
11787 break;
11788 case PyUnicode_4BYTE_KIND:
11789 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011790 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 buf2, len2, PY_SSIZE_T_MAX
11792 );
11793 break;
11794 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011795 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 }
11797
11798 result = PyLong_FromSsize_t(iresult);
11799
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011800 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011801 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011802 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804 return result;
11805}
11806
INADA Naoki3ae20562017-01-16 20:41:20 +090011807/*[clinic input]
11808str.encode as unicode_encode
11809
11810 encoding: str(c_default="NULL") = 'utf-8'
11811 The encoding in which to encode the string.
11812 errors: str(c_default="NULL") = 'strict'
11813 The error handling scheme to use for encoding errors.
11814 The default is 'strict' meaning that encoding errors raise a
11815 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11816 'xmlcharrefreplace' as well as any other name registered with
11817 codecs.register_error that can handle UnicodeEncodeErrors.
11818
11819Encode the string using the codec registered for encoding.
11820[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821
11822static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011823unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011824/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011826 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011827}
11828
INADA Naoki3ae20562017-01-16 20:41:20 +090011829/*[clinic input]
11830str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831
INADA Naoki3ae20562017-01-16 20:41:20 +090011832 tabsize: int = 8
11833
11834Return a copy where all tab characters are expanded using spaces.
11835
11836If tabsize is not given, a tab size of 8 characters is assumed.
11837[clinic start generated code]*/
11838
11839static PyObject *
11840unicode_expandtabs_impl(PyObject *self, int tabsize)
11841/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011843 Py_ssize_t i, j, line_pos, src_len, incr;
11844 Py_UCS4 ch;
11845 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011846 const void *src_data;
11847 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011848 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011849 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850
Antoine Pitrou22425222011-10-04 19:10:51 +020011851 if (PyUnicode_READY(self) == -1)
11852 return NULL;
11853
Thomas Wouters7e474022000-07-16 12:04:32 +000011854 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011855 src_len = PyUnicode_GET_LENGTH(self);
11856 i = j = line_pos = 0;
11857 kind = PyUnicode_KIND(self);
11858 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011859 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011860 for (; i < src_len; i++) {
11861 ch = PyUnicode_READ(kind, src_data, i);
11862 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011863 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011865 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011866 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011867 goto overflow;
11868 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011869 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011870 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011871 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011873 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011874 goto overflow;
11875 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011877 if (ch == '\n' || ch == '\r')
11878 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011880 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011881 if (!found)
11882 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011883
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011885 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886 if (!u)
11887 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011888 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889
Antoine Pitroue71d5742011-10-04 15:55:09 +020011890 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891
Antoine Pitroue71d5742011-10-04 15:55:09 +020011892 for (; i < src_len; i++) {
11893 ch = PyUnicode_READ(kind, src_data, i);
11894 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011895 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011896 incr = tabsize - (line_pos % tabsize);
11897 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011898 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011899 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011901 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011903 line_pos++;
11904 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011905 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011906 if (ch == '\n' || ch == '\r')
11907 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011909 }
11910 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011911 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011912
Antoine Pitroue71d5742011-10-04 15:55:09 +020011913 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011914 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916}
11917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011918PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011919 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920\n\
11921Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011922such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923arguments start and end are interpreted as in slice notation.\n\
11924\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011925Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926
11927static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011930 /* initialize variables to prevent gcc warning */
11931 PyObject *substring = NULL;
11932 Py_ssize_t start = 0;
11933 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011934 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011936 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011939 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011942 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 if (result == -2)
11945 return NULL;
11946
Christian Heimes217cfd12007-12-02 14:31:20 +000011947 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948}
11949
11950static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011951unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011953 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011954 enum PyUnicode_Kind kind;
11955 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011956
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011957 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011958 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011960 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011961 if (PyUnicode_READY(self) == -1) {
11962 return NULL;
11963 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011964 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11965 PyErr_SetString(PyExc_IndexError, "string index out of range");
11966 return NULL;
11967 }
11968 kind = PyUnicode_KIND(self);
11969 data = PyUnicode_DATA(self);
11970 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011971 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972}
11973
Guido van Rossumc2504932007-09-18 19:42:40 +000011974/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011975 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011976static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011977unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011979 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011980
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011981#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011982 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011983#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 if (_PyUnicode_HASH(self) != -1)
11985 return _PyUnicode_HASH(self);
11986 if (PyUnicode_READY(self) == -1)
11987 return -1;
animalizea1d14252019-01-02 20:16:06 +080011988
Christian Heimes985ecdc2013-11-20 11:46:18 +010011989 x = _Py_HashBytes(PyUnicode_DATA(self),
11990 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011992 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993}
11994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011995PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997\n\
oldkaa0735f2018-02-02 16:52:55 +080011998Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011999such that sub is contained within S[start:end]. Optional\n\
12000arguments start and end are interpreted as in slice notation.\n\
12001\n\
12002Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003
12004static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012007 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012008 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012009 PyObject *substring = NULL;
12010 Py_ssize_t start = 0;
12011 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012013 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012016 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012019 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 if (result == -2)
12022 return NULL;
12023
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024 if (result < 0) {
12025 PyErr_SetString(PyExc_ValueError, "substring not found");
12026 return NULL;
12027 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012028
Christian Heimes217cfd12007-12-02 14:31:20 +000012029 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030}
12031
INADA Naoki3ae20562017-01-16 20:41:20 +090012032/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012033str.isascii as unicode_isascii
12034
12035Return True if all characters in the string are ASCII, False otherwise.
12036
12037ASCII characters have code points in the range U+0000-U+007F.
12038Empty string is ASCII too.
12039[clinic start generated code]*/
12040
12041static PyObject *
12042unicode_isascii_impl(PyObject *self)
12043/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12044{
12045 if (PyUnicode_READY(self) == -1) {
12046 return NULL;
12047 }
12048 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12049}
12050
12051/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012052str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053
INADA Naoki3ae20562017-01-16 20:41:20 +090012054Return True if the string is a lowercase string, False otherwise.
12055
12056A string is lowercase if all cased characters in the string are lowercase and
12057there is at least one cased character in the string.
12058[clinic start generated code]*/
12059
12060static PyObject *
12061unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012062/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 Py_ssize_t i, length;
12065 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012066 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067 int cased;
12068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 if (PyUnicode_READY(self) == -1)
12070 return NULL;
12071 length = PyUnicode_GET_LENGTH(self);
12072 kind = PyUnicode_KIND(self);
12073 data = PyUnicode_DATA(self);
12074
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 if (length == 1)
12077 return PyBool_FromLong(
12078 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012080 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012082 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012083
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 for (i = 0; i < length; i++) {
12086 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012087
Benjamin Peterson29060642009-01-31 22:14:21 +000012088 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012089 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012090 else if (!cased && Py_UNICODE_ISLOWER(ch))
12091 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012093 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094}
12095
INADA Naoki3ae20562017-01-16 20:41:20 +090012096/*[clinic input]
12097str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098
INADA Naoki3ae20562017-01-16 20:41:20 +090012099Return True if the string is an uppercase string, False otherwise.
12100
12101A string is uppercase if all cased characters in the string are uppercase and
12102there is at least one cased character in the string.
12103[clinic start generated code]*/
12104
12105static PyObject *
12106unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012107/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 Py_ssize_t i, length;
12110 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012111 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112 int cased;
12113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 if (PyUnicode_READY(self) == -1)
12115 return NULL;
12116 length = PyUnicode_GET_LENGTH(self);
12117 kind = PyUnicode_KIND(self);
12118 data = PyUnicode_DATA(self);
12119
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 if (length == 1)
12122 return PyBool_FromLong(
12123 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012125 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012127 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012128
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 for (i = 0; i < length; i++) {
12131 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012132
Benjamin Peterson29060642009-01-31 22:14:21 +000012133 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012134 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012135 else if (!cased && Py_UNICODE_ISUPPER(ch))
12136 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012138 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139}
12140
INADA Naoki3ae20562017-01-16 20:41:20 +090012141/*[clinic input]
12142str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143
INADA Naoki3ae20562017-01-16 20:41:20 +090012144Return True if the string is a title-cased string, False otherwise.
12145
12146In a title-cased string, upper- and title-case characters may only
12147follow uncased characters and lowercase characters only cased ones.
12148[clinic start generated code]*/
12149
12150static PyObject *
12151unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012152/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 Py_ssize_t i, length;
12155 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012156 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157 int cased, previous_is_cased;
12158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 if (PyUnicode_READY(self) == -1)
12160 return NULL;
12161 length = PyUnicode_GET_LENGTH(self);
12162 kind = PyUnicode_KIND(self);
12163 data = PyUnicode_DATA(self);
12164
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 if (length == 1) {
12167 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12168 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12169 (Py_UNICODE_ISUPPER(ch) != 0));
12170 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012172 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012174 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012175
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176 cased = 0;
12177 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 for (i = 0; i < length; i++) {
12179 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012180
Benjamin Peterson29060642009-01-31 22:14:21 +000012181 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12182 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012183 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012184 previous_is_cased = 1;
12185 cased = 1;
12186 }
12187 else if (Py_UNICODE_ISLOWER(ch)) {
12188 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012189 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012190 previous_is_cased = 1;
12191 cased = 1;
12192 }
12193 else
12194 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012196 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197}
12198
INADA Naoki3ae20562017-01-16 20:41:20 +090012199/*[clinic input]
12200str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201
INADA Naoki3ae20562017-01-16 20:41:20 +090012202Return True if the string is a whitespace string, False otherwise.
12203
12204A string is whitespace if all characters in the string are whitespace and there
12205is at least one character in the string.
12206[clinic start generated code]*/
12207
12208static PyObject *
12209unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012210/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 Py_ssize_t i, length;
12213 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012214 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215
12216 if (PyUnicode_READY(self) == -1)
12217 return NULL;
12218 length = PyUnicode_GET_LENGTH(self);
12219 kind = PyUnicode_KIND(self);
12220 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 if (length == 1)
12224 return PyBool_FromLong(
12225 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012227 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012229 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 for (i = 0; i < length; i++) {
12232 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012233 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012234 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012236 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237}
12238
INADA Naoki3ae20562017-01-16 20:41:20 +090012239/*[clinic input]
12240str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012241
INADA Naoki3ae20562017-01-16 20:41:20 +090012242Return True if the string is an alphabetic string, False otherwise.
12243
12244A string is alphabetic if all characters in the string are alphabetic and there
12245is at least one character in the string.
12246[clinic start generated code]*/
12247
12248static PyObject *
12249unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012250/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012251{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 Py_ssize_t i, length;
12253 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012254 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255
12256 if (PyUnicode_READY(self) == -1)
12257 return NULL;
12258 length = PyUnicode_GET_LENGTH(self);
12259 kind = PyUnicode_KIND(self);
12260 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012261
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012262 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 if (length == 1)
12264 return PyBool_FromLong(
12265 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012266
12267 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012269 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 for (i = 0; i < length; i++) {
12272 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012273 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012274 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012275 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012276}
12277
INADA Naoki3ae20562017-01-16 20:41:20 +090012278/*[clinic input]
12279str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012280
INADA Naoki3ae20562017-01-16 20:41:20 +090012281Return True if the string is an alpha-numeric string, False otherwise.
12282
12283A string is alpha-numeric if all characters in the string are alpha-numeric and
12284there is at least one character in the string.
12285[clinic start generated code]*/
12286
12287static PyObject *
12288unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012289/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012290{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012292 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 Py_ssize_t len, i;
12294
12295 if (PyUnicode_READY(self) == -1)
12296 return NULL;
12297
12298 kind = PyUnicode_KIND(self);
12299 data = PyUnicode_DATA(self);
12300 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012301
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012302 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 if (len == 1) {
12304 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12305 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12306 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012307
12308 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012310 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 for (i = 0; i < len; i++) {
12313 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012314 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012315 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012316 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012317 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012318}
12319
INADA Naoki3ae20562017-01-16 20:41:20 +090012320/*[clinic input]
12321str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322
INADA Naoki3ae20562017-01-16 20:41:20 +090012323Return True if the string is a decimal string, False otherwise.
12324
12325A string is a decimal string if all characters in the string are decimal and
12326there is at least one character in the string.
12327[clinic start generated code]*/
12328
12329static PyObject *
12330unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012331/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 Py_ssize_t i, length;
12334 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012335 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336
12337 if (PyUnicode_READY(self) == -1)
12338 return NULL;
12339 length = PyUnicode_GET_LENGTH(self);
12340 kind = PyUnicode_KIND(self);
12341 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 if (length == 1)
12345 return PyBool_FromLong(
12346 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012348 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012350 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 for (i = 0; i < length; i++) {
12353 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012354 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012356 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357}
12358
INADA Naoki3ae20562017-01-16 20:41:20 +090012359/*[clinic input]
12360str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361
INADA Naoki3ae20562017-01-16 20:41:20 +090012362Return True if the string is a digit string, False otherwise.
12363
12364A string is a digit string if all characters in the string are digits and there
12365is at least one character in the string.
12366[clinic start generated code]*/
12367
12368static PyObject *
12369unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012370/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 Py_ssize_t i, length;
12373 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012374 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375
12376 if (PyUnicode_READY(self) == -1)
12377 return NULL;
12378 length = PyUnicode_GET_LENGTH(self);
12379 kind = PyUnicode_KIND(self);
12380 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012383 if (length == 1) {
12384 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12385 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12386 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012388 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012390 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 for (i = 0; i < length; i++) {
12393 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012394 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012395 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012396 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397}
12398
INADA Naoki3ae20562017-01-16 20:41:20 +090012399/*[clinic input]
12400str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012401
INADA Naoki3ae20562017-01-16 20:41:20 +090012402Return True if the string is a numeric string, False otherwise.
12403
12404A string is numeric if all characters in the string are numeric and there is at
12405least one character in the string.
12406[clinic start generated code]*/
12407
12408static PyObject *
12409unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012410/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412 Py_ssize_t i, length;
12413 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012414 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415
12416 if (PyUnicode_READY(self) == -1)
12417 return NULL;
12418 length = PyUnicode_GET_LENGTH(self);
12419 kind = PyUnicode_KIND(self);
12420 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 if (length == 1)
12424 return PyBool_FromLong(
12425 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012426
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012427 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012429 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 for (i = 0; i < length; i++) {
12432 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012433 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012435 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436}
12437
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012438Py_ssize_t
12439_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012440{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012442 if (PyUnicode_READY(self) == -1)
12443 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012444
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012445 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012446 if (len == 0) {
12447 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012448 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449 }
12450
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012451 int kind = PyUnicode_KIND(self);
12452 const void *data = PyUnicode_DATA(self);
12453 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012454 /* PEP 3131 says that the first character must be in
12455 XID_Start and subsequent characters in XID_Continue,
12456 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012457 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012458 letters, digits, underscore). However, given the current
12459 definition of XID_Start and XID_Continue, it is sufficient
12460 to check just for these, except that _ must be allowed
12461 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012462 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012463 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012464 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012465
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012466 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012467 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012468 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012469 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012470 }
12471 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012472 return i;
12473}
12474
12475int
12476PyUnicode_IsIdentifier(PyObject *self)
12477{
12478 if (PyUnicode_IS_READY(self)) {
12479 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12480 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12481 /* an empty string is not a valid identifier */
12482 return len && i == len;
12483 }
12484 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012485_Py_COMP_DIAG_PUSH
12486_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012487 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012488 if (len == 0) {
12489 /* an empty string is not a valid identifier */
12490 return 0;
12491 }
12492
12493 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012494 Py_UCS4 ch = wstr[i++];
12495#if SIZEOF_WCHAR_T == 2
12496 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12497 && i < len
12498 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12499 {
12500 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12501 i++;
12502 }
12503#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012504 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12505 return 0;
12506 }
12507
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012508 while (i < len) {
12509 ch = wstr[i++];
12510#if SIZEOF_WCHAR_T == 2
12511 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12512 && i < len
12513 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12514 {
12515 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12516 i++;
12517 }
12518#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012519 if (!_PyUnicode_IsXidContinue(ch)) {
12520 return 0;
12521 }
12522 }
12523 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012524_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012525 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012526}
12527
INADA Naoki3ae20562017-01-16 20:41:20 +090012528/*[clinic input]
12529str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012530
INADA Naoki3ae20562017-01-16 20:41:20 +090012531Return True if the string is a valid Python identifier, False otherwise.
12532
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012533Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012534such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012535[clinic start generated code]*/
12536
12537static PyObject *
12538unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012539/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012540{
12541 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12542}
12543
INADA Naoki3ae20562017-01-16 20:41:20 +090012544/*[clinic input]
12545str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012546
INADA Naoki3ae20562017-01-16 20:41:20 +090012547Return True if the string is printable, False otherwise.
12548
12549A string is printable if all of its characters are considered printable in
12550repr() or if it is empty.
12551[clinic start generated code]*/
12552
12553static PyObject *
12554unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012555/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012556{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 Py_ssize_t i, length;
12558 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012559 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560
12561 if (PyUnicode_READY(self) == -1)
12562 return NULL;
12563 length = PyUnicode_GET_LENGTH(self);
12564 kind = PyUnicode_KIND(self);
12565 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012566
12567 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012568 if (length == 1)
12569 return PyBool_FromLong(
12570 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572 for (i = 0; i < length; i++) {
12573 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012574 Py_RETURN_FALSE;
12575 }
12576 }
12577 Py_RETURN_TRUE;
12578}
12579
INADA Naoki3ae20562017-01-16 20:41:20 +090012580/*[clinic input]
12581str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582
INADA Naoki3ae20562017-01-16 20:41:20 +090012583 iterable: object
12584 /
12585
12586Concatenate any number of strings.
12587
Martin Panter91a88662017-01-24 00:30:06 +000012588The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012589The result is returned as a new string.
12590
12591Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12592[clinic start generated code]*/
12593
12594static PyObject *
12595unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012596/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597{
INADA Naoki3ae20562017-01-16 20:41:20 +090012598 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599}
12600
Martin v. Löwis18e16552006-02-15 17:27:45 +000012601static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012602unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 if (PyUnicode_READY(self) == -1)
12605 return -1;
12606 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607}
12608
INADA Naoki3ae20562017-01-16 20:41:20 +090012609/*[clinic input]
12610str.ljust as unicode_ljust
12611
12612 width: Py_ssize_t
12613 fillchar: Py_UCS4 = ' '
12614 /
12615
12616Return a left-justified string of length width.
12617
12618Padding is done using the specified fill character (default is a space).
12619[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620
12621static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012622unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12623/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012625 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627
Victor Stinnerc4b49542011-12-11 22:44:26 +010012628 if (PyUnicode_GET_LENGTH(self) >= width)
12629 return unicode_result_unchanged(self);
12630
12631 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632}
12633
INADA Naoki3ae20562017-01-16 20:41:20 +090012634/*[clinic input]
12635str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636
INADA Naoki3ae20562017-01-16 20:41:20 +090012637Return a copy of the string converted to lowercase.
12638[clinic start generated code]*/
12639
12640static PyObject *
12641unicode_lower_impl(PyObject *self)
12642/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012644 if (PyUnicode_READY(self) == -1)
12645 return NULL;
12646 if (PyUnicode_IS_ASCII(self))
12647 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012648 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649}
12650
/* Strip modes accepted by the strip helpers below.  The values double as
   indices into stripfuncnames[], so the two must stay in step. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names used in error messages; indexed by the strip mode above. */
static const char *stripfuncnames[] = {
    "lstrip",
    "rstrip",
    "strip"
};

/* Map a strip mode to the name of the corresponding str method. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012659
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012660/* externally visible for str.strip(unicode) */
12661PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012662_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012663{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012664 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 int kind;
12666 Py_ssize_t i, j, len;
12667 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012668 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012669
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12671 return NULL;
12672
12673 kind = PyUnicode_KIND(self);
12674 data = PyUnicode_DATA(self);
12675 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012676 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12678 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012679 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012680
Benjamin Peterson14339b62009-01-31 16:36:08 +000012681 i = 0;
12682 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012683 while (i < len) {
12684 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12685 if (!BLOOM(sepmask, ch))
12686 break;
12687 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12688 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012689 i++;
12690 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012691 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012692
Benjamin Peterson14339b62009-01-31 16:36:08 +000012693 j = len;
12694 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012695 j--;
12696 while (j >= i) {
12697 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12698 if (!BLOOM(sepmask, ch))
12699 break;
12700 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12701 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012702 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012703 }
12704
Benjamin Peterson29060642009-01-31 22:14:21 +000012705 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012706 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012707
Victor Stinner7931d9a2011-11-04 00:22:48 +010012708 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709}
12710
12711PyObject*
12712PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12713{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012714 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012716 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717
Victor Stinnerde636f32011-10-01 03:55:54 +020012718 if (PyUnicode_READY(self) == -1)
12719 return NULL;
12720
Victor Stinner684d5fd2012-05-03 02:32:34 +020012721 length = PyUnicode_GET_LENGTH(self);
12722 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012723
Victor Stinner684d5fd2012-05-03 02:32:34 +020012724 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012725 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726
Victor Stinnerde636f32011-10-01 03:55:54 +020012727 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012728 PyErr_SetString(PyExc_IndexError, "string index out of range");
12729 return NULL;
12730 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012731 if (start >= length || end < start)
12732 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012733
Victor Stinner684d5fd2012-05-03 02:32:34 +020012734 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012735 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012736 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012737 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012738 }
12739 else {
12740 kind = PyUnicode_KIND(self);
12741 data = PyUnicode_1BYTE_DATA(self);
12742 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012743 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012744 length);
12745 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012746}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747
12748static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012749do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012750{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 Py_ssize_t len, i, j;
12752
12753 if (PyUnicode_READY(self) == -1)
12754 return NULL;
12755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012757
Victor Stinnercc7af722013-04-09 22:39:24 +020012758 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012759 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012760
12761 i = 0;
12762 if (striptype != RIGHTSTRIP) {
12763 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012764 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012765 if (!_Py_ascii_whitespace[ch])
12766 break;
12767 i++;
12768 }
12769 }
12770
12771 j = len;
12772 if (striptype != LEFTSTRIP) {
12773 j--;
12774 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012775 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012776 if (!_Py_ascii_whitespace[ch])
12777 break;
12778 j--;
12779 }
12780 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012781 }
12782 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012783 else {
12784 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012785 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012786
Victor Stinnercc7af722013-04-09 22:39:24 +020012787 i = 0;
12788 if (striptype != RIGHTSTRIP) {
12789 while (i < len) {
12790 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12791 if (!Py_UNICODE_ISSPACE(ch))
12792 break;
12793 i++;
12794 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012795 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012796
12797 j = len;
12798 if (striptype != LEFTSTRIP) {
12799 j--;
12800 while (j >= i) {
12801 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12802 if (!Py_UNICODE_ISSPACE(ch))
12803 break;
12804 j--;
12805 }
12806 j++;
12807 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012808 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012809
Victor Stinner7931d9a2011-11-04 00:22:48 +010012810 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811}
12812
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012813
12814static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012815do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012816{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012817 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012818 if (PyUnicode_Check(sep))
12819 return _PyUnicode_XStrip(self, striptype, sep);
12820 else {
12821 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012822 "%s arg must be None or str",
12823 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012824 return NULL;
12825 }
12826 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012827
Benjamin Peterson14339b62009-01-31 16:36:08 +000012828 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012829}
12830
12831
INADA Naoki3ae20562017-01-16 20:41:20 +090012832/*[clinic input]
12833str.strip as unicode_strip
12834
12835 chars: object = None
12836 /
12837
Zachary Ware09895c22019-10-09 16:09:00 -050012838Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012839
12840If chars is given and not None, remove characters in chars instead.
12841[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012842
12843static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012844unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012845/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012846{
INADA Naoki3ae20562017-01-16 20:41:20 +090012847 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012848}
12849
12850
INADA Naoki3ae20562017-01-16 20:41:20 +090012851/*[clinic input]
12852str.lstrip as unicode_lstrip
12853
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012854 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012855 /
12856
12857Return a copy of the string with leading whitespace removed.
12858
12859If chars is given and not None, remove characters in chars instead.
12860[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012861
12862static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012863unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012864/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012865{
INADA Naoki3ae20562017-01-16 20:41:20 +090012866 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012867}
12868
12869
INADA Naoki3ae20562017-01-16 20:41:20 +090012870/*[clinic input]
12871str.rstrip as unicode_rstrip
12872
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012873 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012874 /
12875
12876Return a copy of the string with trailing whitespace removed.
12877
12878If chars is given and not None, remove characters in chars instead.
12879[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012880
12881static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012882unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012883/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012884{
INADA Naoki3ae20562017-01-16 20:41:20 +090012885 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012886}
12887
12888
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012890unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012892 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894
Serhiy Storchaka05997252013-01-26 12:14:02 +020012895 if (len < 1)
12896 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897
Victor Stinnerc4b49542011-12-11 22:44:26 +010012898 /* no repeat, return original string */
12899 if (len == 1)
12900 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012901
Benjamin Petersonbac79492012-01-14 13:34:47 -050012902 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012903 return NULL;
12904
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012905 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012906 PyErr_SetString(PyExc_OverflowError,
12907 "repeated string is too long");
12908 return NULL;
12909 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012910 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012911
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012912 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012913 if (!u)
12914 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012915 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012917 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012918 int kind = PyUnicode_KIND(str);
12919 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012920 if (kind == PyUnicode_1BYTE_KIND) {
12921 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012922 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012923 }
12924 else if (kind == PyUnicode_2BYTE_KIND) {
12925 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012926 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012927 ucs2[n] = fill_char;
12928 } else {
12929 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12930 assert(kind == PyUnicode_4BYTE_KIND);
12931 for (n = 0; n < len; ++n)
12932 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934 }
12935 else {
12936 /* number of characters copied this far */
12937 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012938 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012940 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012941 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012942 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012943 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012944 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012945 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012946 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947 }
12948
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012949 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012950 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951}
12952
Alexander Belopolsky40018472011-02-26 01:02:56 +000012953PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012954PyUnicode_Replace(PyObject *str,
12955 PyObject *substr,
12956 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012957 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012959 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12960 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012961 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012962 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012963}
12964
INADA Naoki3ae20562017-01-16 20:41:20 +090012965/*[clinic input]
12966str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967
INADA Naoki3ae20562017-01-16 20:41:20 +090012968 old: unicode
12969 new: unicode
12970 count: Py_ssize_t = -1
12971 Maximum number of occurrences to replace.
12972 -1 (the default value) means replace all occurrences.
12973 /
12974
12975Return a copy with all occurrences of substring old replaced by new.
12976
12977If the optional argument count is given, only the first count occurrences are
12978replaced.
12979[clinic start generated code]*/
12980
12981static PyObject *
12982unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12983 Py_ssize_t count)
12984/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012985{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012986 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012987 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012988 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012989}
12990
sweeneydea81849b2020-04-22 17:05:48 -040012991/*[clinic input]
12992str.removeprefix as unicode_removeprefix
12993
12994 prefix: unicode
12995 /
12996
12997Return a str with the given prefix string removed if present.
12998
12999If the string starts with the prefix string, return string[len(prefix):].
13000Otherwise, return a copy of the original string.
13001[clinic start generated code]*/
13002
13003static PyObject *
13004unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13005/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13006{
13007 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13008 if (match == -1) {
13009 return NULL;
13010 }
13011 if (match) {
13012 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13013 PyUnicode_GET_LENGTH(self));
13014 }
13015 return unicode_result_unchanged(self);
13016}
13017
13018/*[clinic input]
13019str.removesuffix as unicode_removesuffix
13020
13021 suffix: unicode
13022 /
13023
13024Return a str with the given suffix string removed if present.
13025
13026If the string ends with the suffix string and that suffix is not empty,
13027return string[:-len(suffix)]. Otherwise, return a copy of the original
13028string.
13029[clinic start generated code]*/
13030
13031static PyObject *
13032unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13033/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13034{
13035 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13036 if (match == -1) {
13037 return NULL;
13038 }
13039 if (match) {
13040 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13041 - PyUnicode_GET_LENGTH(suffix));
13042 }
13043 return unicode_result_unchanged(self);
13044}
13045
Alexander Belopolsky40018472011-02-26 01:02:56 +000013046static PyObject *
13047unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013048{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013049 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013050 Py_ssize_t isize;
13051 Py_ssize_t osize, squote, dquote, i, o;
13052 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013053 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013054 const void *idata;
13055 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013057 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013058 return NULL;
13059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013060 isize = PyUnicode_GET_LENGTH(unicode);
13061 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 /* Compute length of output, quote characters, and
13064 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013065 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 max = 127;
13067 squote = dquote = 0;
13068 ikind = PyUnicode_KIND(unicode);
13069 for (i = 0; i < isize; i++) {
13070 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013071 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013072 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013073 case '\'': squote++; break;
13074 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013075 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013076 incr = 2;
13077 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013078 default:
13079 /* Fast-path ASCII */
13080 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013081 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013082 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013083 ;
13084 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013086 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013087 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013088 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013089 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013090 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013091 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013093 if (osize > PY_SSIZE_T_MAX - incr) {
13094 PyErr_SetString(PyExc_OverflowError,
13095 "string is too long to generate repr");
13096 return NULL;
13097 }
13098 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099 }
13100
13101 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013102 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013103 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013104 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013105 if (dquote)
13106 /* Both squote and dquote present. Use squote,
13107 and escape them */
13108 osize += squote;
13109 else
13110 quote = '"';
13111 }
Victor Stinner55c08782013-04-14 18:45:39 +020013112 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113
13114 repr = PyUnicode_New(osize, max);
13115 if (repr == NULL)
13116 return NULL;
13117 okind = PyUnicode_KIND(repr);
13118 odata = PyUnicode_DATA(repr);
13119
13120 PyUnicode_WRITE(okind, odata, 0, quote);
13121 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013122 if (unchanged) {
13123 _PyUnicode_FastCopyCharacters(repr, 1,
13124 unicode, 0,
13125 isize);
13126 }
13127 else {
13128 for (i = 0, o = 1; i < isize; i++) {
13129 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130
Victor Stinner55c08782013-04-14 18:45:39 +020013131 /* Escape quotes and backslashes */
13132 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013133 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013135 continue;
13136 }
13137
13138 /* Map special whitespace to '\t', \n', '\r' */
13139 if (ch == '\t') {
13140 PyUnicode_WRITE(okind, odata, o++, '\\');
13141 PyUnicode_WRITE(okind, odata, o++, 't');
13142 }
13143 else if (ch == '\n') {
13144 PyUnicode_WRITE(okind, odata, o++, '\\');
13145 PyUnicode_WRITE(okind, odata, o++, 'n');
13146 }
13147 else if (ch == '\r') {
13148 PyUnicode_WRITE(okind, odata, o++, '\\');
13149 PyUnicode_WRITE(okind, odata, o++, 'r');
13150 }
13151
13152 /* Map non-printable US ASCII to '\xhh' */
13153 else if (ch < ' ' || ch == 0x7F) {
13154 PyUnicode_WRITE(okind, odata, o++, '\\');
13155 PyUnicode_WRITE(okind, odata, o++, 'x');
13156 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13157 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13158 }
13159
13160 /* Copy ASCII characters as-is */
13161 else if (ch < 0x7F) {
13162 PyUnicode_WRITE(okind, odata, o++, ch);
13163 }
13164
13165 /* Non-ASCII characters */
13166 else {
13167 /* Map Unicode whitespace and control characters
13168 (categories Z* and C* except ASCII space)
13169 */
13170 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13171 PyUnicode_WRITE(okind, odata, o++, '\\');
13172 /* Map 8-bit characters to '\xhh' */
13173 if (ch <= 0xff) {
13174 PyUnicode_WRITE(okind, odata, o++, 'x');
13175 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13176 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13177 }
13178 /* Map 16-bit characters to '\uxxxx' */
13179 else if (ch <= 0xffff) {
13180 PyUnicode_WRITE(okind, odata, o++, 'u');
13181 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13182 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13183 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13184 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13185 }
13186 /* Map 21-bit characters to '\U00xxxxxx' */
13187 else {
13188 PyUnicode_WRITE(okind, odata, o++, 'U');
13189 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13190 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13191 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13192 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13193 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13194 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13195 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13196 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13197 }
13198 }
13199 /* Copy characters as-is */
13200 else {
13201 PyUnicode_WRITE(okind, odata, o++, ch);
13202 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013203 }
13204 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013206 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013207 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013208 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209}
13210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013211PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013213\n\
13214Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013215such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013216arguments start and end are interpreted as in slice notation.\n\
13217\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013218Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219
13220static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013221unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013222{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013223 /* initialize variables to prevent gcc warning */
13224 PyObject *substring = NULL;
13225 Py_ssize_t start = 0;
13226 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013227 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013229 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013232 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013235 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013237 if (result == -2)
13238 return NULL;
13239
Christian Heimes217cfd12007-12-02 14:31:20 +000013240 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241}
13242
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013243PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013244 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013246Return the highest index in S where substring sub is found,\n\
13247such that sub is contained within S[start:end]. Optional\n\
13248arguments start and end are interpreted as in slice notation.\n\
13249\n\
13250Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251
13252static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013253unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013254{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013255 /* initialize variables to prevent gcc warning */
13256 PyObject *substring = NULL;
13257 Py_ssize_t start = 0;
13258 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013259 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013261 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013263
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013264 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013265 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013266
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013267 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013269 if (result == -2)
13270 return NULL;
13271
Guido van Rossumd57fd912000-03-10 22:53:23 +000013272 if (result < 0) {
13273 PyErr_SetString(PyExc_ValueError, "substring not found");
13274 return NULL;
13275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013276
Christian Heimes217cfd12007-12-02 14:31:20 +000013277 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278}
13279
INADA Naoki3ae20562017-01-16 20:41:20 +090013280/*[clinic input]
13281str.rjust as unicode_rjust
13282
13283 width: Py_ssize_t
13284 fillchar: Py_UCS4 = ' '
13285 /
13286
13287Return a right-justified string of length width.
13288
13289Padding is done using the specified fill character (default is a space).
13290[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013291
13292static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013293unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13294/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013296 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297 return NULL;
13298
Victor Stinnerc4b49542011-12-11 22:44:26 +010013299 if (PyUnicode_GET_LENGTH(self) >= width)
13300 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013301
Victor Stinnerc4b49542011-12-11 22:44:26 +010013302 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303}
13304
Alexander Belopolsky40018472011-02-26 01:02:56 +000013305PyObject *
13306PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013307{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013308 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013309 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013310
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013311 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013312}
13313
INADA Naoki3ae20562017-01-16 20:41:20 +090013314/*[clinic input]
13315str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013316
INADA Naoki3ae20562017-01-16 20:41:20 +090013317 sep: object = None
13318 The delimiter according which to split the string.
13319 None (the default value) means split according to any whitespace,
13320 and discard empty strings from the result.
13321 maxsplit: Py_ssize_t = -1
13322 Maximum number of splits to do.
13323 -1 (the default value) means no limit.
13324
13325Return a list of the words in the string, using sep as the delimiter string.
13326[clinic start generated code]*/
13327
13328static PyObject *
13329unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13330/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013331{
INADA Naoki3ae20562017-01-16 20:41:20 +090013332 if (sep == Py_None)
13333 return split(self, NULL, maxsplit);
13334 if (PyUnicode_Check(sep))
13335 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013336
Victor Stinner998b8062018-09-12 00:23:25 +020013337 PyErr_Format(PyExc_TypeError,
13338 "must be str or None, not %.100s",
13339 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013340 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013341}
13342
Thomas Wouters477c8d52006-05-27 19:21:47 +000013343PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013344PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013345{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013346 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013347 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013348 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013349 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013350
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013351 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013352 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013353
Victor Stinner14f8f022011-10-05 20:58:25 +020013354 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013356 len1 = PyUnicode_GET_LENGTH(str_obj);
13357 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013358 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013359 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013360 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013361 }
13362 buf1 = PyUnicode_DATA(str_obj);
13363 buf2 = PyUnicode_DATA(sep_obj);
13364 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013365 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013366 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013367 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013369
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013370 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013371 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013372 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13373 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13374 else
13375 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013376 break;
13377 case PyUnicode_2BYTE_KIND:
13378 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13379 break;
13380 case PyUnicode_4BYTE_KIND:
13381 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13382 break;
13383 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013384 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013385 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013386
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013387 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013388 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013389 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013390
13391 return out;
13392}
13393
13394
13395PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013396PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013397{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013398 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013399 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013400 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013401 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013402
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013403 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013404 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013405
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013406 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013407 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013408 len1 = PyUnicode_GET_LENGTH(str_obj);
13409 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013410 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013411 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013412 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013413 }
13414 buf1 = PyUnicode_DATA(str_obj);
13415 buf2 = PyUnicode_DATA(sep_obj);
13416 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013417 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013418 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013419 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013421
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013422 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013423 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013424 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13425 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13426 else
13427 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013428 break;
13429 case PyUnicode_2BYTE_KIND:
13430 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13431 break;
13432 case PyUnicode_4BYTE_KIND:
13433 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13434 break;
13435 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013436 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013437 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013438
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013439 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013440 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013441 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013442
13443 return out;
13444}
13445
INADA Naoki3ae20562017-01-16 20:41:20 +090013446/*[clinic input]
13447str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013448
INADA Naoki3ae20562017-01-16 20:41:20 +090013449 sep: object
13450 /
13451
13452Partition the string into three parts using the given separator.
13453
13454This will search for the separator in the string. If the separator is found,
13455returns a 3-tuple containing the part before the separator, the separator
13456itself, and the part after it.
13457
13458If the separator is not found, returns a 3-tuple containing the original string
13459and two empty strings.
13460[clinic start generated code]*/
13461
13462static PyObject *
13463unicode_partition(PyObject *self, PyObject *sep)
13464/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013465{
INADA Naoki3ae20562017-01-16 20:41:20 +090013466 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013467}
13468
INADA Naoki3ae20562017-01-16 20:41:20 +090013469/*[clinic input]
13470str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013471
INADA Naoki3ae20562017-01-16 20:41:20 +090013472Partition the string into three parts using the given separator.
13473
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013474This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013475the separator is found, returns a 3-tuple containing the part before the
13476separator, the separator itself, and the part after it.
13477
13478If the separator is not found, returns a 3-tuple containing two empty strings
13479and the original string.
13480[clinic start generated code]*/
13481
13482static PyObject *
13483unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013484/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013485{
INADA Naoki3ae20562017-01-16 20:41:20 +090013486 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013487}
13488
Alexander Belopolsky40018472011-02-26 01:02:56 +000013489PyObject *
13490PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013491{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013492 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013493 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013494
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013495 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013496}
13497
INADA Naoki3ae20562017-01-16 20:41:20 +090013498/*[clinic input]
13499str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013500
INADA Naoki3ae20562017-01-16 20:41:20 +090013501Return a list of the words in the string, using sep as the delimiter string.
13502
13503Splits are done starting at the end of the string and working to the front.
13504[clinic start generated code]*/
13505
13506static PyObject *
13507unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13508/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013509{
INADA Naoki3ae20562017-01-16 20:41:20 +090013510 if (sep == Py_None)
13511 return rsplit(self, NULL, maxsplit);
13512 if (PyUnicode_Check(sep))
13513 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013514
Victor Stinner998b8062018-09-12 00:23:25 +020013515 PyErr_Format(PyExc_TypeError,
13516 "must be str or None, not %.100s",
13517 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013518 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013519}
13520
INADA Naoki3ae20562017-01-16 20:41:20 +090013521/*[clinic input]
13522str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013523
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013524 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013525
13526Return a list of the lines in the string, breaking at line boundaries.
13527
13528Line breaks are not included in the resulting list unless keepends is given and
13529true.
13530[clinic start generated code]*/
13531
13532static PyObject *
13533unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013534/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013535{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013536 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013537}
13538
13539static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013540PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013541{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013542 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013543}
13544
INADA Naoki3ae20562017-01-16 20:41:20 +090013545/*[clinic input]
13546str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013547
INADA Naoki3ae20562017-01-16 20:41:20 +090013548Convert uppercase characters to lowercase and lowercase characters to uppercase.
13549[clinic start generated code]*/
13550
13551static PyObject *
13552unicode_swapcase_impl(PyObject *self)
13553/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013554{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013555 if (PyUnicode_READY(self) == -1)
13556 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013557 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013558}
13559
Larry Hastings61272b72014-01-07 12:41:53 -080013560/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013561
Larry Hastings31826802013-10-19 00:09:25 -070013562@staticmethod
13563str.maketrans as unicode_maketrans
13564
13565 x: object
13566
13567 y: unicode=NULL
13568
13569 z: unicode=NULL
13570
13571 /
13572
13573Return a translation table usable for str.translate().
13574
13575If there is only one argument, it must be a dictionary mapping Unicode
13576ordinals (integers) or characters to Unicode ordinals, strings or None.
13577Character keys will be then converted to ordinals.
13578If there are two arguments, they must be strings of equal length, and
13579in the resulting dictionary, each character in x will be mapped to the
13580character at the same position in y. If there is a third argument, it
13581must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013582[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013583
Larry Hastings31826802013-10-19 00:09:25 -070013584static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013585unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013586/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013587{
Georg Brandlceee0772007-11-27 23:48:05 +000013588 PyObject *new = NULL, *key, *value;
13589 Py_ssize_t i = 0;
13590 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013591
Georg Brandlceee0772007-11-27 23:48:05 +000013592 new = PyDict_New();
13593 if (!new)
13594 return NULL;
13595 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013596 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013597 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013598
Georg Brandlceee0772007-11-27 23:48:05 +000013599 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013600 if (!PyUnicode_Check(x)) {
13601 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13602 "be a string if there is a second argument");
13603 goto err;
13604 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013605 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013606 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13607 "arguments must have equal length");
13608 goto err;
13609 }
13610 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013611 x_kind = PyUnicode_KIND(x);
13612 y_kind = PyUnicode_KIND(y);
13613 x_data = PyUnicode_DATA(x);
13614 y_data = PyUnicode_DATA(y);
13615 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13616 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013617 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013618 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013619 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013620 if (!value) {
13621 Py_DECREF(key);
13622 goto err;
13623 }
Georg Brandlceee0772007-11-27 23:48:05 +000013624 res = PyDict_SetItem(new, key, value);
13625 Py_DECREF(key);
13626 Py_DECREF(value);
13627 if (res < 0)
13628 goto err;
13629 }
13630 /* create entries for deleting chars in z */
13631 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013632 z_kind = PyUnicode_KIND(z);
13633 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013634 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013635 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013636 if (!key)
13637 goto err;
13638 res = PyDict_SetItem(new, key, Py_None);
13639 Py_DECREF(key);
13640 if (res < 0)
13641 goto err;
13642 }
13643 }
13644 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013645 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013646 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013647
Georg Brandlceee0772007-11-27 23:48:05 +000013648 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013649 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013650 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13651 "to maketrans it must be a dict");
13652 goto err;
13653 }
13654 /* copy entries into the new dict, converting string keys to int keys */
13655 while (PyDict_Next(x, &i, &key, &value)) {
13656 if (PyUnicode_Check(key)) {
13657 /* convert string keys to integer keys */
13658 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013659 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013660 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13661 "table must be of length 1");
13662 goto err;
13663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013664 kind = PyUnicode_KIND(key);
13665 data = PyUnicode_DATA(key);
13666 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013667 if (!newkey)
13668 goto err;
13669 res = PyDict_SetItem(new, newkey, value);
13670 Py_DECREF(newkey);
13671 if (res < 0)
13672 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013673 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013674 /* just keep integer keys */
13675 if (PyDict_SetItem(new, key, value) < 0)
13676 goto err;
13677 } else {
13678 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13679 "be strings or integers");
13680 goto err;
13681 }
13682 }
13683 }
13684 return new;
13685 err:
13686 Py_DECREF(new);
13687 return NULL;
13688}
13689
INADA Naoki3ae20562017-01-16 20:41:20 +090013690/*[clinic input]
13691str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013692
INADA Naoki3ae20562017-01-16 20:41:20 +090013693 table: object
13694 Translation table, which must be a mapping of Unicode ordinals to
13695 Unicode ordinals, strings, or None.
13696 /
13697
13698Replace each character in the string using the given translation table.
13699
13700The table must implement lookup/indexing via __getitem__, for instance a
13701dictionary or list. If this operation raises LookupError, the character is
13702left untouched. Characters mapped to None are deleted.
13703[clinic start generated code]*/
13704
13705static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013706unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013707/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013709 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013710}
13711
INADA Naoki3ae20562017-01-16 20:41:20 +090013712/*[clinic input]
13713str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013714
INADA Naoki3ae20562017-01-16 20:41:20 +090013715Return a copy of the string converted to uppercase.
13716[clinic start generated code]*/
13717
13718static PyObject *
13719unicode_upper_impl(PyObject *self)
13720/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013721{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013722 if (PyUnicode_READY(self) == -1)
13723 return NULL;
13724 if (PyUnicode_IS_ASCII(self))
13725 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013726 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013727}
13728
INADA Naoki3ae20562017-01-16 20:41:20 +090013729/*[clinic input]
13730str.zfill as unicode_zfill
13731
13732 width: Py_ssize_t
13733 /
13734
13735Pad a numeric string with zeros on the left, to fill a field of the given width.
13736
13737The string is never truncated.
13738[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013739
13740static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013741unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013742/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013743{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013744 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013745 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013746 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013747 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013748 Py_UCS4 chr;
13749
Benjamin Petersonbac79492012-01-14 13:34:47 -050013750 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013751 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013752
Victor Stinnerc4b49542011-12-11 22:44:26 +010013753 if (PyUnicode_GET_LENGTH(self) >= width)
13754 return unicode_result_unchanged(self);
13755
13756 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013757
13758 u = pad(self, fill, 0, '0');
13759
Walter Dörwald068325e2002-04-15 13:36:47 +000013760 if (u == NULL)
13761 return NULL;
13762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013763 kind = PyUnicode_KIND(u);
13764 data = PyUnicode_DATA(u);
13765 chr = PyUnicode_READ(kind, data, fill);
13766
13767 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013768 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013769 PyUnicode_WRITE(kind, data, 0, chr);
13770 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013771 }
13772
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013773 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013774 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013775}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013776
#if 0
/* Compiled out by the surrounding "#if 0".  When enabled, this is a thin
   wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013785PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013786 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013787\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013788Return True if S starts with the specified prefix, False otherwise.\n\
13789With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013790With optional end, stop comparing S at that position.\n\
13791prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013792
13793static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013794unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013795 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013796{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013797 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013798 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013799 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013800 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013801 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013802
Jesus Ceaac451502011-04-20 17:09:23 +020013803 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013804 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013805 if (PyTuple_Check(subobj)) {
13806 Py_ssize_t i;
13807 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013808 substring = PyTuple_GET_ITEM(subobj, i);
13809 if (!PyUnicode_Check(substring)) {
13810 PyErr_Format(PyExc_TypeError,
13811 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013812 "not %.100s",
13813 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013814 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013815 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013816 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013817 if (result == -1)
13818 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013819 if (result) {
13820 Py_RETURN_TRUE;
13821 }
13822 }
13823 /* nothing matched */
13824 Py_RETURN_FALSE;
13825 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013826 if (!PyUnicode_Check(subobj)) {
13827 PyErr_Format(PyExc_TypeError,
13828 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013829 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013830 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013831 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013832 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013833 if (result == -1)
13834 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013835 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013836}
13837
13838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013839PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013840 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013841\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013842Return True if S ends with the specified suffix, False otherwise.\n\
13843With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013844With optional end, stop comparing S at that position.\n\
13845suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013846
13847static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013848unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013849 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013850{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013851 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013852 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013853 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013854 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013855 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013856
Jesus Ceaac451502011-04-20 17:09:23 +020013857 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013858 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013859 if (PyTuple_Check(subobj)) {
13860 Py_ssize_t i;
13861 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013862 substring = PyTuple_GET_ITEM(subobj, i);
13863 if (!PyUnicode_Check(substring)) {
13864 PyErr_Format(PyExc_TypeError,
13865 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013866 "not %.100s",
13867 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013868 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013869 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013870 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013871 if (result == -1)
13872 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013873 if (result) {
13874 Py_RETURN_TRUE;
13875 }
13876 }
13877 Py_RETURN_FALSE;
13878 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013879 if (!PyUnicode_Check(subobj)) {
13880 PyErr_Format(PyExc_TypeError,
13881 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013882 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013883 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013884 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013885 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013886 if (result == -1)
13887 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013888 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013889}
13890
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013891static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013892_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013893{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013894 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13895 writer->data = PyUnicode_DATA(writer->buffer);
13896
13897 if (!writer->readonly) {
13898 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013899 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013900 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013901 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013902 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13903 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13904 writer->kind = PyUnicode_WCHAR_KIND;
13905 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13906
Victor Stinner8f674cc2013-04-17 23:02:17 +020013907 /* Copy-on-write mode: set buffer size to 0 so
13908 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13909 * next write. */
13910 writer->size = 0;
13911 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013912}
13913
Victor Stinnerd3f08822012-05-29 12:57:52 +020013914void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013915_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013916{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013917 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013918
13919 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013920 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013921
13922 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13923 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13924 writer->kind = PyUnicode_WCHAR_KIND;
13925 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013926}
13927
Inada Naoki770847a2019-06-24 12:30:24 +090013928// Initialize _PyUnicodeWriter with initial buffer
13929static inline void
13930_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13931{
13932 memset(writer, 0, sizeof(*writer));
13933 writer->buffer = buffer;
13934 _PyUnicodeWriter_Update(writer);
13935 writer->min_length = writer->size;
13936}
13937
Victor Stinnerd3f08822012-05-29 12:57:52 +020013938int
13939_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13940 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013941{
13942 Py_ssize_t newlen;
13943 PyObject *newbuffer;
13944
Victor Stinner2740e462016-09-06 16:58:36 -070013945 assert(maxchar <= MAX_UNICODE);
13946
Victor Stinnerca9381e2015-09-22 00:58:32 +020013947 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013948 assert((maxchar > writer->maxchar && length >= 0)
13949 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013950
Victor Stinner202fdca2012-05-07 12:47:02 +020013951 if (length > PY_SSIZE_T_MAX - writer->pos) {
13952 PyErr_NoMemory();
13953 return -1;
13954 }
13955 newlen = writer->pos + length;
13956
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013957 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013958
Victor Stinnerd3f08822012-05-29 12:57:52 +020013959 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013960 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013961 if (writer->overallocate
13962 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13963 /* overallocate to limit the number of realloc() */
13964 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013965 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013966 if (newlen < writer->min_length)
13967 newlen = writer->min_length;
13968
Victor Stinnerd3f08822012-05-29 12:57:52 +020013969 writer->buffer = PyUnicode_New(newlen, maxchar);
13970 if (writer->buffer == NULL)
13971 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013972 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013973 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013974 if (writer->overallocate
13975 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13976 /* overallocate to limit the number of realloc() */
13977 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013978 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013979 if (newlen < writer->min_length)
13980 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013981
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013982 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013983 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013984 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013985 newbuffer = PyUnicode_New(newlen, maxchar);
13986 if (newbuffer == NULL)
13987 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013988 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13989 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013990 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013991 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013992 }
13993 else {
13994 newbuffer = resize_compact(writer->buffer, newlen);
13995 if (newbuffer == NULL)
13996 return -1;
13997 }
13998 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013999 }
14000 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014001 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014002 newbuffer = PyUnicode_New(writer->size, maxchar);
14003 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014004 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014005 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14006 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014007 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014008 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014009 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014010 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014011
14012#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014013}
14014
Victor Stinnerca9381e2015-09-22 00:58:32 +020014015int
14016_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14017 enum PyUnicode_Kind kind)
14018{
14019 Py_UCS4 maxchar;
14020
14021 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14022 assert(writer->kind < kind);
14023
14024 switch (kind)
14025 {
14026 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14027 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14028 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
14029 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014030 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014031 }
14032
14033 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14034}
14035
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014036static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014037_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014038{
Victor Stinner2740e462016-09-06 16:58:36 -070014039 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014040 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14041 return -1;
14042 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14043 writer->pos++;
14044 return 0;
14045}
14046
14047int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014048_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14049{
14050 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14051}
14052
14053int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014054_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14055{
14056 Py_UCS4 maxchar;
14057 Py_ssize_t len;
14058
14059 if (PyUnicode_READY(str) == -1)
14060 return -1;
14061 len = PyUnicode_GET_LENGTH(str);
14062 if (len == 0)
14063 return 0;
14064 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14065 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014066 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014067 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014068 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014069 Py_INCREF(str);
14070 writer->buffer = str;
14071 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014072 writer->pos += len;
14073 return 0;
14074 }
14075 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14076 return -1;
14077 }
14078 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14079 str, 0, len);
14080 writer->pos += len;
14081 return 0;
14082}
14083
Victor Stinnere215d962012-10-06 23:03:36 +020014084int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014085_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14086 Py_ssize_t start, Py_ssize_t end)
14087{
14088 Py_UCS4 maxchar;
14089 Py_ssize_t len;
14090
14091 if (PyUnicode_READY(str) == -1)
14092 return -1;
14093
14094 assert(0 <= start);
14095 assert(end <= PyUnicode_GET_LENGTH(str));
14096 assert(start <= end);
14097
14098 if (end == 0)
14099 return 0;
14100
14101 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14102 return _PyUnicodeWriter_WriteStr(writer, str);
14103
14104 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14105 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14106 else
14107 maxchar = writer->maxchar;
14108 len = end - start;
14109
14110 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14111 return -1;
14112
14113 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14114 str, start, len);
14115 writer->pos += len;
14116 return 0;
14117}
14118
14119int
Victor Stinner4a587072013-11-19 12:54:53 +010014120_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14121 const char *ascii, Py_ssize_t len)
14122{
14123 if (len == -1)
14124 len = strlen(ascii);
14125
Andy Lestere6be9b52020-02-11 20:28:35 -060014126 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014127
14128 if (writer->buffer == NULL && !writer->overallocate) {
14129 PyObject *str;
14130
14131 str = _PyUnicode_FromASCII(ascii, len);
14132 if (str == NULL)
14133 return -1;
14134
14135 writer->readonly = 1;
14136 writer->buffer = str;
14137 _PyUnicodeWriter_Update(writer);
14138 writer->pos += len;
14139 return 0;
14140 }
14141
14142 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14143 return -1;
14144
14145 switch (writer->kind)
14146 {
14147 case PyUnicode_1BYTE_KIND:
14148 {
14149 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14150 Py_UCS1 *data = writer->data;
14151
Christian Heimesf051e432016-09-13 20:22:02 +020014152 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014153 break;
14154 }
14155 case PyUnicode_2BYTE_KIND:
14156 {
14157 _PyUnicode_CONVERT_BYTES(
14158 Py_UCS1, Py_UCS2,
14159 ascii, ascii + len,
14160 (Py_UCS2 *)writer->data + writer->pos);
14161 break;
14162 }
14163 case PyUnicode_4BYTE_KIND:
14164 {
14165 _PyUnicode_CONVERT_BYTES(
14166 Py_UCS1, Py_UCS4,
14167 ascii, ascii + len,
14168 (Py_UCS4 *)writer->data + writer->pos);
14169 break;
14170 }
14171 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014172 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014173 }
14174
14175 writer->pos += len;
14176 return 0;
14177}
14178
14179int
14180_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14181 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014182{
14183 Py_UCS4 maxchar;
14184
Andy Lestere6be9b52020-02-11 20:28:35 -060014185 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014186 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14187 return -1;
14188 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14189 writer->pos += len;
14190 return 0;
14191}
14192
Victor Stinnerd3f08822012-05-29 12:57:52 +020014193PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014194_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014195{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014196 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014197
Victor Stinnerd3f08822012-05-29 12:57:52 +020014198 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014199 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014200 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014201 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014202
14203 str = writer->buffer;
14204 writer->buffer = NULL;
14205
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014206 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014207 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14208 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014209 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014210
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014211 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14212 PyObject *str2;
14213 str2 = resize_compact(str, writer->pos);
14214 if (str2 == NULL) {
14215 Py_DECREF(str);
14216 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014217 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014218 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014219 }
14220
Victor Stinner15a0bd32013-07-08 22:29:55 +020014221 assert(_PyUnicode_CheckConsistency(str, 1));
14222 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014223}
14224
Victor Stinnerd3f08822012-05-29 12:57:52 +020014225void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014226_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014227{
14228 Py_CLEAR(writer->buffer);
14229}
14230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014231#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014232
14233PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014234 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014235\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014236Return a formatted version of S, using substitutions from args and kwargs.\n\
14237The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014238
Eric Smith27bbca62010-11-04 17:06:58 +000014239PyDoc_STRVAR(format_map__doc__,
14240 "S.format_map(mapping) -> str\n\
14241\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014242Return a formatted version of S, using substitutions from mapping.\n\
14243The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014244
INADA Naoki3ae20562017-01-16 20:41:20 +090014245/*[clinic input]
14246str.__format__ as unicode___format__
14247
14248 format_spec: unicode
14249 /
14250
14251Return a formatted version of the string as described by format_spec.
14252[clinic start generated code]*/
14253
Eric Smith4a7d76d2008-05-30 18:10:19 +000014254static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014255unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014256/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014257{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014258 _PyUnicodeWriter writer;
14259 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014260
Victor Stinnerd3f08822012-05-29 12:57:52 +020014261 if (PyUnicode_READY(self) == -1)
14262 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014263 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014264 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14265 self, format_spec, 0,
14266 PyUnicode_GET_LENGTH(format_spec));
14267 if (ret == -1) {
14268 _PyUnicodeWriter_Dealloc(&writer);
14269 return NULL;
14270 }
14271 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014272}
14273
INADA Naoki3ae20562017-01-16 20:41:20 +090014274/*[clinic input]
14275str.__sizeof__ as unicode_sizeof
14276
14277Return the size of the string in memory, in bytes.
14278[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014279
14280static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014281unicode_sizeof_impl(PyObject *self)
14282/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014283{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014284 Py_ssize_t size;
14285
14286 /* If it's a compact object, account for base structure +
14287 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014288 if (PyUnicode_IS_COMPACT_ASCII(self))
14289 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14290 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014291 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014292 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014293 else {
14294 /* If it is a two-block object, account for base object, and
14295 for character block if present. */
14296 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014297 if (_PyUnicode_DATA_ANY(self))
14298 size += (PyUnicode_GET_LENGTH(self) + 1) *
14299 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014300 }
14301 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014302 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014303 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14304 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14305 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14306 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014307
14308 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014309}
14310
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014311static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014312unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014313{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014314 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014315 if (!copy)
14316 return NULL;
14317 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014318}
14319
Guido van Rossumd57fd912000-03-10 22:53:23 +000014320static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014321 UNICODE_ENCODE_METHODDEF
14322 UNICODE_REPLACE_METHODDEF
14323 UNICODE_SPLIT_METHODDEF
14324 UNICODE_RSPLIT_METHODDEF
14325 UNICODE_JOIN_METHODDEF
14326 UNICODE_CAPITALIZE_METHODDEF
14327 UNICODE_CASEFOLD_METHODDEF
14328 UNICODE_TITLE_METHODDEF
14329 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014330 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014331 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014332 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014333 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014334 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014335 UNICODE_LJUST_METHODDEF
14336 UNICODE_LOWER_METHODDEF
14337 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014338 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14339 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014340 UNICODE_RJUST_METHODDEF
14341 UNICODE_RSTRIP_METHODDEF
14342 UNICODE_RPARTITION_METHODDEF
14343 UNICODE_SPLITLINES_METHODDEF
14344 UNICODE_STRIP_METHODDEF
14345 UNICODE_SWAPCASE_METHODDEF
14346 UNICODE_TRANSLATE_METHODDEF
14347 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014348 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14349 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014350 UNICODE_REMOVEPREFIX_METHODDEF
14351 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014352 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014353 UNICODE_ISLOWER_METHODDEF
14354 UNICODE_ISUPPER_METHODDEF
14355 UNICODE_ISTITLE_METHODDEF
14356 UNICODE_ISSPACE_METHODDEF
14357 UNICODE_ISDECIMAL_METHODDEF
14358 UNICODE_ISDIGIT_METHODDEF
14359 UNICODE_ISNUMERIC_METHODDEF
14360 UNICODE_ISALPHA_METHODDEF
14361 UNICODE_ISALNUM_METHODDEF
14362 UNICODE_ISIDENTIFIER_METHODDEF
14363 UNICODE_ISPRINTABLE_METHODDEF
14364 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014365 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014366 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014367 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014368 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014369 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014370#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014371 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014372 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014373#endif
14374
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014375 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014376 {NULL, NULL}
14377};
14378
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014379static PyObject *
14380unicode_mod(PyObject *v, PyObject *w)
14381{
Brian Curtindfc80e32011-08-10 20:28:54 -050014382 if (!PyUnicode_Check(v))
14383 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014384 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014385}
14386
14387static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014388 0, /*nb_add*/
14389 0, /*nb_subtract*/
14390 0, /*nb_multiply*/
14391 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014392};
14393
Guido van Rossumd57fd912000-03-10 22:53:23 +000014394static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014395 (lenfunc) unicode_length, /* sq_length */
14396 PyUnicode_Concat, /* sq_concat */
14397 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14398 (ssizeargfunc) unicode_getitem, /* sq_item */
14399 0, /* sq_slice */
14400 0, /* sq_ass_item */
14401 0, /* sq_ass_slice */
14402 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014403};
14404
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014405static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014406unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014407{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014408 if (PyUnicode_READY(self) == -1)
14409 return NULL;
14410
Victor Stinnera15e2602020-04-08 02:01:56 +020014411 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014412 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014413 if (i == -1 && PyErr_Occurred())
14414 return NULL;
14415 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014416 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014417 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014418 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014419 Py_ssize_t start, stop, step, slicelength, i;
14420 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014421 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014422 const void *src_data;
14423 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014424 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014425 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014426
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014427 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014428 return NULL;
14429 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014430 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14431 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014432
14433 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014434 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014435 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014436 slicelength == PyUnicode_GET_LENGTH(self)) {
14437 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014438 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014439 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014440 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014441 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014442 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014443 src_kind = PyUnicode_KIND(self);
14444 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014445 if (!PyUnicode_IS_ASCII(self)) {
14446 kind_limit = kind_maxchar_limit(src_kind);
14447 max_char = 0;
14448 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14449 ch = PyUnicode_READ(src_kind, src_data, cur);
14450 if (ch > max_char) {
14451 max_char = ch;
14452 if (max_char >= kind_limit)
14453 break;
14454 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014455 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014456 }
Victor Stinner55c99112011-10-13 01:17:06 +020014457 else
14458 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014459 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014460 if (result == NULL)
14461 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014462 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014463 dest_data = PyUnicode_DATA(result);
14464
14465 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014466 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14467 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014468 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014469 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014470 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014471 } else {
14472 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14473 return NULL;
14474 }
14475}
14476
14477static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014478 (lenfunc)unicode_length, /* mp_length */
14479 (binaryfunc)unicode_subscript, /* mp_subscript */
14480 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014481};
14482
Guido van Rossumd57fd912000-03-10 22:53:23 +000014483
Guido van Rossumd57fd912000-03-10 22:53:23 +000014484/* Helpers for PyUnicode_Format() */
14485
Victor Stinnera47082312012-10-04 02:19:54 +020014486struct unicode_formatter_t {
14487 PyObject *args;
14488 int args_owned;
14489 Py_ssize_t arglen, argidx;
14490 PyObject *dict;
14491
14492 enum PyUnicode_Kind fmtkind;
14493 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014494 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014495 PyObject *fmtstr;
14496
14497 _PyUnicodeWriter writer;
14498};
14499
14500struct unicode_format_arg_t {
14501 Py_UCS4 ch;
14502 int flags;
14503 Py_ssize_t width;
14504 int prec;
14505 int sign;
14506};
14507
Guido van Rossumd57fd912000-03-10 22:53:23 +000014508static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014509unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014510{
Victor Stinnera47082312012-10-04 02:19:54 +020014511 Py_ssize_t argidx = ctx->argidx;
14512
14513 if (argidx < ctx->arglen) {
14514 ctx->argidx++;
14515 if (ctx->arglen < 0)
14516 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014517 else
Victor Stinnera47082312012-10-04 02:19:54 +020014518 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014519 }
14520 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014521 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014522 return NULL;
14523}
14524
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014525/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014526
Victor Stinnera47082312012-10-04 02:19:54 +020014527/* Format a float into the writer if the writer is not NULL, or into *p_output
14528 otherwise.
14529
14530 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014531static int
Victor Stinnera47082312012-10-04 02:19:54 +020014532formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14533 PyObject **p_output,
14534 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014535{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014536 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014537 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014538 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014539 int prec;
14540 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014541
Guido van Rossumd57fd912000-03-10 22:53:23 +000014542 x = PyFloat_AsDouble(v);
14543 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014544 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014545
Victor Stinnera47082312012-10-04 02:19:54 +020014546 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014547 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014548 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014549
Victor Stinnera47082312012-10-04 02:19:54 +020014550 if (arg->flags & F_ALT)
14551 dtoa_flags = Py_DTSF_ALT;
14552 else
14553 dtoa_flags = 0;
14554 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014555 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014556 return -1;
14557 len = strlen(p);
14558 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014559 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014560 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014561 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014562 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014563 }
14564 else
14565 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014566 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014567 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014568}
14569
Victor Stinnerd0880d52012-04-27 23:40:13 +020014570/* formatlong() emulates the format codes d, u, o, x and X, and
14571 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14572 * Python's regular ints.
14573 * Return value: a new PyUnicodeObject*, or NULL if error.
14574 * The output string is of the form
14575 * "-"? ("0x" | "0X")? digit+
14576 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14577 * set in flags. The case of hex digits will be correct,
14578 * There will be at least prec digits, zero-filled on the left if
14579 * necessary to get that many.
14580 * val object to be converted
14581 * flags bitmask of format flags; only F_ALT is looked at
14582 * prec minimum number of digits; 0-fill on left if needed
14583 * type a character in [duoxX]; u acts the same as d
14584 *
14585 * CAUTION: o, x and X conversions on regular ints can never
14586 * produce a '-' sign, but can for Python's unbounded ints.
14587 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014588PyObject *
14589_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014590{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014591 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014592 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014593 Py_ssize_t i;
14594 int sign; /* 1 if '-', else 0 */
14595 int len; /* number of characters */
14596 Py_ssize_t llen;
14597 int numdigits; /* len == numnondigits + numdigits */
14598 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014599
Victor Stinnerd0880d52012-04-27 23:40:13 +020014600 /* Avoid exceeding SSIZE_T_MAX */
14601 if (prec > INT_MAX-3) {
14602 PyErr_SetString(PyExc_OverflowError,
14603 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014604 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014605 }
14606
14607 assert(PyLong_Check(val));
14608
14609 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014610 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014611 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014612 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014613 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014614 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014615 /* int and int subclasses should print numerically when a numeric */
14616 /* format code is used (see issue18780) */
14617 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014618 break;
14619 case 'o':
14620 numnondigits = 2;
14621 result = PyNumber_ToBase(val, 8);
14622 break;
14623 case 'x':
14624 case 'X':
14625 numnondigits = 2;
14626 result = PyNumber_ToBase(val, 16);
14627 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014628 }
14629 if (!result)
14630 return NULL;
14631
14632 assert(unicode_modifiable(result));
14633 assert(PyUnicode_IS_READY(result));
14634 assert(PyUnicode_IS_ASCII(result));
14635
14636 /* To modify the string in-place, there can only be one reference. */
14637 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014638 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014639 PyErr_BadInternalCall();
14640 return NULL;
14641 }
14642 buf = PyUnicode_DATA(result);
14643 llen = PyUnicode_GET_LENGTH(result);
14644 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014645 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014646 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014647 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014648 return NULL;
14649 }
14650 len = (int)llen;
14651 sign = buf[0] == '-';
14652 numnondigits += sign;
14653 numdigits = len - numnondigits;
14654 assert(numdigits > 0);
14655
14656 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014657 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014658 (type == 'o' || type == 'x' || type == 'X'))) {
14659 assert(buf[sign] == '0');
14660 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14661 buf[sign+1] == 'o');
14662 numnondigits -= 2;
14663 buf += 2;
14664 len -= 2;
14665 if (sign)
14666 buf[0] = '-';
14667 assert(len == numnondigits + numdigits);
14668 assert(numdigits > 0);
14669 }
14670
14671 /* Fill with leading zeroes to meet minimum width. */
14672 if (prec > numdigits) {
14673 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14674 numnondigits + prec);
14675 char *b1;
14676 if (!r1) {
14677 Py_DECREF(result);
14678 return NULL;
14679 }
14680 b1 = PyBytes_AS_STRING(r1);
14681 for (i = 0; i < numnondigits; ++i)
14682 *b1++ = *buf++;
14683 for (i = 0; i < prec - numdigits; i++)
14684 *b1++ = '0';
14685 for (i = 0; i < numdigits; i++)
14686 *b1++ = *buf++;
14687 *b1 = '\0';
14688 Py_DECREF(result);
14689 result = r1;
14690 buf = PyBytes_AS_STRING(result);
14691 len = numnondigits + prec;
14692 }
14693
14694 /* Fix up case for hex conversions. */
14695 if (type == 'X') {
14696 /* Need to convert all lower case letters to upper case.
14697 and need to convert 0x to 0X (and -0x to -0X). */
14698 for (i = 0; i < len; i++)
14699 if (buf[i] >= 'a' && buf[i] <= 'x')
14700 buf[i] -= 'a'-'A';
14701 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014702 if (!PyUnicode_Check(result)
14703 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014704 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014705 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014706 Py_DECREF(result);
14707 result = unicode;
14708 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014709 else if (len != PyUnicode_GET_LENGTH(result)) {
14710 if (PyUnicode_Resize(&result, len) < 0)
14711 Py_CLEAR(result);
14712 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014713 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014714}
14715
Ethan Furmandf3ed242014-01-05 06:50:30 -080014716/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014717 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014718 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014719 * -1 and raise an exception on error */
14720static int
Victor Stinnera47082312012-10-04 02:19:54 +020014721mainformatlong(PyObject *v,
14722 struct unicode_format_arg_t *arg,
14723 PyObject **p_output,
14724 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014725{
14726 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014727 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014728
14729 if (!PyNumber_Check(v))
14730 goto wrongtype;
14731
Ethan Furman9ab74802014-03-21 06:38:46 -070014732 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014733 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014734 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014735 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014736 }
14737 else {
14738 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014739 }
14740 if (iobj == NULL ) {
14741 if (PyErr_ExceptionMatches(PyExc_TypeError))
14742 goto wrongtype;
14743 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014744 }
14745 assert(PyLong_Check(iobj));
14746 }
14747 else {
14748 iobj = v;
14749 Py_INCREF(iobj);
14750 }
14751
14752 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014753 && arg->width == -1 && arg->prec == -1
14754 && !(arg->flags & (F_SIGN | F_BLANK))
14755 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014756 {
14757 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014758 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014759 int base;
14760
Victor Stinnera47082312012-10-04 02:19:54 +020014761 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014762 {
14763 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014764 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014765 case 'd':
14766 case 'i':
14767 case 'u':
14768 base = 10;
14769 break;
14770 case 'o':
14771 base = 8;
14772 break;
14773 case 'x':
14774 case 'X':
14775 base = 16;
14776 break;
14777 }
14778
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014779 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14780 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014781 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014782 }
14783 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014784 return 1;
14785 }
14786
Ethan Furmanb95b5612015-01-23 20:05:18 -080014787 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014788 Py_DECREF(iobj);
14789 if (res == NULL)
14790 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014791 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014792 return 0;
14793
14794wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014795 switch(type)
14796 {
14797 case 'o':
14798 case 'x':
14799 case 'X':
14800 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014801 "%%%c format: an integer is required, "
14802 "not %.200s",
14803 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014804 break;
14805 default:
14806 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014807 "%%%c format: a number is required, "
14808 "not %.200s",
14809 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014810 break;
14811 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014812 return -1;
14813}
14814
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014815static Py_UCS4
14816formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014817{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014818 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014819 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014820 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014821 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014822 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014823 goto onError;
14824 }
14825 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014826 int overflow;
14827 long x = PyLong_AsLongAndOverflow(v, &overflow);
14828 if (x == -1 && PyErr_Occurred()) {
14829 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014830 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014831 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014832 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014833 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014834
Victor Stinner8faf8212011-12-08 22:14:11 +010014835 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014836 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014837 PyErr_SetString(PyExc_OverflowError,
14838 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014839 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014840 }
14841
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014842 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014843 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014844
Benjamin Peterson29060642009-01-31 22:14:21 +000014845 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014846 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014847 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014848 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014849}
14850
Victor Stinnera47082312012-10-04 02:19:54 +020014851/* Parse options of an argument: flags, width, precision.
14852 Handle also "%(name)" syntax.
14853
14854 Return 0 if the argument has been formatted into arg->str.
14855 Return 1 if the argument has been written into ctx->writer,
14856 Raise an exception and return -1 on error. */
14857static int
14858unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14859 struct unicode_format_arg_t *arg)
14860{
14861#define FORMAT_READ(ctx) \
14862 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14863
14864 PyObject *v;
14865
Victor Stinnera47082312012-10-04 02:19:54 +020014866 if (arg->ch == '(') {
14867 /* Get argument value from a dictionary. Example: "%(name)s". */
14868 Py_ssize_t keystart;
14869 Py_ssize_t keylen;
14870 PyObject *key;
14871 int pcount = 1;
14872
14873 if (ctx->dict == NULL) {
14874 PyErr_SetString(PyExc_TypeError,
14875 "format requires a mapping");
14876 return -1;
14877 }
14878 ++ctx->fmtpos;
14879 --ctx->fmtcnt;
14880 keystart = ctx->fmtpos;
14881 /* Skip over balanced parentheses */
14882 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14883 arg->ch = FORMAT_READ(ctx);
14884 if (arg->ch == ')')
14885 --pcount;
14886 else if (arg->ch == '(')
14887 ++pcount;
14888 ctx->fmtpos++;
14889 }
14890 keylen = ctx->fmtpos - keystart - 1;
14891 if (ctx->fmtcnt < 0 || pcount > 0) {
14892 PyErr_SetString(PyExc_ValueError,
14893 "incomplete format key");
14894 return -1;
14895 }
14896 key = PyUnicode_Substring(ctx->fmtstr,
14897 keystart, keystart + keylen);
14898 if (key == NULL)
14899 return -1;
14900 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014901 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014902 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014903 }
14904 ctx->args = PyObject_GetItem(ctx->dict, key);
14905 Py_DECREF(key);
14906 if (ctx->args == NULL)
14907 return -1;
14908 ctx->args_owned = 1;
14909 ctx->arglen = -1;
14910 ctx->argidx = -2;
14911 }
14912
14913 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014914 while (--ctx->fmtcnt >= 0) {
14915 arg->ch = FORMAT_READ(ctx);
14916 ctx->fmtpos++;
14917 switch (arg->ch) {
14918 case '-': arg->flags |= F_LJUST; continue;
14919 case '+': arg->flags |= F_SIGN; continue;
14920 case ' ': arg->flags |= F_BLANK; continue;
14921 case '#': arg->flags |= F_ALT; continue;
14922 case '0': arg->flags |= F_ZERO; continue;
14923 }
14924 break;
14925 }
14926
14927 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014928 if (arg->ch == '*') {
14929 v = unicode_format_getnextarg(ctx);
14930 if (v == NULL)
14931 return -1;
14932 if (!PyLong_Check(v)) {
14933 PyErr_SetString(PyExc_TypeError,
14934 "* wants int");
14935 return -1;
14936 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014937 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014938 if (arg->width == -1 && PyErr_Occurred())
14939 return -1;
14940 if (arg->width < 0) {
14941 arg->flags |= F_LJUST;
14942 arg->width = -arg->width;
14943 }
14944 if (--ctx->fmtcnt >= 0) {
14945 arg->ch = FORMAT_READ(ctx);
14946 ctx->fmtpos++;
14947 }
14948 }
14949 else if (arg->ch >= '0' && arg->ch <= '9') {
14950 arg->width = arg->ch - '0';
14951 while (--ctx->fmtcnt >= 0) {
14952 arg->ch = FORMAT_READ(ctx);
14953 ctx->fmtpos++;
14954 if (arg->ch < '0' || arg->ch > '9')
14955 break;
14956 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14957 mixing signed and unsigned comparison. Since arg->ch is between
14958 '0' and '9', casting to int is safe. */
14959 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14960 PyErr_SetString(PyExc_ValueError,
14961 "width too big");
14962 return -1;
14963 }
14964 arg->width = arg->width*10 + (arg->ch - '0');
14965 }
14966 }
14967
14968 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014969 if (arg->ch == '.') {
14970 arg->prec = 0;
14971 if (--ctx->fmtcnt >= 0) {
14972 arg->ch = FORMAT_READ(ctx);
14973 ctx->fmtpos++;
14974 }
14975 if (arg->ch == '*') {
14976 v = unicode_format_getnextarg(ctx);
14977 if (v == NULL)
14978 return -1;
14979 if (!PyLong_Check(v)) {
14980 PyErr_SetString(PyExc_TypeError,
14981 "* wants int");
14982 return -1;
14983 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014984 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014985 if (arg->prec == -1 && PyErr_Occurred())
14986 return -1;
14987 if (arg->prec < 0)
14988 arg->prec = 0;
14989 if (--ctx->fmtcnt >= 0) {
14990 arg->ch = FORMAT_READ(ctx);
14991 ctx->fmtpos++;
14992 }
14993 }
14994 else if (arg->ch >= '0' && arg->ch <= '9') {
14995 arg->prec = arg->ch - '0';
14996 while (--ctx->fmtcnt >= 0) {
14997 arg->ch = FORMAT_READ(ctx);
14998 ctx->fmtpos++;
14999 if (arg->ch < '0' || arg->ch > '9')
15000 break;
15001 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15002 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015003 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015004 return -1;
15005 }
15006 arg->prec = arg->prec*10 + (arg->ch - '0');
15007 }
15008 }
15009 }
15010
15011 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15012 if (ctx->fmtcnt >= 0) {
15013 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15014 if (--ctx->fmtcnt >= 0) {
15015 arg->ch = FORMAT_READ(ctx);
15016 ctx->fmtpos++;
15017 }
15018 }
15019 }
15020 if (ctx->fmtcnt < 0) {
15021 PyErr_SetString(PyExc_ValueError,
15022 "incomplete format");
15023 return -1;
15024 }
15025 return 0;
15026
15027#undef FORMAT_READ
15028}
15029
15030/* Format one argument. Supported conversion specifiers:
15031
15032 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015033 - "i", "d", "u": int or float
15034 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015035 - "e", "E", "f", "F", "g", "G": float
15036 - "c": int or str (1 character)
15037
Victor Stinner8dbd4212012-12-04 09:30:24 +010015038 When possible, the output is written directly into the Unicode writer
15039 (ctx->writer). A string is created when padding is required.
15040
Victor Stinnera47082312012-10-04 02:19:54 +020015041 Return 0 if the argument has been formatted into *p_str,
15042 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015043 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015044static int
15045unicode_format_arg_format(struct unicode_formatter_t *ctx,
15046 struct unicode_format_arg_t *arg,
15047 PyObject **p_str)
15048{
15049 PyObject *v;
15050 _PyUnicodeWriter *writer = &ctx->writer;
15051
15052 if (ctx->fmtcnt == 0)
15053 ctx->writer.overallocate = 0;
15054
Victor Stinnera47082312012-10-04 02:19:54 +020015055 v = unicode_format_getnextarg(ctx);
15056 if (v == NULL)
15057 return -1;
15058
Victor Stinnera47082312012-10-04 02:19:54 +020015059
15060 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015061 case 's':
15062 case 'r':
15063 case 'a':
15064 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15065 /* Fast path */
15066 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15067 return -1;
15068 return 1;
15069 }
15070
15071 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15072 *p_str = v;
15073 Py_INCREF(*p_str);
15074 }
15075 else {
15076 if (arg->ch == 's')
15077 *p_str = PyObject_Str(v);
15078 else if (arg->ch == 'r')
15079 *p_str = PyObject_Repr(v);
15080 else
15081 *p_str = PyObject_ASCII(v);
15082 }
15083 break;
15084
15085 case 'i':
15086 case 'd':
15087 case 'u':
15088 case 'o':
15089 case 'x':
15090 case 'X':
15091 {
15092 int ret = mainformatlong(v, arg, p_str, writer);
15093 if (ret != 0)
15094 return ret;
15095 arg->sign = 1;
15096 break;
15097 }
15098
15099 case 'e':
15100 case 'E':
15101 case 'f':
15102 case 'F':
15103 case 'g':
15104 case 'G':
15105 if (arg->width == -1 && arg->prec == -1
15106 && !(arg->flags & (F_SIGN | F_BLANK)))
15107 {
15108 /* Fast path */
15109 if (formatfloat(v, arg, NULL, writer) == -1)
15110 return -1;
15111 return 1;
15112 }
15113
15114 arg->sign = 1;
15115 if (formatfloat(v, arg, p_str, NULL) == -1)
15116 return -1;
15117 break;
15118
15119 case 'c':
15120 {
15121 Py_UCS4 ch = formatchar(v);
15122 if (ch == (Py_UCS4) -1)
15123 return -1;
15124 if (arg->width == -1 && arg->prec == -1) {
15125 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015126 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015127 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015128 return 1;
15129 }
15130 *p_str = PyUnicode_FromOrdinal(ch);
15131 break;
15132 }
15133
15134 default:
15135 PyErr_Format(PyExc_ValueError,
15136 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015137 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015138 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15139 (int)arg->ch,
15140 ctx->fmtpos - 1);
15141 return -1;
15142 }
15143 if (*p_str == NULL)
15144 return -1;
15145 assert (PyUnicode_Check(*p_str));
15146 return 0;
15147}
15148
15149static int
15150unicode_format_arg_output(struct unicode_formatter_t *ctx,
15151 struct unicode_format_arg_t *arg,
15152 PyObject *str)
15153{
15154 Py_ssize_t len;
15155 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015156 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015157 Py_ssize_t pindex;
15158 Py_UCS4 signchar;
15159 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015160 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015161 Py_ssize_t sublen;
15162 _PyUnicodeWriter *writer = &ctx->writer;
15163 Py_UCS4 fill;
15164
15165 fill = ' ';
15166 if (arg->sign && arg->flags & F_ZERO)
15167 fill = '0';
15168
15169 if (PyUnicode_READY(str) == -1)
15170 return -1;
15171
15172 len = PyUnicode_GET_LENGTH(str);
15173 if ((arg->width == -1 || arg->width <= len)
15174 && (arg->prec == -1 || arg->prec >= len)
15175 && !(arg->flags & (F_SIGN | F_BLANK)))
15176 {
15177 /* Fast path */
15178 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15179 return -1;
15180 return 0;
15181 }
15182
15183 /* Truncate the string for "s", "r" and "a" formats
15184 if the precision is set */
15185 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15186 if (arg->prec >= 0 && len > arg->prec)
15187 len = arg->prec;
15188 }
15189
15190 /* Adjust sign and width */
15191 kind = PyUnicode_KIND(str);
15192 pbuf = PyUnicode_DATA(str);
15193 pindex = 0;
15194 signchar = '\0';
15195 if (arg->sign) {
15196 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15197 if (ch == '-' || ch == '+') {
15198 signchar = ch;
15199 len--;
15200 pindex++;
15201 }
15202 else if (arg->flags & F_SIGN)
15203 signchar = '+';
15204 else if (arg->flags & F_BLANK)
15205 signchar = ' ';
15206 else
15207 arg->sign = 0;
15208 }
15209 if (arg->width < len)
15210 arg->width = len;
15211
15212 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015213 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015214 if (!(arg->flags & F_LJUST)) {
15215 if (arg->sign) {
15216 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015217 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015218 }
15219 else {
15220 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015221 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015222 }
15223 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015224 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15225 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015226 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015227 }
15228
Victor Stinnera47082312012-10-04 02:19:54 +020015229 buflen = arg->width;
15230 if (arg->sign && len == arg->width)
15231 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015232 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015233 return -1;
15234
15235 /* Write the sign if needed */
15236 if (arg->sign) {
15237 if (fill != ' ') {
15238 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15239 writer->pos += 1;
15240 }
15241 if (arg->width > len)
15242 arg->width--;
15243 }
15244
15245 /* Write the numeric prefix for "x", "X" and "o" formats
15246 if the alternate form is used.
15247 For example, write "0x" for the "%#x" format. */
15248 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15249 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15250 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15251 if (fill != ' ') {
15252 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15253 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15254 writer->pos += 2;
15255 pindex += 2;
15256 }
15257 arg->width -= 2;
15258 if (arg->width < 0)
15259 arg->width = 0;
15260 len -= 2;
15261 }
15262
15263 /* Pad left with the fill character if needed */
15264 if (arg->width > len && !(arg->flags & F_LJUST)) {
15265 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015266 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015267 writer->pos += sublen;
15268 arg->width = len;
15269 }
15270
15271 /* If padding with spaces: write sign if needed and/or numeric prefix if
15272 the alternate form is used */
15273 if (fill == ' ') {
15274 if (arg->sign) {
15275 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15276 writer->pos += 1;
15277 }
15278 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15279 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15280 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15281 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15282 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15283 writer->pos += 2;
15284 pindex += 2;
15285 }
15286 }
15287
15288 /* Write characters */
15289 if (len) {
15290 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15291 str, pindex, len);
15292 writer->pos += len;
15293 }
15294
15295 /* Pad right with the fill character if needed */
15296 if (arg->width > len) {
15297 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015298 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015299 writer->pos += sublen;
15300 }
15301 return 0;
15302}
15303
15304/* Helper of PyUnicode_Format(): format one arg.
15305 Return 0 on success, raise an exception and return -1 on error. */
15306static int
15307unicode_format_arg(struct unicode_formatter_t *ctx)
15308{
15309 struct unicode_format_arg_t arg;
15310 PyObject *str;
15311 int ret;
15312
Victor Stinner8dbd4212012-12-04 09:30:24 +010015313 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015314 if (arg.ch == '%') {
15315 ctx->fmtpos++;
15316 ctx->fmtcnt--;
15317 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15318 return -1;
15319 return 0;
15320 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015321 arg.flags = 0;
15322 arg.width = -1;
15323 arg.prec = -1;
15324 arg.sign = 0;
15325 str = NULL;
15326
Victor Stinnera47082312012-10-04 02:19:54 +020015327 ret = unicode_format_arg_parse(ctx, &arg);
15328 if (ret == -1)
15329 return -1;
15330
15331 ret = unicode_format_arg_format(ctx, &arg, &str);
15332 if (ret == -1)
15333 return -1;
15334
15335 if (ret != 1) {
15336 ret = unicode_format_arg_output(ctx, &arg, str);
15337 Py_DECREF(str);
15338 if (ret == -1)
15339 return -1;
15340 }
15341
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015342 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015343 PyErr_SetString(PyExc_TypeError,
15344 "not all arguments converted during string formatting");
15345 return -1;
15346 }
15347 return 0;
15348}
15349
Alexander Belopolsky40018472011-02-26 01:02:56 +000015350PyObject *
15351PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015352{
Victor Stinnera47082312012-10-04 02:19:54 +020015353 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015354
Guido van Rossumd57fd912000-03-10 22:53:23 +000015355 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015356 PyErr_BadInternalCall();
15357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015358 }
Victor Stinnera47082312012-10-04 02:19:54 +020015359
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015360 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015361 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015362
15363 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015364 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15365 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15366 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15367 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015368
Victor Stinner8f674cc2013-04-17 23:02:17 +020015369 _PyUnicodeWriter_Init(&ctx.writer);
15370 ctx.writer.min_length = ctx.fmtcnt + 100;
15371 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015372
Guido van Rossumd57fd912000-03-10 22:53:23 +000015373 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015374 ctx.arglen = PyTuple_Size(args);
15375 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015376 }
15377 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015378 ctx.arglen = -1;
15379 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015380 }
Victor Stinnera47082312012-10-04 02:19:54 +020015381 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015382 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015383 ctx.dict = args;
15384 else
15385 ctx.dict = NULL;
15386 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015387
Victor Stinnera47082312012-10-04 02:19:54 +020015388 while (--ctx.fmtcnt >= 0) {
15389 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015390 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015391
15392 nonfmtpos = ctx.fmtpos++;
15393 while (ctx.fmtcnt >= 0 &&
15394 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15395 ctx.fmtpos++;
15396 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015397 }
Victor Stinnera47082312012-10-04 02:19:54 +020015398 if (ctx.fmtcnt < 0) {
15399 ctx.fmtpos--;
15400 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015401 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015402
Victor Stinnercfc4c132013-04-03 01:48:39 +020015403 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15404 nonfmtpos, ctx.fmtpos) < 0)
15405 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015406 }
15407 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015408 ctx.fmtpos++;
15409 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015410 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015411 }
15412 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015413
Victor Stinnera47082312012-10-04 02:19:54 +020015414 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015415 PyErr_SetString(PyExc_TypeError,
15416 "not all arguments converted during string formatting");
15417 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015418 }
15419
Victor Stinnera47082312012-10-04 02:19:54 +020015420 if (ctx.args_owned) {
15421 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015422 }
Victor Stinnera47082312012-10-04 02:19:54 +020015423 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015424
Benjamin Peterson29060642009-01-31 22:14:21 +000015425 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015426 _PyUnicodeWriter_Dealloc(&ctx.writer);
15427 if (ctx.args_owned) {
15428 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015429 }
15430 return NULL;
15431}
15432
Jeremy Hylton938ace62002-07-17 16:30:39 +000015433static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015434unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15435
Tim Peters6d6c1a32001-08-02 04:15:00 +000015436static PyObject *
15437unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15438{
Benjamin Peterson29060642009-01-31 22:14:21 +000015439 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015440 static char *kwlist[] = {"object", "encoding", "errors", 0};
15441 char *encoding = NULL;
15442 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015443
Benjamin Peterson14339b62009-01-31 16:36:08 +000015444 if (type != &PyUnicode_Type)
15445 return unicode_subtype_new(type, args, kwds);
15446 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015447 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015448 return NULL;
15449 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015450 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015451 if (encoding == NULL && errors == NULL)
15452 return PyObject_Str(x);
15453 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015454 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015455}
15456
Guido van Rossume023fe02001-08-30 03:12:59 +000015457static PyObject *
15458unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15459{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015460 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015461 Py_ssize_t length, char_size;
15462 int share_wstr, share_utf8;
15463 unsigned int kind;
15464 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015465
Benjamin Peterson14339b62009-01-31 16:36:08 +000015466 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015467
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015468 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015469 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015470 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015471 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015472 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015473 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015474 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015475 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015476
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015477 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015478 if (self == NULL) {
15479 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015480 return NULL;
15481 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015482 kind = PyUnicode_KIND(unicode);
15483 length = PyUnicode_GET_LENGTH(unicode);
15484
15485 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015486#ifdef Py_DEBUG
15487 _PyUnicode_HASH(self) = -1;
15488#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015489 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015490#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015491 _PyUnicode_STATE(self).interned = 0;
15492 _PyUnicode_STATE(self).kind = kind;
15493 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015494 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015495 _PyUnicode_STATE(self).ready = 1;
15496 _PyUnicode_WSTR(self) = NULL;
15497 _PyUnicode_UTF8_LENGTH(self) = 0;
15498 _PyUnicode_UTF8(self) = NULL;
15499 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015500 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015501
15502 share_utf8 = 0;
15503 share_wstr = 0;
15504 if (kind == PyUnicode_1BYTE_KIND) {
15505 char_size = 1;
15506 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15507 share_utf8 = 1;
15508 }
15509 else if (kind == PyUnicode_2BYTE_KIND) {
15510 char_size = 2;
15511 if (sizeof(wchar_t) == 2)
15512 share_wstr = 1;
15513 }
15514 else {
15515 assert(kind == PyUnicode_4BYTE_KIND);
15516 char_size = 4;
15517 if (sizeof(wchar_t) == 4)
15518 share_wstr = 1;
15519 }
15520
15521 /* Ensure we won't overflow the length. */
15522 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15523 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015524 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015525 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015526 data = PyObject_MALLOC((length + 1) * char_size);
15527 if (data == NULL) {
15528 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015529 goto onError;
15530 }
15531
Victor Stinnerc3c74152011-10-02 20:39:55 +020015532 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015533 if (share_utf8) {
15534 _PyUnicode_UTF8_LENGTH(self) = length;
15535 _PyUnicode_UTF8(self) = data;
15536 }
15537 if (share_wstr) {
15538 _PyUnicode_WSTR_LENGTH(self) = length;
15539 _PyUnicode_WSTR(self) = (wchar_t *)data;
15540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015541
Christian Heimesf051e432016-09-13 20:22:02 +020015542 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015543 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015544 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015545#ifdef Py_DEBUG
15546 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15547#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015548 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015549 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015550
15551onError:
15552 Py_DECREF(unicode);
15553 Py_DECREF(self);
15554 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015555}
15556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015557PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015558"str(object='') -> str\n\
15559str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015560\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015561Create a new string object from the given object. If encoding or\n\
15562errors is specified, then the object must expose a data buffer\n\
15563that will be decoded using the given encoding and error handler.\n\
15564Otherwise, returns the result of object.__str__() (if defined)\n\
15565or repr(object).\n\
15566encoding defaults to sys.getdefaultencoding().\n\
15567errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015568
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015569static PyObject *unicode_iter(PyObject *seq);
15570
Guido van Rossumd57fd912000-03-10 22:53:23 +000015571PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015572 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015573 "str", /* tp_name */
15574 sizeof(PyUnicodeObject), /* tp_basicsize */
15575 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015576 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015577 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015578 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015579 0, /* tp_getattr */
15580 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015581 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015582 unicode_repr, /* tp_repr */
15583 &unicode_as_number, /* tp_as_number */
15584 &unicode_as_sequence, /* tp_as_sequence */
15585 &unicode_as_mapping, /* tp_as_mapping */
15586 (hashfunc) unicode_hash, /* tp_hash*/
15587 0, /* tp_call*/
15588 (reprfunc) unicode_str, /* tp_str */
15589 PyObject_GenericGetAttr, /* tp_getattro */
15590 0, /* tp_setattro */
15591 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015592 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015593 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15594 unicode_doc, /* tp_doc */
15595 0, /* tp_traverse */
15596 0, /* tp_clear */
15597 PyUnicode_RichCompare, /* tp_richcompare */
15598 0, /* tp_weaklistoffset */
15599 unicode_iter, /* tp_iter */
15600 0, /* tp_iternext */
15601 unicode_methods, /* tp_methods */
15602 0, /* tp_members */
15603 0, /* tp_getset */
15604 &PyBaseObject_Type, /* tp_base */
15605 0, /* tp_dict */
15606 0, /* tp_descr_get */
15607 0, /* tp_descr_set */
15608 0, /* tp_dictoffset */
15609 0, /* tp_init */
15610 0, /* tp_alloc */
15611 unicode_new, /* tp_new */
15612 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015613};
15614
15615/* Initialize the Unicode implementation */
15616
Victor Stinner331a6a52019-05-27 16:39:22 +020015617PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015618_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015619{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015620 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015621 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015622 0x000A, /* LINE FEED */
15623 0x000D, /* CARRIAGE RETURN */
15624 0x001C, /* FILE SEPARATOR */
15625 0x001D, /* GROUP SEPARATOR */
15626 0x001E, /* RECORD SEPARATOR */
15627 0x0085, /* NEXT LINE */
15628 0x2028, /* LINE SEPARATOR */
15629 0x2029, /* PARAGRAPH SEPARATOR */
15630 };
15631
Victor Stinner91698d82020-06-25 14:07:40 +020015632 struct _Py_unicode_state *state = &tstate->interp->unicode;
15633 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015634 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015635 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015636
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015637 if (_Py_IsMainInterpreter(tstate)) {
15638 /* initialize the linebreak bloom filter */
15639 bloom_linebreak = make_bloom_mask(
15640 PyUnicode_2BYTE_KIND, linebreak,
15641 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015642
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015643 if (PyType_Ready(&PyUnicode_Type) < 0) {
15644 return _PyStatus_ERR("Can't initialize unicode type");
15645 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015646
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015647 if (PyType_Ready(&EncodingMapType) < 0) {
15648 return _PyStatus_ERR("Can't initialize encoding map type");
15649 }
15650 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15651 return _PyStatus_ERR("Can't initialize field name iterator type");
15652 }
15653 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15654 return _PyStatus_ERR("Can't initialize formatter iter type");
15655 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015656 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015657 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015658}
15659
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015660
Walter Dörwald16807132007-05-25 13:52:07 +000015661void
15662PyUnicode_InternInPlace(PyObject **p)
15663{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015664 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015665#ifdef Py_DEBUG
15666 assert(s != NULL);
15667 assert(_PyUnicode_CHECK(s));
15668#else
Victor Stinner607b1022020-05-05 18:50:30 +020015669 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015670 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015671 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015672#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015673
Benjamin Peterson14339b62009-01-31 16:36:08 +000015674 /* If it's a subclass, we don't really know what putting
15675 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015676 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015677 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015678 }
15679
15680 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015681 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015682 }
15683
15684#ifdef INTERNED_STRINGS
Victor Stinner666ecfb2020-07-02 01:19:57 +020015685 if (PyUnicode_READY(s) == -1) {
15686 PyErr_Clear();
15687 return;
15688 }
15689
Benjamin Peterson14339b62009-01-31 16:36:08 +000015690 if (interned == NULL) {
15691 interned = PyDict_New();
15692 if (interned == NULL) {
15693 PyErr_Clear(); /* Don't leave an exception */
15694 return;
15695 }
15696 }
Victor Stinner607b1022020-05-05 18:50:30 +020015697
15698 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015699 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015700 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015701 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015702
Berker Peksagced8d4c2016-07-25 04:40:39 +030015703 if (t == NULL) {
15704 PyErr_Clear();
15705 return;
15706 }
Victor Stinner607b1022020-05-05 18:50:30 +020015707
Berker Peksagced8d4c2016-07-25 04:40:39 +030015708 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015709 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015710 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015711 return;
15712 }
Victor Stinner607b1022020-05-05 18:50:30 +020015713
Victor Stinner3549ca32020-07-03 16:59:12 +020015714 /* The two references in interned dict (key and value) are not counted by
15715 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15716 this. */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015717 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015718 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015719#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015720}
15721
15722void
15723PyUnicode_InternImmortal(PyObject **p)
15724{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015725 PyUnicode_InternInPlace(p);
15726 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015727 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015728 Py_INCREF(*p);
15729 }
Walter Dörwald16807132007-05-25 13:52:07 +000015730}
15731
15732PyObject *
15733PyUnicode_InternFromString(const char *cp)
15734{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015735 PyObject *s = PyUnicode_FromString(cp);
15736 if (s == NULL)
15737 return NULL;
15738 PyUnicode_InternInPlace(&s);
15739 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015740}
15741
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015742
Victor Stinner666ecfb2020-07-02 01:19:57 +020015743void
15744_PyUnicode_ClearInterned(PyThreadState *tstate)
Walter Dörwald16807132007-05-25 13:52:07 +000015745{
Victor Stinner666ecfb2020-07-02 01:19:57 +020015746 if (!_Py_IsMainInterpreter(tstate)) {
15747 // interned dict is shared by all interpreters
Benjamin Peterson14339b62009-01-31 16:36:08 +000015748 return;
15749 }
Walter Dörwald16807132007-05-25 13:52:07 +000015750
Victor Stinner666ecfb2020-07-02 01:19:57 +020015751 if (interned == NULL) {
15752 return;
15753 }
15754 assert(PyDict_CheckExact(interned));
15755
15756 PyObject *keys = PyDict_Keys(interned);
15757 if (keys == NULL) {
15758 PyErr_Clear();
15759 return;
15760 }
15761 assert(PyList_CheckExact(keys));
15762
15763 /* Interned unicode strings are not forcibly deallocated; rather, we give
15764 them their stolen references back, and then clear and DECREF the
15765 interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015766
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015767 Py_ssize_t n = PyList_GET_SIZE(keys);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015768#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015769 fprintf(stderr, "releasing %zd interned strings\n", n);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015770
15771 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015772#endif
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015773 for (Py_ssize_t i = 0; i < n; i++) {
15774 PyObject *s = PyList_GET_ITEM(keys, i);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015775 assert(PyUnicode_IS_READY(s));
15776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015777 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015778 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015779 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015780#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015781 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015782#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015783 break;
15784 case SSTATE_INTERNED_MORTAL:
Victor Stinner3549ca32020-07-03 16:59:12 +020015785 // Restore the two references (key and value) ignored
15786 // by PyUnicode_InternInPlace().
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015787 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015788#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015789 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015790#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015791 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015792 case SSTATE_NOT_INTERNED:
15793 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015794 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015795 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015797 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015798 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015799#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015800 fprintf(stderr,
15801 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15802 mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015803#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015804 Py_DECREF(keys);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015805
Benjamin Peterson14339b62009-01-31 16:36:08 +000015806 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015807 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015808}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015809
15810
15811/********************* Unicode Iterator **************************/
15812
15813typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015814 PyObject_HEAD
15815 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015816 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015817} unicodeiterobject;
15818
15819static void
15820unicodeiter_dealloc(unicodeiterobject *it)
15821{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015822 _PyObject_GC_UNTRACK(it);
15823 Py_XDECREF(it->it_seq);
15824 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015825}
15826
15827static int
15828unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15829{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015830 Py_VISIT(it->it_seq);
15831 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015832}
15833
15834static PyObject *
15835unicodeiter_next(unicodeiterobject *it)
15836{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015837 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015838
Benjamin Peterson14339b62009-01-31 16:36:08 +000015839 assert(it != NULL);
15840 seq = it->it_seq;
15841 if (seq == NULL)
15842 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015843 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015845 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15846 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015847 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015848 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15849 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015850 if (item != NULL)
15851 ++it->it_index;
15852 return item;
15853 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015854
Benjamin Peterson14339b62009-01-31 16:36:08 +000015855 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015856 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015857 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015858}
15859
15860static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015861unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015862{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015863 Py_ssize_t len = 0;
15864 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015865 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015866 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015867}
15868
15869PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15870
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015871static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015872unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015873{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015874 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015875 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015876 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015877 it->it_seq, it->it_index);
15878 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015879 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015880 if (u == NULL)
15881 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015882 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015883 }
15884}
15885
15886PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15887
15888static PyObject *
15889unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15890{
15891 Py_ssize_t index = PyLong_AsSsize_t(state);
15892 if (index == -1 && PyErr_Occurred())
15893 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015894 if (it->it_seq != NULL) {
15895 if (index < 0)
15896 index = 0;
15897 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15898 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15899 it->it_index = index;
15900 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015901 Py_RETURN_NONE;
15902}
15903
15904PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15905
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015906static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015907 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015908 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015909 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15910 reduce_doc},
15911 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15912 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015913 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015914};
15915
15916PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015917 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15918 "str_iterator", /* tp_name */
15919 sizeof(unicodeiterobject), /* tp_basicsize */
15920 0, /* tp_itemsize */
15921 /* methods */
15922 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015923 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015924 0, /* tp_getattr */
15925 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015926 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015927 0, /* tp_repr */
15928 0, /* tp_as_number */
15929 0, /* tp_as_sequence */
15930 0, /* tp_as_mapping */
15931 0, /* tp_hash */
15932 0, /* tp_call */
15933 0, /* tp_str */
15934 PyObject_GenericGetAttr, /* tp_getattro */
15935 0, /* tp_setattro */
15936 0, /* tp_as_buffer */
15937 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15938 0, /* tp_doc */
15939 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15940 0, /* tp_clear */
15941 0, /* tp_richcompare */
15942 0, /* tp_weaklistoffset */
15943 PyObject_SelfIter, /* tp_iter */
15944 (iternextfunc)unicodeiter_next, /* tp_iternext */
15945 unicodeiter_methods, /* tp_methods */
15946 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015947};
15948
15949static PyObject *
15950unicode_iter(PyObject *seq)
15951{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015952 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015953
Benjamin Peterson14339b62009-01-31 16:36:08 +000015954 if (!PyUnicode_Check(seq)) {
15955 PyErr_BadInternalCall();
15956 return NULL;
15957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015958 if (PyUnicode_READY(seq) == -1)
15959 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015960 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15961 if (it == NULL)
15962 return NULL;
15963 it->it_index = 0;
15964 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015965 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015966 _PyObject_GC_TRACK(it);
15967 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015968}
15969
Victor Stinner709d23d2019-05-02 14:56:30 -040015970static int
15971encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015972{
Victor Stinner709d23d2019-05-02 14:56:30 -040015973 int res;
15974 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15975 if (res == -2) {
15976 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15977 return -1;
15978 }
15979 if (res < 0) {
15980 PyErr_NoMemory();
15981 return -1;
15982 }
15983 return 0;
15984}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015985
Victor Stinner709d23d2019-05-02 14:56:30 -040015986
15987static int
15988config_get_codec_name(wchar_t **config_encoding)
15989{
15990 char *encoding;
15991 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15992 return -1;
15993 }
15994
15995 PyObject *name_obj = NULL;
15996 PyObject *codec = _PyCodec_Lookup(encoding);
15997 PyMem_RawFree(encoding);
15998
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015999 if (!codec)
16000 goto error;
16001
16002 name_obj = PyObject_GetAttrString(codec, "name");
16003 Py_CLEAR(codec);
16004 if (!name_obj) {
16005 goto error;
16006 }
16007
Victor Stinner709d23d2019-05-02 14:56:30 -040016008 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16009 Py_DECREF(name_obj);
16010 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016011 goto error;
16012 }
16013
Victor Stinner709d23d2019-05-02 14:56:30 -040016014 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16015 if (raw_wname == NULL) {
16016 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016017 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016018 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016019 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016020
16021 PyMem_RawFree(*config_encoding);
16022 *config_encoding = raw_wname;
16023
16024 PyMem_Free(wname);
16025 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016026
16027error:
16028 Py_XDECREF(codec);
16029 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016030 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016031}
16032
16033
Victor Stinner331a6a52019-05-27 16:39:22 +020016034static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016035init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016036{
Victor Stinner709d23d2019-05-02 14:56:30 -040016037 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016038 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016039 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016040 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016041 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016042 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016043 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016044}
16045
16046
Victor Stinner709d23d2019-05-02 14:56:30 -040016047static int
16048init_fs_codec(PyInterpreterState *interp)
16049{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016050 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016051
16052 _Py_error_handler error_handler;
16053 error_handler = get_error_handler_wide(config->filesystem_errors);
16054 if (error_handler == _Py_ERROR_UNKNOWN) {
16055 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16056 return -1;
16057 }
16058
16059 char *encoding, *errors;
16060 if (encode_wstr_utf8(config->filesystem_encoding,
16061 &encoding,
16062 "filesystem_encoding") < 0) {
16063 return -1;
16064 }
16065
16066 if (encode_wstr_utf8(config->filesystem_errors,
16067 &errors,
16068 "filesystem_errors") < 0) {
16069 PyMem_RawFree(encoding);
16070 return -1;
16071 }
16072
Victor Stinner3d17c042020-05-14 01:48:38 +020016073 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16074 PyMem_RawFree(fs_codec->encoding);
16075 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016076 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016077 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16078 PyMem_RawFree(fs_codec->errors);
16079 fs_codec->errors = errors;
16080 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016081
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016082#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016083 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016084#endif
16085
Victor Stinner709d23d2019-05-02 14:56:30 -040016086 /* At this point, PyUnicode_EncodeFSDefault() and
16087 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16088 the C implementation of the filesystem encoding. */
16089
16090 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16091 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016092 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16093 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016094 PyErr_NoMemory();
16095 return -1;
16096 }
16097 return 0;
16098}
16099
16100
Victor Stinner331a6a52019-05-27 16:39:22 +020016101static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016102init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016103{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016104 PyInterpreterState *interp = tstate->interp;
16105
Victor Stinner709d23d2019-05-02 14:56:30 -040016106 /* Update the filesystem encoding to the normalized Python codec name.
16107 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16108 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016109 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016110 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016111 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016112 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016113 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016114 }
16115
Victor Stinner709d23d2019-05-02 14:56:30 -040016116 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016117 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016118 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016119 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016120}
16121
16122
Victor Stinner331a6a52019-05-27 16:39:22 +020016123PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016124_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016125{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016126 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016127 if (_PyStatus_EXCEPTION(status)) {
16128 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016129 }
16130
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016131 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016132}
16133
16134
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016135static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016136_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016137{
Victor Stinner3d17c042020-05-14 01:48:38 +020016138 PyMem_RawFree(fs_codec->encoding);
16139 fs_codec->encoding = NULL;
16140 fs_codec->utf8 = 0;
16141 PyMem_RawFree(fs_codec->errors);
16142 fs_codec->errors = NULL;
16143 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016144}
16145
16146
Victor Stinner709d23d2019-05-02 14:56:30 -040016147#ifdef MS_WINDOWS
16148int
16149_PyUnicode_EnableLegacyWindowsFSEncoding(void)
16150{
Victor Stinner81a7be32020-04-14 15:14:01 +020016151 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinnerda7933e2020-04-13 03:04:28 +020016152 PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016153
16154 /* Set the filesystem encoding to mbcs/replace (PEP 529) */
16155 wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
16156 wchar_t *errors = _PyMem_RawWcsdup(L"replace");
16157 if (encoding == NULL || errors == NULL) {
16158 PyMem_RawFree(encoding);
16159 PyMem_RawFree(errors);
16160 PyErr_NoMemory();
16161 return -1;
16162 }
16163
16164 PyMem_RawFree(config->filesystem_encoding);
16165 config->filesystem_encoding = encoding;
16166 PyMem_RawFree(config->filesystem_errors);
16167 config->filesystem_errors = errors;
16168
16169 return init_fs_codec(interp);
16170}
16171#endif
16172
16173
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016174void
Victor Stinner3d483342019-11-22 12:27:50 +010016175_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016176{
Victor Stinner666ecfb2020-07-02 01:19:57 +020016177 // _PyUnicode_ClearInterned() must be called before
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016178
Victor Stinner666ecfb2020-07-02 01:19:57 +020016179 struct _Py_unicode_state *state = &tstate->interp->unicode;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016180
Victor Stinner91698d82020-06-25 14:07:40 +020016181 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016182
Victor Stinner2f9ada92020-06-24 02:22:21 +020016183 for (Py_ssize_t i = 0; i < 256; i++) {
16184 Py_CLEAR(state->latin1[i]);
16185 }
16186
Victor Stinner666ecfb2020-07-02 01:19:57 +020016187 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016188 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016189 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016190
Victor Stinner3d17c042020-05-14 01:48:38 +020016191 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016192}
16193
16194
Georg Brandl66c221e2010-10-14 07:04:07 +000016195/* A _string module, to export formatter_parser and formatter_field_name_split
16196 to the string.Formatter class implemented in Python. */
16197
16198static PyMethodDef _string_methods[] = {
16199 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16200 METH_O, PyDoc_STR("split the argument as a field name")},
16201 {"formatter_parser", (PyCFunction) formatter_parser,
16202 METH_O, PyDoc_STR("parse the argument as a format string")},
16203 {NULL, NULL}
16204};
16205
16206static struct PyModuleDef _string_module = {
16207 PyModuleDef_HEAD_INIT,
16208 "_string",
16209 PyDoc_STR("string helper module"),
16210 0,
16211 _string_methods,
16212 NULL,
16213 NULL,
16214 NULL,
16215 NULL
16216};
16217
16218PyMODINIT_FUNC
16219PyInit__string(void)
16220{
16221 return PyModule_Create(&_string_module);
16222}
16223
16224
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016225#ifdef __cplusplus
16226}
16227#endif