blob: db3f55e02b98b56c321e068d46c274754cb46736 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner91698d82020-06-25 14:07:40 +020044#include "pycore_bytes_methods.h" // _Py_bytes_lower()
45#include "pycore_initconfig.h" // _PyStatus_OK()
Victor Stinnere5014be2020-04-14 17:52:15 +020046#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinner91698d82020-06-25 14:07:40 +020047#include "pycore_object.h" // _PyObject_GC_TRACK()
48#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
49#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
Victor Stinnere5014be2020-04-14 17:52:15 +020050#include "pycore_pystate.h" // _PyInterpreterState_GET()
Victor Stinner91698d82020-06-25 14:07:40 +020051#include "ucnhash.h" // _PyUnicode_Name_CAPI
52#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000054#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000055#include <windows.h>
56#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000057
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
60/* #define INTERNED_STATS 1 */
61
62
Larry Hastings61272b72014-01-07 12:41:53 -080063/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090064class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080065[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090066/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
67
68/*[python input]
69class Py_UCS4_converter(CConverter):
70 type = 'Py_UCS4'
71 converter = 'convert_uc'
72
73 def converter_init(self):
74 if self.default is not unspecified:
75 self.c_default = ascii(self.default)
76 if len(self.c_default) > 4 or self.c_default[0] != "'":
77 self.c_default = hex(ord(self.default))
78
79[python start generated code]*/
80/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080081
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000082/* --- Globals ------------------------------------------------------------
83
Serhiy Storchaka05997252013-01-26 12:14:02 +020084NOTE: In the interpreter's initialization phase, some globals are currently
85 initialized dynamically as needed. In the process Unicode objects may
86 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Victor Stinner8faf8212011-12-08 22:14:11 +010095/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
96#define MAX_UNICODE 0x10ffff
97
/* Sanity check used by the assertions of the accessor macros in this file:
   release builds only test the type with PyUnicode_Check(), debug builds
   (Py_DEBUG) call _PyUnicode_CheckConsistency() without the content scan. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200103
/* Raw access to the utf8 field of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for a compact ASCII object this is the
   memory located right after the PyASCIIObject header, otherwise it is the
   utf8 field. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     !PyUnicode_IS_COMPACT_ASCII(op)                        \
        ? _PyUnicode_UTF8(op)                               \
        : ((char*)((PyASCIIObject*)(op) + 1)))

/* Raw access to the utf8_length field of a PyCompactUnicodeObject
   (no checks). */
#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Length of the UTF-8 buffer of a ready string: the length field for a
   compact ASCII object, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     !PyUnicode_IS_COMPACT_ASCII(op)                        \
        ? _PyUnicode_UTF8_LENGTH(op)                        \
        : ((PyASCIIObject*)(op))->length)

/* Raw access to the wstr field of a PyASCIIObject (no checks). */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900122
/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* Length of the wchar_t representation of `op`: the length field for a
   compact ASCII object, the wstr_length field otherwise.

   Every use of `op` is parenthesized so that an argument which is itself an
   expression (for example `cond ? a : b`) is cast as a whole instead of
   having the cast bind to its first operand only.  `op` is evaluated more
   than once, so it must not have side effects. */
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw field accessors.  They only cast `op` to the structure holding the
   field; no type or readiness check is performed. */
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)      (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)       (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)        (((PyASCIIObject *)(op))->hash)

/* Checked accessors: assert _PyUnicode_CHECK(op) before reading the field. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), ((PyASCIIObject *)(op))->length)

/* Raw access to the data.any pointer of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op)    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200145
/* Local replacement for PyUnicode_READY(): evaluates to 0 when `op` is
   already ready, otherwise to the result of _PyUnicode_Ready(op).
   _PyUnicode_CHECK(op) is asserted first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ? _PyUnicode_Ready(op) : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200152
/* True if the utf8 pointer of `op` is the same pointer as its character
   data.  Asserts that `op` is not a compact ASCII object. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same pointer as its character
   data.  The comparison reads the macro parameter `op` on both sides; it
   must not refer to any variable of the calling scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
160
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op) &&                 \
     _PyUnicode_UTF8(op) &&                             \
     _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): its wstr pointer is set and the object is
   either not ready or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) &&                             \
     (!PyUnicode_IS_READY(op) ||                        \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
174
/* Generic helper macro to convert characters of different types.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (type "from_type *"); to points to the destination
   buffer (type "to_type *").  Each source character is cast to to_type and
   stored in order.  The bulk of the range is copied four characters per
   iteration, the remaining (at most three) characters one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (const from_type *)(begin);         \
        const from_type *_end = (const from_type *)(end);            \
        Py_ssize_t n = (_end) - (_iter);                             \
        const from_type *_unrolled_end =                             \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                       \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {      \
            _to[0] = (to_type) _iter[0];                             \
            _to[1] = (to_type) _iter[1];                             \
            _to[2] = (to_type) _iter[2];                             \
            _to[3] = (to_type) _iter[3];                             \
        }                                                            \
        for (; _iter < (_end); _iter++, _to++) {                     \
            *_to = (to_type) *_iter;                                 \
        }                                                            \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200198
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200199#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
201# define OVERALLOCATE_FACTOR 2
202#else
   /* On Linux, overallocating by 25% is the best factor */
204# define OVERALLOCATE_FACTOR 4
205#endif
206
Victor Stinner607b1022020-05-05 18:50:30 +0200207/* bpo-40521: Interned strings are shared by all interpreters. */
208#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
209# define INTERNED_STRINGS
210#endif
211
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Victor Stinner607b1022020-05-05 18:50:30 +0200220#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200221static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200222#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000223
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200224static struct _Py_unicode_state*
225get_unicode_state(void)
226{
227 PyInterpreterState *interp = _PyInterpreterState_GET();
228 return &interp->unicode;
229}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200230
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200232// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200233static inline PyObject* unicode_get_empty(void)
234{
235 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200236 // unicode_get_empty() must not be called before _PyUnicode_Init()
237 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200238 assert(state->empty_string != NULL);
239 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200240}
241
Victor Stinner91698d82020-06-25 14:07:40 +0200242
243// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200244static inline PyObject* unicode_new_empty(void)
245{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200246 PyObject *empty = unicode_get_empty();
247 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200248 return empty;
249}
250
251#define _Py_RETURN_UNICODE_EMPTY() \
252 do { \
253 return unicode_new_empty(); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200254 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000255
Victor Stinner59423e32018-11-26 13:40:01 +0100256static inline void
257unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
258 Py_ssize_t start, Py_ssize_t length)
259{
260 assert(0 <= start);
261 assert(kind != PyUnicode_WCHAR_KIND);
262 switch (kind) {
263 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100264 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100265 Py_UCS1 ch = (unsigned char)value;
266 Py_UCS1 *to = (Py_UCS1 *)data + start;
267 memset(to, ch, length);
268 break;
269 }
270 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100271 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100272 Py_UCS2 ch = (Py_UCS2)value;
273 Py_UCS2 *to = (Py_UCS2 *)data + start;
274 const Py_UCS2 *end = to + length;
275 for (; to < end; ++to) *to = ch;
276 break;
277 }
278 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100279 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100280 Py_UCS4 ch = value;
281 Py_UCS4 * to = (Py_UCS4 *)data + start;
282 const Py_UCS4 *end = to + length;
283 for (; to < end; ++to) *to = ch;
284 break;
285 }
286 default: Py_UNREACHABLE();
287 }
288}
289
290
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200291/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700292static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200293_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900294static inline void
295_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400296static PyObject *
297unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
298 const char *errors);
299static PyObject *
300unicode_decode_utf8(const char *s, Py_ssize_t size,
301 _Py_error_handler error_handler, const char *errors,
302 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200303
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200304/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200305static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200306
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by a 7-bit ASCII code: the entry is 1 for the
   characters listed below and 0 for every other index (0..127). */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
337
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200338/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200339static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200340static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100341static int unicode_modifiable(PyObject *unicode);
342
Victor Stinnerfe226c02011-10-03 03:52:20 +0200343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100345_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200346static PyObject *
347_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
348static PyObject *
349_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
350
351static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000353 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100354 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000355 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
356
Alexander Belopolsky40018472011-02-26 01:02:56 +0000357static void
358raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300359 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100360 PyObject *unicode,
361 Py_ssize_t startpos, Py_ssize_t endpos,
362 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000363
/* Fast detection of ASCII line break characters.

   Lookup table indexed by a 7-bit ASCII code: the entry is 1 for the
   characters listed below and 0 for every other index (0..127). */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
391
INADA Naoki3ae20562017-01-16 20:41:20 +0900392static int convert_uc(PyObject *obj, void *addr);
393
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300394#include "clinic/unicodeobject.c.h"
395
Victor Stinner3d4226a2018-08-29 22:21:32 +0200396_Py_error_handler
397_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200398{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200400 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200401 }
402 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200403 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200404 }
405 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200406 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200407 }
408 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200409 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200410 }
411 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200412 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200413 }
414 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200415 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200416 }
417 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200418 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200419 }
Victor Stinner50149202015-09-22 00:26:54 +0200420 return _Py_ERROR_OTHER;
421}
422
Victor Stinner709d23d2019-05-02 14:56:30 -0400423
424static _Py_error_handler
425get_error_handler_wide(const wchar_t *errors)
426{
427 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428 return _Py_ERROR_STRICT;
429 }
430 if (wcscmp(errors, L"surrogateescape") == 0) {
431 return _Py_ERROR_SURROGATEESCAPE;
432 }
433 if (wcscmp(errors, L"replace") == 0) {
434 return _Py_ERROR_REPLACE;
435 }
436 if (wcscmp(errors, L"ignore") == 0) {
437 return _Py_ERROR_IGNORE;
438 }
439 if (wcscmp(errors, L"backslashreplace") == 0) {
440 return _Py_ERROR_BACKSLASHREPLACE;
441 }
442 if (wcscmp(errors, L"surrogatepass") == 0) {
443 return _Py_ERROR_SURROGATEPASS;
444 }
445 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446 return _Py_ERROR_XMLCHARREFREPLACE;
447 }
448 return _Py_ERROR_OTHER;
449}
450
451
Victor Stinner22eb6892019-06-26 00:51:05 +0200452static inline int
453unicode_check_encoding_errors(const char *encoding, const char *errors)
454{
455 if (encoding == NULL && errors == NULL) {
456 return 0;
457 }
458
Victor Stinner81a7be32020-04-14 15:14:01 +0200459 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200460#ifndef Py_DEBUG
461 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200462 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200463 return 0;
464 }
465#else
466 /* Always check in debug mode */
467#endif
468
469 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200471 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200472 return 0;
473 }
474
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200475 /* Disable checks during Python finalization. For example, it allows to
476 call _PyObject_Dump() during finalization for debugging purpose. */
477 if (interp->finalizing) {
478 return 0;
479 }
480
Victor Stinner22eb6892019-06-26 00:51:05 +0200481 if (encoding != NULL) {
482 PyObject *handler = _PyCodec_Lookup(encoding);
483 if (handler == NULL) {
484 return -1;
485 }
486 Py_DECREF(handler);
487 }
488
489 if (errors != NULL) {
490 PyObject *handler = PyCodec_LookupError(errors);
491 if (handler == NULL) {
492 return -1;
493 }
494 Py_DECREF(handler);
495 }
496 return 0;
497}
498
499
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200500int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100501_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200502{
Victor Stinner68762572019-10-07 18:42:01 +0200503#define CHECK(expr) \
504 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
505
Victor Stinner910337b2011-10-03 03:20:16 +0200506 PyASCIIObject *ascii;
507 unsigned int kind;
508
Victor Stinner68762572019-10-07 18:42:01 +0200509 assert(op != NULL);
510 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200511
512 ascii = (PyASCIIObject *)op;
513 kind = ascii->state.kind;
514
Victor Stinnera3b334d2011-10-03 13:53:37 +0200515 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200516 CHECK(kind == PyUnicode_1BYTE_KIND);
517 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200518 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200519 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200520 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200521 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200522
Victor Stinnera41463c2011-10-04 01:05:08 +0200523 if (ascii->state.compact == 1) {
524 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200526 || kind == PyUnicode_2BYTE_KIND
527 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200528 CHECK(ascii->state.ascii == 0);
529 CHECK(ascii->state.ready == 1);
530 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100531 }
532 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
534
535 data = unicode->data.any;
536 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200537 CHECK(ascii->length == 0);
538 CHECK(ascii->hash == -1);
539 CHECK(ascii->state.compact == 0);
540 CHECK(ascii->state.ascii == 0);
541 CHECK(ascii->state.ready == 0);
542 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
543 CHECK(ascii->wstr != NULL);
544 CHECK(data == NULL);
545 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200546 }
547 else {
Victor Stinner68762572019-10-07 18:42:01 +0200548 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200549 || kind == PyUnicode_2BYTE_KIND
550 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200551 CHECK(ascii->state.compact == 0);
552 CHECK(ascii->state.ready == 1);
553 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200554 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200555 CHECK(compact->utf8 == data);
556 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 }
558 else
Victor Stinner68762572019-10-07 18:42:01 +0200559 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 }
561 }
562 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200563 if (
564#if SIZEOF_WCHAR_T == 2
565 kind == PyUnicode_2BYTE_KIND
566#else
567 kind == PyUnicode_4BYTE_KIND
568#endif
569 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200570 {
Victor Stinner68762572019-10-07 18:42:01 +0200571 CHECK(ascii->wstr == data);
572 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200573 } else
Victor Stinner68762572019-10-07 18:42:01 +0200574 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200575 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200576
577 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200578 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200579 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200580 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200581 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200582
583 /* check that the best kind is used: O(n) operation */
584 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200585 Py_ssize_t i;
586 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300587 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200588 Py_UCS4 ch;
589
590 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 for (i=0; i < ascii->length; i++)
592 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200593 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200594 if (ch > maxchar)
595 maxchar = ch;
596 }
597 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100598 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200599 CHECK(maxchar >= 128);
600 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100601 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200602 else
Victor Stinner68762572019-10-07 18:42:01 +0200603 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200604 }
Victor Stinner77faf692011-11-20 18:56:05 +0100605 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200606 CHECK(maxchar >= 0x100);
607 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100608 }
609 else {
Victor Stinner68762572019-10-07 18:42:01 +0200610 CHECK(maxchar >= 0x10000);
611 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100612 }
Victor Stinner68762572019-10-07 18:42:01 +0200613 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200614 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400615 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200616
617#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400618}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200619
Victor Stinner910337b2011-10-03 03:20:16 +0200620
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100621static PyObject*
622unicode_result_wchar(PyObject *unicode)
623{
624#ifndef Py_DEBUG
625 Py_ssize_t len;
626
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100627 len = _PyUnicode_WSTR_LENGTH(unicode);
628 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100629 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200630 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 }
632
633 if (len == 1) {
634 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100635 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200637 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 }
639 }
640
641 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200642 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100643 return NULL;
644 }
645#else
Victor Stinneraa771272012-10-04 02:32:58 +0200646 assert(Py_REFCNT(unicode) == 1);
647
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100648 /* don't make the result ready in debug mode to ensure that the caller
649 makes the string ready before using it */
650 assert(_PyUnicode_CheckConsistency(unicode, 1));
651#endif
652 return unicode;
653}
654
655static PyObject*
656unicode_result_ready(PyObject *unicode)
657{
658 Py_ssize_t length;
659
660 length = PyUnicode_GET_LENGTH(unicode);
661 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200662 PyObject *empty = unicode_get_empty();
663 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100664 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200665 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100666 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200667 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100668 }
669
670 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200671 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200672 if (kind == PyUnicode_1BYTE_KIND) {
673 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
674 Py_UCS1 ch = data[0];
675 struct _Py_unicode_state *state = get_unicode_state();
676 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100677 if (latin1_char != NULL) {
678 if (unicode != latin1_char) {
679 Py_INCREF(latin1_char);
680 Py_DECREF(unicode);
681 }
682 return latin1_char;
683 }
684 else {
685 assert(_PyUnicode_CheckConsistency(unicode, 1));
686 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200687 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100688 return unicode;
689 }
690 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200691 else {
692 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
693 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100694 }
695
696 assert(_PyUnicode_CheckConsistency(unicode, 1));
697 return unicode;
698}
699
700static PyObject*
701unicode_result(PyObject *unicode)
702{
703 assert(_PyUnicode_CHECK(unicode));
704 if (PyUnicode_IS_READY(unicode))
705 return unicode_result_ready(unicode);
706 else
707 return unicode_result_wchar(unicode);
708}
709
Victor Stinnerc4b49542011-12-11 22:44:26 +0100710static PyObject*
711unicode_result_unchanged(PyObject *unicode)
712{
713 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500714 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100715 return NULL;
716 Py_INCREF(unicode);
717 return unicode;
718 }
719 else
720 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100721 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100722}
723
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200724/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
725 ASCII, Latin1, UTF-8, etc. */
726static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200727backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200728 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
729{
Victor Stinnerad771582015-10-09 12:38:53 +0200730 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200731 Py_UCS4 ch;
732 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300733 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200734
735 assert(PyUnicode_IS_READY(unicode));
736 kind = PyUnicode_KIND(unicode);
737 data = PyUnicode_DATA(unicode);
738
739 size = 0;
740 /* determine replacement size */
741 for (i = collstart; i < collend; ++i) {
742 Py_ssize_t incr;
743
744 ch = PyUnicode_READ(kind, data, i);
745 if (ch < 0x100)
746 incr = 2+2;
747 else if (ch < 0x10000)
748 incr = 2+4;
749 else {
750 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200751 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200752 }
753 if (size > PY_SSIZE_T_MAX - incr) {
754 PyErr_SetString(PyExc_OverflowError,
755 "encoded result is too long for a Python string");
756 return NULL;
757 }
758 size += incr;
759 }
760
Victor Stinnerad771582015-10-09 12:38:53 +0200761 str = _PyBytesWriter_Prepare(writer, str, size);
762 if (str == NULL)
763 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200764
765 /* generate replacement */
766 for (i = collstart; i < collend; ++i) {
767 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200768 *str++ = '\\';
769 if (ch >= 0x00010000) {
770 *str++ = 'U';
771 *str++ = Py_hexdigits[(ch>>28)&0xf];
772 *str++ = Py_hexdigits[(ch>>24)&0xf];
773 *str++ = Py_hexdigits[(ch>>20)&0xf];
774 *str++ = Py_hexdigits[(ch>>16)&0xf];
775 *str++ = Py_hexdigits[(ch>>12)&0xf];
776 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777 }
Victor Stinner797485e2015-10-09 03:17:30 +0200778 else if (ch >= 0x100) {
779 *str++ = 'u';
780 *str++ = Py_hexdigits[(ch>>12)&0xf];
781 *str++ = Py_hexdigits[(ch>>8)&0xf];
782 }
783 else
784 *str++ = 'x';
785 *str++ = Py_hexdigits[(ch>>4)&0xf];
786 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200787 }
788 return str;
789}
790
791/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
792 ASCII, Latin1, UTF-8, etc. */
793static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200794xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200795 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
796{
Victor Stinnerad771582015-10-09 12:38:53 +0200797 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200798 Py_UCS4 ch;
799 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300800 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200801
802 assert(PyUnicode_IS_READY(unicode));
803 kind = PyUnicode_KIND(unicode);
804 data = PyUnicode_DATA(unicode);
805
806 size = 0;
807 /* determine replacement size */
808 for (i = collstart; i < collend; ++i) {
809 Py_ssize_t incr;
810
811 ch = PyUnicode_READ(kind, data, i);
812 if (ch < 10)
813 incr = 2+1+1;
814 else if (ch < 100)
815 incr = 2+2+1;
816 else if (ch < 1000)
817 incr = 2+3+1;
818 else if (ch < 10000)
819 incr = 2+4+1;
820 else if (ch < 100000)
821 incr = 2+5+1;
822 else if (ch < 1000000)
823 incr = 2+6+1;
824 else {
825 assert(ch <= MAX_UNICODE);
826 incr = 2+7+1;
827 }
828 if (size > PY_SSIZE_T_MAX - incr) {
829 PyErr_SetString(PyExc_OverflowError,
830 "encoded result is too long for a Python string");
831 return NULL;
832 }
833 size += incr;
834 }
835
Victor Stinnerad771582015-10-09 12:38:53 +0200836 str = _PyBytesWriter_Prepare(writer, str, size);
837 if (str == NULL)
838 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200839
840 /* generate replacement */
841 for (i = collstart; i < collend; ++i) {
842 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
843 }
844 return str;
845}
846
Thomas Wouters477c8d52006-05-27 19:21:47 +0000847/* --- Bloom Filters ----------------------------------------------------- */
848
849/* stuff to implement simple "bloom filters" for Unicode characters.
850 to keep things simple, we use a single bitmask, using the least 5
851 bits from each unicode characters as the bit index. */
852
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200853/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000854
Antoine Pitrouf068f942010-01-13 14:19:12 +0000855#if LONG_BIT >= 128
856#define BLOOM_WIDTH 128
857#elif LONG_BIT >= 64
858#define BLOOM_WIDTH 64
859#elif LONG_BIT >= 32
860#define BLOOM_WIDTH 32
861#else
862#error "LONG_BIT is smaller than 32"
863#endif
864
Thomas Wouters477c8d52006-05-27 19:21:47 +0000865#define BLOOM_MASK unsigned long
866
Serhiy Storchaka05997252013-01-26 12:14:02 +0200867static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000868
Antoine Pitrouf068f942010-01-13 14:19:12 +0000869#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000870
Benjamin Peterson29060642009-01-31 22:14:21 +0000871#define BLOOM_LINEBREAK(ch) \
872 ((ch) < 128U ? ascii_linebreak[(ch)] : \
873 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000874
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700875static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300876make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877{
Victor Stinnera85af502013-04-09 21:53:54 +0200878#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
879 do { \
880 TYPE *data = (TYPE *)PTR; \
881 TYPE *end = data + LEN; \
882 Py_UCS4 ch; \
883 for (; data != end; data++) { \
884 ch = *data; \
885 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
886 } \
887 break; \
888 } while (0)
889
Thomas Wouters477c8d52006-05-27 19:21:47 +0000890 /* calculate simple bloom-style bitmask for a given unicode string */
891
Antoine Pitrouf068f942010-01-13 14:19:12 +0000892 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000893
894 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200895 switch (kind) {
896 case PyUnicode_1BYTE_KIND:
897 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
898 break;
899 case PyUnicode_2BYTE_KIND:
900 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
901 break;
902 case PyUnicode_4BYTE_KIND:
903 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
904 break;
905 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700906 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200907 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000908 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200909
910#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000911}
912
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300913static int
914ensure_unicode(PyObject *obj)
915{
916 if (!PyUnicode_Check(obj)) {
917 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200918 "must be str, not %.100s",
919 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300920 return -1;
921 }
922 return PyUnicode_READY(obj);
923}
924
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200925/* Compilation of templated routines */
926
Victor Stinner90ed8a62020-06-24 00:34:07 +0200927#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200928
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200929#include "stringlib/asciilib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
935#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/undef.h"
937
938#include "stringlib/ucs1lib.h"
939#include "stringlib/fastsearch.h"
940#include "stringlib/partition.h"
941#include "stringlib/split.h"
942#include "stringlib/count.h"
943#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300944#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200945#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/undef.h"
947
948#include "stringlib/ucs2lib.h"
949#include "stringlib/fastsearch.h"
950#include "stringlib/partition.h"
951#include "stringlib/split.h"
952#include "stringlib/count.h"
953#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300954#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200955#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200956#include "stringlib/undef.h"
957
958#include "stringlib/ucs4lib.h"
959#include "stringlib/fastsearch.h"
960#include "stringlib/partition.h"
961#include "stringlib/split.h"
962#include "stringlib/count.h"
963#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300964#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200965#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200966#include "stringlib/undef.h"
967
Inada Naoki2c4928d2020-06-17 20:09:44 +0900968_Py_COMP_DIAG_PUSH
969_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970#include "stringlib/unicodedefs.h"
971#include "stringlib/fastsearch.h"
972#include "stringlib/count.h"
973#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100974#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900975_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200976
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200977#undef STRINGLIB_GET_EMPTY
978
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979/* --- Unicode Object ----------------------------------------------------- */
980
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700981static inline Py_ssize_t
982findchar(const void *s, int kind,
983 Py_ssize_t size, Py_UCS4 ch,
984 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 switch (kind) {
987 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200988 if ((Py_UCS1) ch != ch)
989 return -1;
990 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600991 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600993 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200994 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200995 if ((Py_UCS2) ch != ch)
996 return -1;
997 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600998 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001000 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001001 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001004 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001006 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001007 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009}
1010
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters between 'old_length' and the current length are
   overwritten, every byte being set to 0xff; nothing happens if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind value doubles as the character size in bytes */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030static PyObject*
1031resize_compact(PyObject *unicode, Py_ssize_t length)
1032{
1033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035 Py_ssize_t new_size;
1036 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001037 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1040#endif
1041
Victor Stinner79891572012-05-03 13:43:07 +02001042 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001044 assert(PyUnicode_IS_COMPACT(unicode));
1045
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001046 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001047 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001048 struct_size = sizeof(PyASCIIObject);
1049 else
1050 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001051 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1054 PyErr_NoMemory();
1055 return NULL;
1056 }
1057 new_size = (struct_size + (length + 1) * char_size);
1058
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001059 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1060 PyObject_DEL(_PyUnicode_UTF8(unicode));
1061 _PyUnicode_UTF8(unicode) = NULL;
1062 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1063 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001064#ifdef Py_REF_DEBUG
1065 _Py_RefTotal--;
1066#endif
1067#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001068 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001069#endif
Victor Stinner84def372011-12-11 20:04:56 +01001070
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001071 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001072 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001073 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 PyErr_NoMemory();
1075 return NULL;
1076 }
Victor Stinner84def372011-12-11 20:04:56 +01001077 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001079
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001081 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001083 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 _PyUnicode_WSTR_LENGTH(unicode) = length;
1085 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001086 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1087 PyObject_DEL(_PyUnicode_WSTR(unicode));
1088 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001089 if (!PyUnicode_IS_ASCII(unicode))
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001091 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001092#ifdef Py_DEBUG
1093 unicode_fill_invalid(unicode, old_length);
1094#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1096 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098 return unicode;
1099}
1100
Alexander Belopolsky40018472011-02-26 01:02:56 +00001101static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103{
Victor Stinner95663112011-10-04 01:03:50 +02001104 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001108
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109 if (PyUnicode_IS_READY(unicode)) {
1110 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001113#ifdef Py_DEBUG
1114 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1115#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116
1117 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001118 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1120 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121
1122 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1123 PyErr_NoMemory();
1124 return -1;
1125 }
1126 new_size = (length + 1) * char_size;
1127
Victor Stinner7a9105a2011-12-12 00:13:42 +01001128 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1129 {
1130 PyObject_DEL(_PyUnicode_UTF8(unicode));
1131 _PyUnicode_UTF8(unicode) = NULL;
1132 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1133 }
1134
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 data = (PyObject *)PyObject_REALLOC(data, new_size);
1136 if (data == NULL) {
1137 PyErr_NoMemory();
1138 return -1;
1139 }
1140 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001141 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001142 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001143 _PyUnicode_WSTR_LENGTH(unicode) = length;
1144 }
1145 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001146 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_UTF8_LENGTH(unicode) = length;
1148 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 _PyUnicode_LENGTH(unicode) = length;
1150 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001151#ifdef Py_DEBUG
1152 unicode_fill_invalid(unicode, old_length);
1153#endif
Victor Stinner95663112011-10-04 01:03:50 +02001154 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001155 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001158 }
Victor Stinner95663112011-10-04 01:03:50 +02001159 assert(_PyUnicode_WSTR(unicode) != NULL);
1160
1161 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001162 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001166 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001167 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001169 if (!wstr) {
1170 PyErr_NoMemory();
1171 return -1;
1172 }
1173 _PyUnicode_WSTR(unicode) = wstr;
1174 _PyUnicode_WSTR(unicode)[length] = 0;
1175 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001176 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return 0;
1178}
1179
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180static PyObject*
1181resize_copy(PyObject *unicode, Py_ssize_t length)
1182{
1183 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001184 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001187 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188
1189 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1190 if (copy == NULL)
1191 return NULL;
1192
1193 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001194 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001196 }
1197 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001198 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001199
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001200 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 if (w == NULL)
1202 return NULL;
1203 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1204 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001205 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001206 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001207 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 }
1209}
1210
/* We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
1219
Alexander Belopolsky40018472011-02-26 01:02:56 +00001220static PyUnicodeObject *
1221_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001223 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
Thomas Wouters477c8d52006-05-27 19:21:47 +00001226 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001227 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001228 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 }
1230
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001231 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001232 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001233 return (PyUnicodeObject *)PyErr_NoMemory();
1234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 if (length < 0) {
1236 PyErr_SetString(PyExc_SystemError,
1237 "Negative size passed to _PyUnicode_New");
1238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 }
1240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1242 if (unicode == NULL)
1243 return NULL;
1244 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001245
1246 _PyUnicode_WSTR_LENGTH(unicode) = length;
1247 _PyUnicode_HASH(unicode) = -1;
1248 _PyUnicode_STATE(unicode).interned = 0;
1249 _PyUnicode_STATE(unicode).kind = 0;
1250 _PyUnicode_STATE(unicode).compact = 0;
1251 _PyUnicode_STATE(unicode).ready = 0;
1252 _PyUnicode_STATE(unicode).ascii = 0;
1253 _PyUnicode_DATA_ANY(unicode) = NULL;
1254 _PyUnicode_LENGTH(unicode) = 0;
1255 _PyUnicode_UTF8(unicode) = NULL;
1256 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1259 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001260 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001261 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001262 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264
Jeremy Hyltond8082792003-09-16 19:41:39 +00001265 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001266 * the caller fails before initializing str -- unicode_resize()
1267 * reads str[0], and the Keep-Alive optimization can keep memory
1268 * allocated for str alive across a call to unicode_dealloc(unicode).
1269 * We don't want unicode_resize to read uninitialized memory in
1270 * that case.
1271 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 _PyUnicode_WSTR(unicode)[0] = 0;
1273 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001274
Victor Stinner7931d9a2011-11-04 00:22:48 +01001275 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 return unicode;
1277}
1278
Victor Stinnerf42dc442011-10-02 23:33:16 +02001279static const char*
1280unicode_kind_name(PyObject *unicode)
1281{
Victor Stinner42dfd712011-10-03 14:41:45 +02001282 /* don't check consistency: unicode_kind_name() is called from
1283 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 if (!PyUnicode_IS_COMPACT(unicode))
1285 {
1286 if (!PyUnicode_IS_READY(unicode))
1287 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001288 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 {
1290 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001291 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001292 return "legacy ascii";
1293 else
1294 return "legacy latin1";
1295 case PyUnicode_2BYTE_KIND:
1296 return "legacy UCS2";
1297 case PyUnicode_4BYTE_KIND:
1298 return "legacy UCS4";
1299 default:
1300 return "<legacy invalid kind>";
1301 }
1302 }
1303 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001304 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001305 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001306 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001307 return "ascii";
1308 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001309 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001310 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001311 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001312 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001313 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001314 default:
1315 return "<invalid compact kind>";
1316 }
1317}
1318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
const char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1325
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
const void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001330const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001331 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001332 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1334 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1335 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1336 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1337 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1338 return PyUnicode_DATA(unicode);
1339}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001340
1341void
1342_PyUnicode_Dump(PyObject *op)
1343{
1344 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001345 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1346 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001347 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001348
Victor Stinnera849a4b2011-10-03 12:12:11 +02001349 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001350 {
1351 if (ascii->state.ascii)
1352 data = (ascii + 1);
1353 else
1354 data = (compact + 1);
1355 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001356 else
1357 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001358 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001359
Victor Stinnera849a4b2011-10-03 12:12:11 +02001360 if (ascii->wstr == data)
1361 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001362 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001363
Victor Stinnera3b334d2011-10-03 13:53:37 +02001364 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001365 printf(" (%zu), ", compact->wstr_length);
1366 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001367 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001368 }
1369 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001370 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001371 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001372}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373#endif
1374
Victor Stinner91698d82020-06-25 14:07:40 +02001375static int
1376unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1377{
1378 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1379 // optimized to always use state->empty_string without having to check if
1380 // it is NULL or not.
1381 PyObject *empty = PyUnicode_New(1, 0);
1382 if (empty == NULL) {
1383 return -1;
1384 }
1385 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1386 _PyUnicode_LENGTH(empty) = 0;
1387 assert(_PyUnicode_CheckConsistency(empty, 1));
1388
1389 assert(state->empty_string == NULL);
1390 state->empty_string = empty;
1391 return 0;
1392}
1393
1394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395PyObject *
1396PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1397{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001398 /* Optimization for empty strings */
1399 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001400 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001401 }
1402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 PyObject *obj;
1404 PyCompactUnicodeObject *unicode;
1405 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001406 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001407 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 Py_ssize_t char_size;
1409 Py_ssize_t struct_size;
1410
Victor Stinner9e9d6892011-10-04 01:02:02 +02001411 is_ascii = 0;
1412 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 struct_size = sizeof(PyCompactUnicodeObject);
1414 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001415 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 char_size = 1;
1417 is_ascii = 1;
1418 struct_size = sizeof(PyASCIIObject);
1419 }
1420 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001421 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 char_size = 1;
1423 }
1424 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001425 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 char_size = 2;
1427 if (sizeof(wchar_t) == 2)
1428 is_sharing = 1;
1429 }
1430 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001431 if (maxchar > MAX_UNICODE) {
1432 PyErr_SetString(PyExc_SystemError,
1433 "invalid maximum character passed to PyUnicode_New");
1434 return NULL;
1435 }
Victor Stinner8f825062012-04-27 13:55:39 +02001436 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 char_size = 4;
1438 if (sizeof(wchar_t) == 4)
1439 is_sharing = 1;
1440 }
1441
1442 /* Ensure we won't overflow the size. */
1443 if (size < 0) {
1444 PyErr_SetString(PyExc_SystemError,
1445 "Negative size passed to PyUnicode_New");
1446 return NULL;
1447 }
1448 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1449 return PyErr_NoMemory();
1450
1451 /* Duplicated allocation code from _PyObject_New() instead of a call to
1452 * PyObject_New() so we are able to allocate space for the object and
1453 * it's data buffer.
1454 */
1455 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001456 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001458 }
1459 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460
1461 unicode = (PyCompactUnicodeObject *)obj;
1462 if (is_ascii)
1463 data = ((PyASCIIObject*)obj) + 1;
1464 else
1465 data = unicode + 1;
1466 _PyUnicode_LENGTH(unicode) = size;
1467 _PyUnicode_HASH(unicode) = -1;
1468 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001469 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 _PyUnicode_STATE(unicode).compact = 1;
1471 _PyUnicode_STATE(unicode).ready = 1;
1472 _PyUnicode_STATE(unicode).ascii = is_ascii;
1473 if (is_ascii) {
1474 ((char*)data)[size] = 0;
1475 _PyUnicode_WSTR(unicode) = NULL;
1476 }
Victor Stinner8f825062012-04-27 13:55:39 +02001477 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 ((char*)data)[size] = 0;
1479 _PyUnicode_WSTR(unicode) = NULL;
1480 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001482 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 else {
1485 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001486 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001487 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001489 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 ((Py_UCS4*)data)[size] = 0;
1491 if (is_sharing) {
1492 _PyUnicode_WSTR_LENGTH(unicode) = size;
1493 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1494 }
1495 else {
1496 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1497 _PyUnicode_WSTR(unicode) = NULL;
1498 }
1499 }
Victor Stinner8f825062012-04-27 13:55:39 +02001500#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001501 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001502#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001503 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 return obj;
1505}
1506
#if SIZEOF_WCHAR_T == 2
/* Helper converting a 16-bit wchar_t buffer [begin, end) into the UCS4 data
   of `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into a single code point; every other unit is copied as is.  The
   other conversions are implemented as macros for efficiency.

   `unicode` must already be a 4-byte kind string whose length is exactly the
   number of code points produced (asserted).  This function assumes that
   unicode can hold one more code point than wstr characters for a
   terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);

    Py_UCS4 *out = PyUnicode_4BYTE_DATA(unicode);
    const wchar_t *in = begin;

    while (in < end) {
        assert(out < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* in[1] is only read once it is known to be inside the buffer. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(in[0])
            && (in + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(in[1]))
        {
            *out++ = Py_UNICODE_JOIN_SURROGATES(in[0], in[1]);
            in += 2;
        }
        else {
            *out++ = *in;
            in++;
        }
    }
    assert(out == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1546
Victor Stinnercd9950f2011-10-02 00:34:53 +02001547static int
Victor Stinner488fa492011-12-12 00:01:39 +01001548unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001549{
Victor Stinner488fa492011-12-12 00:01:39 +01001550 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001551 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001552 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001553 return -1;
1554 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001555 return 0;
1556}
1557
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558static int
1559_copy_characters(PyObject *to, Py_ssize_t to_start,
1560 PyObject *from, Py_ssize_t from_start,
1561 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001563 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001564 const void *from_data;
1565 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566
Victor Stinneree4544c2012-05-09 22:24:08 +02001567 assert(0 <= how_many);
1568 assert(0 <= from_start);
1569 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001571 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001572 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573
Victor Stinnerd3f08822012-05-29 12:57:52 +02001574 assert(PyUnicode_Check(to));
1575 assert(PyUnicode_IS_READY(to));
1576 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1577
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001578 if (how_many == 0)
1579 return 0;
1580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001582 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001584 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585
Victor Stinnerf1852262012-06-16 16:38:26 +02001586#ifdef Py_DEBUG
1587 if (!check_maxchar
1588 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1589 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001590 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001591 Py_UCS4 ch;
1592 Py_ssize_t i;
1593 for (i=0; i < how_many; i++) {
1594 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1595 assert(ch <= to_maxchar);
1596 }
1597 }
1598#endif
1599
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001600 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001601 if (check_maxchar
1602 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1603 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001604 /* Writing Latin-1 characters into an ASCII string requires to
1605 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001606 Py_UCS4 max_char;
1607 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001608 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001609 if (max_char >= 128)
1610 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001611 }
Christian Heimesf051e432016-09-13 20:22:02 +02001612 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001613 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001614 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001615 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001616 else if (from_kind == PyUnicode_1BYTE_KIND
1617 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001618 {
1619 _PyUnicode_CONVERT_BYTES(
1620 Py_UCS1, Py_UCS2,
1621 PyUnicode_1BYTE_DATA(from) + from_start,
1622 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1623 PyUnicode_2BYTE_DATA(to) + to_start
1624 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001625 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001626 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001627 && to_kind == PyUnicode_4BYTE_KIND)
1628 {
1629 _PyUnicode_CONVERT_BYTES(
1630 Py_UCS1, Py_UCS4,
1631 PyUnicode_1BYTE_DATA(from) + from_start,
1632 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1633 PyUnicode_4BYTE_DATA(to) + to_start
1634 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001635 }
1636 else if (from_kind == PyUnicode_2BYTE_KIND
1637 && to_kind == PyUnicode_4BYTE_KIND)
1638 {
1639 _PyUnicode_CONVERT_BYTES(
1640 Py_UCS2, Py_UCS4,
1641 PyUnicode_2BYTE_DATA(from) + from_start,
1642 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1643 PyUnicode_4BYTE_DATA(to) + to_start
1644 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001645 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001646 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001647 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1648
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001649 if (!check_maxchar) {
1650 if (from_kind == PyUnicode_2BYTE_KIND
1651 && to_kind == PyUnicode_1BYTE_KIND)
1652 {
1653 _PyUnicode_CONVERT_BYTES(
1654 Py_UCS2, Py_UCS1,
1655 PyUnicode_2BYTE_DATA(from) + from_start,
1656 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1657 PyUnicode_1BYTE_DATA(to) + to_start
1658 );
1659 }
1660 else if (from_kind == PyUnicode_4BYTE_KIND
1661 && to_kind == PyUnicode_1BYTE_KIND)
1662 {
1663 _PyUnicode_CONVERT_BYTES(
1664 Py_UCS4, Py_UCS1,
1665 PyUnicode_4BYTE_DATA(from) + from_start,
1666 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1667 PyUnicode_1BYTE_DATA(to) + to_start
1668 );
1669 }
1670 else if (from_kind == PyUnicode_4BYTE_KIND
1671 && to_kind == PyUnicode_2BYTE_KIND)
1672 {
1673 _PyUnicode_CONVERT_BYTES(
1674 Py_UCS4, Py_UCS2,
1675 PyUnicode_4BYTE_DATA(from) + from_start,
1676 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1677 PyUnicode_2BYTE_DATA(to) + to_start
1678 );
1679 }
1680 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001681 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001682 }
1683 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001684 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001685 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001686 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001687 Py_ssize_t i;
1688
Victor Stinnera0702ab2011-09-29 14:14:38 +02001689 for (i=0; i < how_many; i++) {
1690 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001691 if (ch > to_maxchar)
1692 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001693 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1694 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001695 }
1696 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 return 0;
1698}
1699
Victor Stinnerd3f08822012-05-29 12:57:52 +02001700void
1701_PyUnicode_FastCopyCharacters(
1702 PyObject *to, Py_ssize_t to_start,
1703 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001704{
1705 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1706}
1707
1708Py_ssize_t
1709PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1710 PyObject *from, Py_ssize_t from_start,
1711 Py_ssize_t how_many)
1712{
1713 int err;
1714
1715 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1716 PyErr_BadInternalCall();
1717 return -1;
1718 }
1719
Benjamin Petersonbac79492012-01-14 13:34:47 -05001720 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001721 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001722 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001723 return -1;
1724
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001725 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001726 PyErr_SetString(PyExc_IndexError, "string index out of range");
1727 return -1;
1728 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001729 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001730 PyErr_SetString(PyExc_IndexError, "string index out of range");
1731 return -1;
1732 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001733 if (how_many < 0) {
1734 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1735 return -1;
1736 }
1737 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001738 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1739 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001740 "Cannot write %zi characters at %zi "
1741 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001742 how_many, to_start, PyUnicode_GET_LENGTH(to));
1743 return -1;
1744 }
1745
1746 if (how_many == 0)
1747 return 0;
1748
Victor Stinner488fa492011-12-12 00:01:39 +01001749 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001750 return -1;
1751
1752 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1753 if (err) {
1754 PyErr_Format(PyExc_SystemError,
1755 "Cannot copy %s characters "
1756 "into a string of %s characters",
1757 unicode_kind_name(from),
1758 unicode_kind_name(to));
1759 return -1;
1760 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001761 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762}
1763
Victor Stinner17222162011-09-28 22:15:37 +02001764/* Find the maximum code point and count the number of surrogate pairs so a
1765 correct string length can be computed before converting a string to UCS4.
1766 This function counts single surrogates as a character and not as a pair.
1767
1768 Return 0 on success, or -1 on error. */
1769static int
1770find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1771 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772{
1773 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001774 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775
Victor Stinnerc53be962011-10-02 21:33:54 +02001776 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 *num_surrogates = 0;
1778 *maxchar = 0;
1779
1780 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001782 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1783 && (iter+1) < end
1784 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1785 {
1786 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1787 ++(*num_surrogates);
1788 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 }
1790 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001792 {
1793 ch = *iter;
1794 iter++;
1795 }
1796 if (ch > *maxchar) {
1797 *maxchar = ch;
1798 if (*maxchar > MAX_UNICODE) {
1799 PyErr_Format(PyExc_ValueError,
1800 "character U+%x is not in range [U+0000; U+10ffff]",
1801 ch);
1802 return -1;
1803 }
1804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 }
1806 return 0;
1807}
1808
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001809int
1810_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811{
1812 wchar_t *end;
1813 Py_UCS4 maxchar = 0;
1814 Py_ssize_t num_surrogates;
1815#if SIZEOF_WCHAR_T == 2
1816 Py_ssize_t length_wo_surrogates;
1817#endif
1818
Georg Brandl7597add2011-10-05 16:36:47 +02001819 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 strings were created using _PyObject_New() and where no canonical
1821 representation (the str field) has been set yet aka strings
1822 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001823 assert(_PyUnicode_CHECK(unicode));
1824 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001826 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001827 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001828 /* Actually, it should neither be interned nor be anything else: */
1829 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001832 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001833 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835
1836 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1838 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 PyErr_NoMemory();
1840 return -1;
1841 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001842 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843 _PyUnicode_WSTR(unicode), end,
1844 PyUnicode_1BYTE_DATA(unicode));
1845 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1846 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1847 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1848 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001849 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001850 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001851 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001852 }
1853 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001854 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001855 _PyUnicode_UTF8(unicode) = NULL;
1856 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 }
1858 PyObject_FREE(_PyUnicode_WSTR(unicode));
1859 _PyUnicode_WSTR(unicode) = NULL;
1860 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1861 }
1862 /* In this case we might have to convert down from 4-byte native
1863 wchar_t to 2-byte unicode. */
1864 else if (maxchar < 65536) {
1865 assert(num_surrogates == 0 &&
1866 "FindMaxCharAndNumSurrogatePairs() messed up");
1867
Victor Stinner506f5922011-09-28 22:34:18 +02001868#if SIZEOF_WCHAR_T == 2
1869 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001870 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001871 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1872 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1873 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001874 _PyUnicode_UTF8(unicode) = NULL;
1875 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001876#else
1877 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001878 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001879 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001880 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001881 PyErr_NoMemory();
1882 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 }
Victor Stinner506f5922011-09-28 22:34:18 +02001884 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1885 _PyUnicode_WSTR(unicode), end,
1886 PyUnicode_2BYTE_DATA(unicode));
1887 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1888 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1889 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001890 _PyUnicode_UTF8(unicode) = NULL;
1891 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001892 PyObject_FREE(_PyUnicode_WSTR(unicode));
1893 _PyUnicode_WSTR(unicode) = NULL;
1894 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1895#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 }
1897 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1898 else {
1899#if SIZEOF_WCHAR_T == 2
1900 /* in case the native representation is 2-bytes, we need to allocate a
1901 new normalized 4-byte version. */
1902 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001903 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1904 PyErr_NoMemory();
1905 return -1;
1906 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001907 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1908 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 PyErr_NoMemory();
1910 return -1;
1911 }
1912 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1913 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001914 _PyUnicode_UTF8(unicode) = NULL;
1915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001916 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1917 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001918 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 PyObject_FREE(_PyUnicode_WSTR(unicode));
1920 _PyUnicode_WSTR(unicode) = NULL;
1921 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1922#else
1923 assert(num_surrogates == 0);
1924
Victor Stinnerc3c74152011-10-02 20:39:55 +02001925 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001927 _PyUnicode_UTF8(unicode) = NULL;
1928 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1930#endif
1931 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1932 }
1933 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001934 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 return 0;
1936}
1937
Alexander Belopolsky40018472011-02-26 01:02:56 +00001938static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001939unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940{
Walter Dörwald16807132007-05-25 13:52:07 +00001941 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 case SSTATE_NOT_INTERNED:
1943 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001944
Benjamin Peterson29060642009-01-31 22:14:21 +00001945 case SSTATE_INTERNED_MORTAL:
1946 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001947 Py_SET_REFCNT(unicode, 3);
Victor Stinner607b1022020-05-05 18:50:30 +02001948#ifdef INTERNED_STRINGS
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001949 if (PyDict_DelItem(interned, unicode) != 0) {
1950 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1951 NULL);
1952 }
Victor Stinner607b1022020-05-05 18:50:30 +02001953#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001954 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001955
Benjamin Peterson29060642009-01-31 22:14:21 +00001956 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001957 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1958 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001959
Benjamin Peterson29060642009-01-31 22:14:21 +00001960 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001961 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001962 }
1963
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001965 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001966 }
1967 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001968 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001969 }
1970 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001971 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001974 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975}
1976
#ifdef Py_DEBUG
/* Return 1 if `unicode` is the empty string singleton or one of the cached
   single character strings of state->latin1, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();
    if (state->empty_string == unicode) {
        return 1;
    }

    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    /* Only strings of length 1 that are not of the legacy wchar_t kind are
       looked up in the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && state->latin1[ch] == unicode) ? 1 : 0;
}
#endif
1996
Alexander Belopolsky40018472011-02-26 01:02:56 +00001997static int
Victor Stinner488fa492011-12-12 00:01:39 +01001998unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999{
Victor Stinner488fa492011-12-12 00:01:39 +01002000 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002001 if (Py_REFCNT(unicode) != 1)
2002 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002003 if (_PyUnicode_HASH(unicode) != -1)
2004 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002005 if (PyUnicode_CHECK_INTERNED(unicode))
2006 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002007 if (!PyUnicode_CheckExact(unicode))
2008 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002009#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002010 /* singleton refcount is greater than 1 */
2011 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002012#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002013 return 1;
2014}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002015
Victor Stinnerfe226c02011-10-03 03:52:20 +02002016static int
2017unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2018{
2019 PyObject *unicode;
2020 Py_ssize_t old_length;
2021
2022 assert(p_unicode != NULL);
2023 unicode = *p_unicode;
2024
2025 assert(unicode != NULL);
2026 assert(PyUnicode_Check(unicode));
2027 assert(0 <= length);
2028
Victor Stinner910337b2011-10-03 03:20:16 +02002029 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002030 old_length = PyUnicode_WSTR_LENGTH(unicode);
2031 else
2032 old_length = PyUnicode_GET_LENGTH(unicode);
2033 if (old_length == length)
2034 return 0;
2035
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002036 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002037 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002038 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002039 return 0;
2040 }
2041
Victor Stinner488fa492011-12-12 00:01:39 +01002042 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002043 PyObject *copy = resize_copy(unicode, length);
2044 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002045 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002046 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002047 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002048 }
2049
Victor Stinnerfe226c02011-10-03 03:52:20 +02002050 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002051 PyObject *new_unicode = resize_compact(unicode, length);
2052 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002053 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002054 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002055 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002056 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002057 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002058}
2059
Alexander Belopolsky40018472011-02-26 01:02:56 +00002060int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002061PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002062{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002063 PyObject *unicode;
2064 if (p_unicode == NULL) {
2065 PyErr_BadInternalCall();
2066 return -1;
2067 }
2068 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002069 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002070 {
2071 PyErr_BadInternalCall();
2072 return -1;
2073 }
2074 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002075}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002076
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002077/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002078
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002079 WARNING: The function doesn't copy the terminating null character and
2080 doesn't check the maximum character (may write a latin1 character in an
2081 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002082static void
2083unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2084 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002085{
2086 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002087 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002088 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002089
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002090 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002091 switch (kind) {
2092 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002093#ifdef Py_DEBUG
2094 if (PyUnicode_IS_ASCII(unicode)) {
2095 Py_UCS4 maxchar = ucs1lib_find_max_char(
2096 (const Py_UCS1*)str,
2097 (const Py_UCS1*)str + len);
2098 assert(maxchar < 128);
2099 }
2100#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002101 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002102 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002103 }
2104 case PyUnicode_2BYTE_KIND: {
2105 Py_UCS2 *start = (Py_UCS2 *)data + index;
2106 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002107
Victor Stinner184252a2012-06-16 02:57:41 +02002108 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002109 *ucs2 = (Py_UCS2)*str;
2110
2111 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002112 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002113 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002114 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002115 Py_UCS4 *start = (Py_UCS4 *)data + index;
2116 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002117
Victor Stinner184252a2012-06-16 02:57:41 +02002118 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002119 *ucs4 = (Py_UCS4)*str;
2120
2121 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002122 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002123 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002124 default:
2125 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002126 }
2127}
2128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002130get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002132 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002133
Victor Stinner2f9ada92020-06-24 02:22:21 +02002134 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002135 if (unicode) {
2136 Py_INCREF(unicode);
2137 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 }
Victor Stinner607b1022020-05-05 18:50:30 +02002139
2140 unicode = PyUnicode_New(1, ch);
2141 if (!unicode) {
2142 return NULL;
2143 }
2144
2145 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2146 assert(_PyUnicode_CheckConsistency(unicode, 1));
2147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002149 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002150 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151}
2152
Victor Stinner985a82a2014-01-03 12:53:47 +01002153static PyObject*
2154unicode_char(Py_UCS4 ch)
2155{
2156 PyObject *unicode;
2157
2158 assert(ch <= MAX_UNICODE);
2159
Victor Stinner2f9ada92020-06-24 02:22:21 +02002160 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002161 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002162 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002163
Victor Stinner985a82a2014-01-03 12:53:47 +01002164 unicode = PyUnicode_New(1, ch);
2165 if (unicode == NULL)
2166 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002167
2168 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2169 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002170 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002171 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002172 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2173 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2174 }
2175 assert(_PyUnicode_CheckConsistency(unicode, 1));
2176 return unicode;
2177}
2178
Alexander Belopolsky40018472011-02-26 01:02:56 +00002179PyObject *
2180PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002182 if (u == NULL)
2183 return (PyObject*)_PyUnicode_New(size);
2184
2185 if (size < 0) {
2186 PyErr_BadInternalCall();
2187 return NULL;
2188 }
2189
2190 return PyUnicode_FromWideChar(u, size);
2191}
2192
2193PyObject *
2194PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2195{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002196 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 Py_UCS4 maxchar = 0;
2198 Py_ssize_t num_surrogates;
2199
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002200 if (u == NULL && size != 0) {
2201 PyErr_BadInternalCall();
2202 return NULL;
2203 }
2204
2205 if (size == -1) {
2206 size = wcslen(u);
2207 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002208
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002209 /* If the Unicode data is known at construction time, we can apply
2210 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002213 if (size == 0)
2214 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 /* Single character Unicode objects in the Latin-1 range are
2217 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002218 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 return get_latin1_char((unsigned char)*u);
2220
2221 /* If not empty and not single character, copy the Unicode data
2222 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002223 if (find_maxchar_surrogates(u, u + size,
2224 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 return NULL;
2226
Victor Stinner8faf8212011-12-08 22:14:11 +01002227 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 if (!unicode)
2229 return NULL;
2230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 switch (PyUnicode_KIND(unicode)) {
2232 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002233 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2235 break;
2236 case PyUnicode_2BYTE_KIND:
2237#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002238 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002240 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2242#endif
2243 break;
2244 case PyUnicode_4BYTE_KIND:
2245#if SIZEOF_WCHAR_T == 2
2246 /* This is the only case which has to process surrogates, thus
2247 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002248 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249#else
2250 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002251 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252#endif
2253 break;
2254 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002255 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002258 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259}
2260
Alexander Belopolsky40018472011-02-26 01:02:56 +00002261PyObject *
2262PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002263{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002264 if (size < 0) {
2265 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002266 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002267 return NULL;
2268 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002269 if (u != NULL)
2270 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2271 else
2272 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002273}
2274
Alexander Belopolsky40018472011-02-26 01:02:56 +00002275PyObject *
2276PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002277{
2278 size_t size = strlen(u);
2279 if (size > PY_SSIZE_T_MAX) {
2280 PyErr_SetString(PyExc_OverflowError, "input too long");
2281 return NULL;
2282 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002283 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002284}
2285
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002286PyObject *
2287_PyUnicode_FromId(_Py_Identifier *id)
2288{
Victor Stinner297257f2020-06-02 14:39:45 +02002289 if (id->object) {
2290 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002291 }
Victor Stinner297257f2020-06-02 14:39:45 +02002292
2293 PyObject *obj;
2294 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2295 strlen(id->string),
2296 NULL, NULL);
2297 if (!obj) {
2298 return NULL;
2299 }
2300 PyUnicode_InternInPlace(&obj);
2301
2302 assert(!id->next);
2303 id->object = obj;
2304 id->next = static_strings;
2305 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002306 return id->object;
2307}
2308
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002309static void
2310unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002311{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002312 _Py_Identifier *tmp, *s = static_strings;
2313 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002314 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002315 tmp = s->next;
2316 s->next = NULL;
2317 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002318 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002319 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002320}
2321
Benjamin Peterson0df54292012-03-26 14:50:32 -04002322/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002323
Victor Stinnerd3f08822012-05-29 12:57:52 +02002324PyObject*
2325_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002326{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002327 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002328 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002329 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002330#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002331 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002332#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002333 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002334 }
Victor Stinner785938e2011-12-11 20:09:03 +01002335 unicode = PyUnicode_New(size, 127);
2336 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002337 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002338 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2339 assert(_PyUnicode_CheckConsistency(unicode, 1));
2340 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002341}
2342
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002343static Py_UCS4
2344kind_maxchar_limit(unsigned int kind)
2345{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002346 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002347 case PyUnicode_1BYTE_KIND:
2348 return 0x80;
2349 case PyUnicode_2BYTE_KIND:
2350 return 0x100;
2351 case PyUnicode_4BYTE_KIND:
2352 return 0x10000;
2353 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002354 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002355 }
2356}
2357
Victor Stinner702c7342011-10-05 13:50:52 +02002358static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002359_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002362 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002363
Victor Stinner2f9ada92020-06-24 02:22:21 +02002364 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002365 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002366 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002367 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002368 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002369 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002370 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002371
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002372 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002373 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 if (!res)
2375 return NULL;
2376 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002377 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002379}
2380
Victor Stinnere57b1c02011-09-28 22:20:48 +02002381static PyObject*
2382_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383{
2384 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002385 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002386
Serhiy Storchaka678db842013-01-26 12:16:36 +02002387 if (size == 0)
2388 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002389 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002390 if (size == 1)
2391 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002392
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002393 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002394 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 if (!res)
2396 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002397 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002399 else {
2400 _PyUnicode_CONVERT_BYTES(
2401 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2402 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002403 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 return res;
2405}
2406
Victor Stinnere57b1c02011-09-28 22:20:48 +02002407static PyObject*
2408_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409{
2410 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002411 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002412
Serhiy Storchaka678db842013-01-26 12:16:36 +02002413 if (size == 0)
2414 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002415 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002416 if (size == 1)
2417 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002418
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002419 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002420 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 if (!res)
2422 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002423 if (max_char < 256)
2424 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2425 PyUnicode_1BYTE_DATA(res));
2426 else if (max_char < 0x10000)
2427 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2428 PyUnicode_2BYTE_DATA(res));
2429 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002430 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002431 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 return res;
2433}
2434
2435PyObject*
2436PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2437{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002438 if (size < 0) {
2439 PyErr_SetString(PyExc_ValueError, "size must be positive");
2440 return NULL;
2441 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002442 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002444 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002446 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002448 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002449 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002450 PyErr_SetString(PyExc_SystemError, "invalid kind");
2451 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453}
2454
Victor Stinnerece58de2012-04-23 23:36:38 +02002455Py_UCS4
2456_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2457{
2458 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002459 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002460
2461 assert(PyUnicode_IS_READY(unicode));
2462 assert(0 <= start);
2463 assert(end <= PyUnicode_GET_LENGTH(unicode));
2464 assert(start <= end);
2465
2466 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2467 return PyUnicode_MAX_CHAR_VALUE(unicode);
2468
2469 if (start == end)
2470 return 127;
2471
Victor Stinner94d558b2012-04-27 22:26:58 +02002472 if (PyUnicode_IS_ASCII(unicode))
2473 return 127;
2474
Victor Stinnerece58de2012-04-23 23:36:38 +02002475 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002476 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002477 endptr = (char *)startptr + end * kind;
2478 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002479 switch(kind) {
2480 case PyUnicode_1BYTE_KIND:
2481 return ucs1lib_find_max_char(startptr, endptr);
2482 case PyUnicode_2BYTE_KIND:
2483 return ucs2lib_find_max_char(startptr, endptr);
2484 case PyUnicode_4BYTE_KIND:
2485 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002486 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002487 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002488 }
2489}
2490
Victor Stinner25a4b292011-10-06 12:31:55 +02002491/* Ensure that a string uses the most efficient storage, if it is not the
2492 case: create a new string with of the right kind. Write NULL into *p_unicode
2493 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002494static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002495unicode_adjust_maxchar(PyObject **p_unicode)
2496{
2497 PyObject *unicode, *copy;
2498 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002499 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002500 unsigned int kind;
2501
2502 assert(p_unicode != NULL);
2503 unicode = *p_unicode;
2504 assert(PyUnicode_IS_READY(unicode));
2505 if (PyUnicode_IS_ASCII(unicode))
2506 return;
2507
2508 len = PyUnicode_GET_LENGTH(unicode);
2509 kind = PyUnicode_KIND(unicode);
2510 if (kind == PyUnicode_1BYTE_KIND) {
2511 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002512 max_char = ucs1lib_find_max_char(u, u + len);
2513 if (max_char >= 128)
2514 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002515 }
2516 else if (kind == PyUnicode_2BYTE_KIND) {
2517 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002518 max_char = ucs2lib_find_max_char(u, u + len);
2519 if (max_char >= 256)
2520 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002521 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002522 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002523 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002524 max_char = ucs4lib_find_max_char(u, u + len);
2525 if (max_char >= 0x10000)
2526 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002527 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002528 else
2529 Py_UNREACHABLE();
2530
Victor Stinner25a4b292011-10-06 12:31:55 +02002531 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002532 if (copy != NULL)
2533 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002534 Py_DECREF(unicode);
2535 *p_unicode = copy;
2536}
2537
Victor Stinner034f6cf2011-09-30 02:26:44 +02002538PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002539_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002540{
Victor Stinner87af4f22011-11-21 23:03:47 +01002541 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002542 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002543
Victor Stinner034f6cf2011-09-30 02:26:44 +02002544 if (!PyUnicode_Check(unicode)) {
2545 PyErr_BadInternalCall();
2546 return NULL;
2547 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002548 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002549 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002550
Victor Stinner87af4f22011-11-21 23:03:47 +01002551 length = PyUnicode_GET_LENGTH(unicode);
2552 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002553 if (!copy)
2554 return NULL;
2555 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2556
Christian Heimesf051e432016-09-13 20:22:02 +02002557 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002558 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002559 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002560 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002561}
2562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563
Victor Stinnerbc603d12011-10-02 01:00:40 +02002564/* Widen Unicode objects to larger buffers. Don't write terminating null
2565 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002567static void*
2568unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002569{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002570 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002571
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002572 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002573 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002574 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002575 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002576 if (!result)
2577 return PyErr_NoMemory();
2578 assert(skind == PyUnicode_1BYTE_KIND);
2579 _PyUnicode_CONVERT_BYTES(
2580 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002581 (const Py_UCS1 *)data,
2582 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002583 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002585 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002586 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002587 if (!result)
2588 return PyErr_NoMemory();
2589 if (skind == PyUnicode_2BYTE_KIND) {
2590 _PyUnicode_CONVERT_BYTES(
2591 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002592 (const Py_UCS2 *)data,
2593 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002594 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002596 else {
2597 assert(skind == PyUnicode_1BYTE_KIND);
2598 _PyUnicode_CONVERT_BYTES(
2599 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002600 (const Py_UCS1 *)data,
2601 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002602 result);
2603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002605 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002606 Py_UNREACHABLE();
2607 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609}
2610
2611static Py_UCS4*
2612as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2613 int copy_null)
2614{
2615 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002616 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 Py_ssize_t len, targetlen;
2618 if (PyUnicode_READY(string) == -1)
2619 return NULL;
2620 kind = PyUnicode_KIND(string);
2621 data = PyUnicode_DATA(string);
2622 len = PyUnicode_GET_LENGTH(string);
2623 targetlen = len;
2624 if (copy_null)
2625 targetlen++;
2626 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002627 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 if (!target) {
2629 PyErr_NoMemory();
2630 return NULL;
2631 }
2632 }
2633 else {
2634 if (targetsize < targetlen) {
2635 PyErr_Format(PyExc_SystemError,
2636 "string is longer than the buffer");
2637 if (copy_null && 0 < targetsize)
2638 target[0] = 0;
2639 return NULL;
2640 }
2641 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002642 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002643 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002644 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002646 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002647 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002648 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2649 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002650 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002651 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002652 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002653 else {
2654 Py_UNREACHABLE();
2655 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 if (copy_null)
2657 target[len] = 0;
2658 return target;
2659}
2660
2661Py_UCS4*
2662PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2663 int copy_null)
2664{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002665 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002666 PyErr_BadInternalCall();
2667 return NULL;
2668 }
2669 return as_ucs4(string, target, targetsize, copy_null);
2670}
2671
2672Py_UCS4*
2673PyUnicode_AsUCS4Copy(PyObject *string)
2674{
2675 return as_ucs4(string, NULL, 0, 1);
2676}
2677
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).

   1 + (N*53-1)/22 is the integer-arithmetic form of ceil(N*53/22); the
   other 1 in the leading "2 +" is the sign.

   NOTE(review): the buffers sized with this macro are filled by sprintf(),
   which also writes a terminating NUL that is not budgeted explicitly here.
   With an 8-byte long long the macro is 21, which holds the longest %lld
   output (20 characters) plus the NUL -- confirm for other sizes. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002682
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002683static int
2684unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2685 Py_ssize_t width, Py_ssize_t precision)
2686{
2687 Py_ssize_t length, fill, arglen;
2688 Py_UCS4 maxchar;
2689
2690 if (PyUnicode_READY(str) == -1)
2691 return -1;
2692
2693 length = PyUnicode_GET_LENGTH(str);
2694 if ((precision == -1 || precision >= length)
2695 && width <= length)
2696 return _PyUnicodeWriter_WriteStr(writer, str);
2697
2698 if (precision != -1)
2699 length = Py_MIN(precision, length);
2700
2701 arglen = Py_MAX(length, width);
2702 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2703 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2704 else
2705 maxchar = writer->maxchar;
2706
2707 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2708 return -1;
2709
2710 if (width > length) {
2711 fill = width - length;
2712 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2713 return -1;
2714 writer->pos += fill;
2715 }
2716
2717 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2718 str, 0, length);
2719 writer->pos += length;
2720 return 0;
2721}
2722
2723static int
Victor Stinner998b8062018-09-12 00:23:25 +02002724unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002725 Py_ssize_t width, Py_ssize_t precision)
2726{
2727 /* UTF-8 */
2728 Py_ssize_t length;
2729 PyObject *unicode;
2730 int res;
2731
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002732 if (precision == -1) {
2733 length = strlen(str);
2734 }
2735 else {
2736 length = 0;
2737 while (length < precision && str[length]) {
2738 length++;
2739 }
2740 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002741 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2742 if (unicode == NULL)
2743 return -1;
2744
2745 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2746 Py_DECREF(unicode);
2747 return res;
2748}
2749
/* Format one '%' directive of a PyUnicode_FromFormatV() format string.

   writer: output writer the formatted text is appended to.
   f:      points at the '%' that starts the directive.
   vargs:  argument list; the arguments the directive needs are consumed
           from it with va_arg().

   Return a pointer to the first format character after the directive, or
   NULL on error.  For an unknown conversion code the rest of the format
   string (starting at the '%') is copied to the output verbatim and a
   pointer to its terminating null byte is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* remember where the directive starts (needed by the default case) and
       skip the '%' */
    p = f;
    f++;
    /* a leading '0' selects zero padding for the integer conversions */
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    /* -1 means "no width given" */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* check before multiplying so that width cannot overflow */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    /* -1 means "no precision given" */
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* same overflow guard as for the width */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    /* The flags are only recognised when directly followed by 'd', 'u' or
       'i'; otherwise f is left on the flag character itself. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* the conversion code is the last character of the format string:
       switch off over-allocation in the writer (presumably because nothing
       will be appended after this directive) */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* a single character given as its code point */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* let the C library format the number; the size flags select the
           matching va_arg() type and sprintf length modifier */
        if (*f == 'u') {
            if (longflag) {
                len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
            }
            else {
                len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
            }
        }
        else if (*f == 'x') {
            /* the size flags are never set for 'x' (see above): the
               argument is always read as an int */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag) {
                len = sprintf(buffer, "%li", va_arg(*vargs, long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
            }
            else {
                len = sprintf(buffer, "%i", va_arg(*vargs, int));
            }
        }
        assert(len >= 0);

        /* from here on, precision is the size of the zero-extended number:
           at least the length of the formatted text */
        if (precision < len)
            precision = len;

        /* reserve room for the whole field; the output is ASCII only */
        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* left padding up to the field width: '0' if the zero flag was
           given, ' ' otherwise */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* zeros up to the precision.  NOTE(review): all padding is written
           before the sprintf output, so for a negative number the fill
           precedes the '-' sign. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined: ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* no prefix at all: shift the text (including its null byte)
               right by two and insert "0x" */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* a str object, written as is */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* consumes two arguments: a str object, and a C string that is
           used instead when the object is NULL */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* the result of PyObject_Str(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* the result of PyObject_Repr(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* the result of PyObject_ASCII(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* a literal '%'; no argument is consumed */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        /* everything was consumed: return the terminating null byte */
        f = p+len;
        return f;
    }

    /* step over the conversion code */
    f++;
    return f;
}
3047
Walter Dörwaldd2034312007-05-18 16:29:38 +00003048PyObject *
3049PyUnicode_FromFormatV(const char *format, va_list vargs)
3050{
Victor Stinnere215d962012-10-06 23:03:36 +02003051 va_list vargs2;
3052 const char *f;
3053 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003054
Victor Stinner8f674cc2013-04-17 23:02:17 +02003055 _PyUnicodeWriter_Init(&writer);
3056 writer.min_length = strlen(format) + 100;
3057 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003058
Benjamin Peterson0c212142016-09-20 20:39:33 -07003059 // Copy varags to be able to pass a reference to a subfunction.
3060 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003061
3062 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003063 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003064 f = unicode_fromformat_arg(&writer, f, &vargs2);
3065 if (f == NULL)
3066 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003067 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003068 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003069 const char *p;
3070 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003071
Victor Stinnere215d962012-10-06 23:03:36 +02003072 p = f;
3073 do
3074 {
3075 if ((unsigned char)*p > 127) {
3076 PyErr_Format(PyExc_ValueError,
3077 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3078 "string, got a non-ASCII byte: 0x%02x",
3079 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003080 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003081 }
3082 p++;
3083 }
3084 while (*p != '\0' && *p != '%');
3085 len = p - f;
3086
3087 if (*p == '\0')
3088 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003089
3090 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003091 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003092
3093 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003094 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003095 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003096 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003097 return _PyUnicodeWriter_Finish(&writer);
3098
3099 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003100 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003101 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003102 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003103}
3104
Walter Dörwaldd2034312007-05-18 16:29:38 +00003105PyObject *
3106PyUnicode_FromFormat(const char *format, ...)
3107{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003108 PyObject* ret;
3109 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003110
3111#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003112 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003113#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003114 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003115#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003116 ret = PyUnicode_FromFormatV(format, vargs);
3117 va_end(vargs);
3118 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003119}
3120
Serhiy Storchakac46db922018-10-23 22:58:24 +03003121static Py_ssize_t
3122unicode_get_widechar_size(PyObject *unicode)
3123{
3124 Py_ssize_t res;
3125
3126 assert(unicode != NULL);
3127 assert(_PyUnicode_CHECK(unicode));
3128
3129 if (_PyUnicode_WSTR(unicode) != NULL) {
3130 return PyUnicode_WSTR_LENGTH(unicode);
3131 }
3132 assert(PyUnicode_IS_READY(unicode));
3133
3134 res = _PyUnicode_LENGTH(unicode);
3135#if SIZEOF_WCHAR_T == 2
3136 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3137 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3138 const Py_UCS4 *end = s + res;
3139 for (; s < end; ++s) {
3140 if (*s > 0xFFFF) {
3141 ++res;
3142 }
3143 }
3144 }
3145#endif
3146 return res;
3147}
3148
3149static void
3150unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3151{
3152 const wchar_t *wstr;
3153
3154 assert(unicode != NULL);
3155 assert(_PyUnicode_CHECK(unicode));
3156
3157 wstr = _PyUnicode_WSTR(unicode);
3158 if (wstr != NULL) {
3159 memcpy(w, wstr, size * sizeof(wchar_t));
3160 return;
3161 }
3162 assert(PyUnicode_IS_READY(unicode));
3163
3164 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3165 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3166 for (; size--; ++s, ++w) {
3167 *w = *s;
3168 }
3169 }
3170 else {
3171#if SIZEOF_WCHAR_T == 4
3172 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3173 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3174 for (; size--; ++s, ++w) {
3175 *w = *s;
3176 }
3177#else
3178 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3179 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3180 for (; size--; ++s, ++w) {
3181 Py_UCS4 ch = *s;
3182 if (ch > 0xFFFF) {
3183 assert(ch <= MAX_UNICODE);
3184 /* encode surrogate pair in this case */
3185 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3186 if (!size--)
3187 break;
3188 *w = Py_UNICODE_LOW_SURROGATE(ch);
3189 }
3190 else {
3191 *w = ch;
3192 }
3193 }
3194#endif
3195 }
3196}
3197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003198#ifdef HAVE_WCHAR_H
3199
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003200/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003201
Victor Stinnerd88d9832011-09-06 02:00:05 +02003202 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003203 character) required to convert the unicode object. Ignore size argument.
3204
Victor Stinnerd88d9832011-09-06 02:00:05 +02003205 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003206 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003207 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003208Py_ssize_t
3209PyUnicode_AsWideChar(PyObject *unicode,
3210 wchar_t *w,
3211 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003212{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003213 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003214
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003215 if (unicode == NULL) {
3216 PyErr_BadInternalCall();
3217 return -1;
3218 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003219 if (!PyUnicode_Check(unicode)) {
3220 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003221 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003222 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003223
3224 res = unicode_get_widechar_size(unicode);
3225 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003226 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003227 }
3228
3229 if (size > res) {
3230 size = res + 1;
3231 }
3232 else {
3233 res = size;
3234 }
3235 unicode_copy_as_widechar(unicode, w, size);
3236 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003237}
3238
Victor Stinner137c34c2010-09-29 10:25:54 +00003239wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003240PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003241 Py_ssize_t *size)
3242{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003243 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003244 Py_ssize_t buflen;
3245
3246 if (unicode == NULL) {
3247 PyErr_BadInternalCall();
3248 return NULL;
3249 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003250 if (!PyUnicode_Check(unicode)) {
3251 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003252 return NULL;
3253 }
3254
Serhiy Storchakac46db922018-10-23 22:58:24 +03003255 buflen = unicode_get_widechar_size(unicode);
3256 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003257 if (buffer == NULL) {
3258 PyErr_NoMemory();
3259 return NULL;
3260 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003261 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3262 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003263 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003264 }
3265 else if (wcslen(buffer) != (size_t)buflen) {
3266 PyMem_FREE(buffer);
3267 PyErr_SetString(PyExc_ValueError,
3268 "embedded null character");
3269 return NULL;
3270 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003271 return buffer;
3272}
3273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003274#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003276int
3277_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3278{
3279 wchar_t **p = (wchar_t **)ptr;
3280 if (obj == NULL) {
3281#if !USE_UNICODE_WCHAR_CACHE
3282 PyMem_Free(*p);
3283#endif /* USE_UNICODE_WCHAR_CACHE */
3284 *p = NULL;
3285 return 1;
3286 }
3287 if (PyUnicode_Check(obj)) {
3288#if USE_UNICODE_WCHAR_CACHE
3289_Py_COMP_DIAG_PUSH
3290_Py_COMP_DIAG_IGNORE_DEPR_DECLS
3291 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3292 if (*p == NULL) {
3293 return 0;
3294 }
3295 return 1;
3296_Py_COMP_DIAG_POP
3297#else /* USE_UNICODE_WCHAR_CACHE */
3298 *p = PyUnicode_AsWideCharString(obj, NULL);
3299 if (*p == NULL) {
3300 return 0;
3301 }
3302 return Py_CLEANUP_SUPPORTED;
3303#endif /* USE_UNICODE_WCHAR_CACHE */
3304 }
3305 PyErr_Format(PyExc_TypeError,
3306 "argument must be str, not %.50s",
3307 obj->ob_type->tp_name);
3308 return 0;
3309}
3310
3311int
3312_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3313{
3314 wchar_t **p = (wchar_t **)ptr;
3315 if (obj == NULL) {
3316#if !USE_UNICODE_WCHAR_CACHE
3317 PyMem_Free(*p);
3318#endif /* USE_UNICODE_WCHAR_CACHE */
3319 *p = NULL;
3320 return 1;
3321 }
3322 if (obj == Py_None) {
3323 *p = NULL;
3324 return 1;
3325 }
3326 if (PyUnicode_Check(obj)) {
3327#if USE_UNICODE_WCHAR_CACHE
3328_Py_COMP_DIAG_PUSH
3329_Py_COMP_DIAG_IGNORE_DEPR_DECLS
3330 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3331 if (*p == NULL) {
3332 return 0;
3333 }
3334 return 1;
3335_Py_COMP_DIAG_POP
3336#else /* USE_UNICODE_WCHAR_CACHE */
3337 *p = PyUnicode_AsWideCharString(obj, NULL);
3338 if (*p == NULL) {
3339 return 0;
3340 }
3341 return Py_CLEANUP_SUPPORTED;
3342#endif /* USE_UNICODE_WCHAR_CACHE */
3343 }
3344 PyErr_Format(PyExc_TypeError,
3345 "argument must be str or None, not %.50s",
3346 obj->ob_type->tp_name);
3347 return 0;
3348}
3349
Alexander Belopolsky40018472011-02-26 01:02:56 +00003350PyObject *
3351PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003352{
Victor Stinner8faf8212011-12-08 22:14:11 +01003353 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003354 PyErr_SetString(PyExc_ValueError,
3355 "chr() arg not in range(0x110000)");
3356 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003357 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003358
Victor Stinner985a82a2014-01-03 12:53:47 +01003359 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003360}
3361
Alexander Belopolsky40018472011-02-26 01:02:56 +00003362PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003363PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003365 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003366 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003367 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003368 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003369 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003370 Py_INCREF(obj);
3371 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003372 }
3373 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003374 /* For a Unicode subtype that's not a Unicode object,
3375 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003376 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003377 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003378 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003379 "Can't convert '%.100s' object to str implicitly",
3380 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003381 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003382}
3383
Alexander Belopolsky40018472011-02-26 01:02:56 +00003384PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003385PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003386 const char *encoding,
3387 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003388{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003389 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003390 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003391
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003393 PyErr_BadInternalCall();
3394 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003395 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003396
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003397 /* Decoding bytes objects is the most common case and should be fast */
3398 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003399 if (PyBytes_GET_SIZE(obj) == 0) {
3400 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3401 return NULL;
3402 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003403 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003404 }
3405 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003406 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3407 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003408 }
3409
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003410 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003411 PyErr_SetString(PyExc_TypeError,
3412 "decoding str is not supported");
3413 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003414 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003415
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003416 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3417 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3418 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003419 "decoding to str: need a bytes-like object, %.80s found",
3420 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003421 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003422 }
Tim Petersced69f82003-09-16 20:30:58 +00003423
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003424 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003425 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003426 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3427 return NULL;
3428 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003429 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003431
Serhiy Storchaka05997252013-01-26 12:14:02 +02003432 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003433 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003434 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435}
3436
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Each run of
   other characters between two copied characters becomes a single '_';
   such runs at the start or at the end of the name are dropped.

   Return 1 on success, or 0 on error (encoding is
   longer than lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    const char *in;
    char *out = lower;
    /* Last slot of the buffer, reserved for the null terminator. */
    char *const out_last = &lower[lower_len - 1];
    int pending_sep = 0;

    for (in = encoding; *in != '\0'; in++) {
        const char c = *in;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Punctuation: remember it, emit '_' lazily. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_last) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_last) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3485
Alexander Belopolsky40018472011-02-26 01:02:56 +00003486PyObject *
3487PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003488 Py_ssize_t size,
3489 const char *encoding,
3490 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003491{
3492 PyObject *buffer = NULL, *unicode;
3493 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003494 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3495
Victor Stinner22eb6892019-06-26 00:51:05 +02003496 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3497 return NULL;
3498 }
3499
Victor Stinnered076ed2019-06-26 01:49:32 +02003500 if (size == 0) {
3501 _Py_RETURN_UNICODE_EMPTY();
3502 }
3503
Victor Stinner942889a2016-09-05 15:40:10 -07003504 if (encoding == NULL) {
3505 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3506 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003507
Fred Drakee4315f52000-05-09 19:53:39 +00003508 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003509 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3510 char *lower = buflower;
3511
3512 /* Fast paths */
3513 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3514 lower += 3;
3515 if (*lower == '_') {
3516 /* Match "utf8" and "utf_8" */
3517 lower++;
3518 }
3519
3520 if (lower[0] == '8' && lower[1] == 0) {
3521 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3522 }
3523 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3524 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3525 }
3526 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3527 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3528 }
3529 }
3530 else {
3531 if (strcmp(lower, "ascii") == 0
3532 || strcmp(lower, "us_ascii") == 0) {
3533 return PyUnicode_DecodeASCII(s, size, errors);
3534 }
Steve Dowercc16be82016-09-08 10:35:16 -07003535 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003536 else if (strcmp(lower, "mbcs") == 0) {
3537 return PyUnicode_DecodeMBCS(s, size, errors);
3538 }
3539 #endif
3540 else if (strcmp(lower, "latin1") == 0
3541 || strcmp(lower, "latin_1") == 0
3542 || strcmp(lower, "iso_8859_1") == 0
3543 || strcmp(lower, "iso8859_1") == 0) {
3544 return PyUnicode_DecodeLatin1(s, size, errors);
3545 }
3546 }
Victor Stinner37296e82010-06-10 13:36:23 +00003547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548
3549 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003550 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003551 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003552 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003553 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554 if (buffer == NULL)
3555 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003556 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 if (unicode == NULL)
3558 goto onError;
3559 if (!PyUnicode_Check(unicode)) {
3560 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003561 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003562 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003563 encoding,
3564 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 Py_DECREF(unicode);
3566 goto onError;
3567 }
3568 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003569 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003570
Benjamin Peterson29060642009-01-31 22:14:21 +00003571 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572 Py_XDECREF(buffer);
3573 return NULL;
3574}
3575
Alexander Belopolsky40018472011-02-26 01:02:56 +00003576PyObject *
3577PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003578 const char *encoding,
3579 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003580{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003581 if (!PyUnicode_Check(unicode)) {
3582 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003583 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003584 }
3585
Serhiy Storchaka00939072016-10-27 21:05:49 +03003586 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3587 "PyUnicode_AsDecodedObject() is deprecated; "
3588 "use PyCodec_Decode() to decode from str", 1) < 0)
3589 return NULL;
3590
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003591 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003592 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003593
3594 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003595 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003596}
3597
Alexander Belopolsky40018472011-02-26 01:02:56 +00003598PyObject *
3599PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003600 const char *encoding,
3601 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003602{
3603 PyObject *v;
3604
3605 if (!PyUnicode_Check(unicode)) {
3606 PyErr_BadArgument();
3607 goto onError;
3608 }
3609
Serhiy Storchaka00939072016-10-27 21:05:49 +03003610 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3611 "PyUnicode_AsDecodedUnicode() is deprecated; "
3612 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3613 return NULL;
3614
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003615 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003616 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003617
3618 /* Decode via the codec registry */
3619 v = PyCodec_Decode(unicode, encoding, errors);
3620 if (v == NULL)
3621 goto onError;
3622 if (!PyUnicode_Check(v)) {
3623 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003624 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003625 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003626 encoding,
3627 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003628 Py_DECREF(v);
3629 goto onError;
3630 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003631 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003632
Benjamin Peterson29060642009-01-31 22:14:21 +00003633 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003634 return NULL;
3635}
3636
Alexander Belopolsky40018472011-02-26 01:02:56 +00003637PyObject *
3638PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003639 Py_ssize_t size,
3640 const char *encoding,
3641 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642{
3643 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003644
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003645 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3649 Py_DECREF(unicode);
3650 return v;
3651}
3652
Alexander Belopolsky40018472011-02-26 01:02:56 +00003653PyObject *
3654PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003655 const char *encoding,
3656 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003657{
3658 PyObject *v;
3659
3660 if (!PyUnicode_Check(unicode)) {
3661 PyErr_BadArgument();
3662 goto onError;
3663 }
3664
Serhiy Storchaka00939072016-10-27 21:05:49 +03003665 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3666 "PyUnicode_AsEncodedObject() is deprecated; "
3667 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3668 "or PyCodec_Encode() for generic encoding", 1) < 0)
3669 return NULL;
3670
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003671 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003673
3674 /* Encode via the codec registry */
3675 v = PyCodec_Encode(unicode, encoding, errors);
3676 if (v == NULL)
3677 goto onError;
3678 return v;
3679
Benjamin Peterson29060642009-01-31 22:14:21 +00003680 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003681 return NULL;
3682}
3683
Victor Stinner1b579672011-12-17 05:47:23 +01003684
Victor Stinner2cba6b82018-01-10 22:46:15 +01003685static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003686unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003687 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003688{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003689 Py_ssize_t wlen;
3690 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3691 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003692 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003693 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003694
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003695 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003696 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003697 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003698 return NULL;
3699 }
3700
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003701 char *str;
3702 size_t error_pos;
3703 const char *reason;
3704 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003705 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003706 PyMem_Free(wstr);
3707
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003708 if (res != 0) {
3709 if (res == -2) {
3710 PyObject *exc;
3711 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3712 "locale", unicode,
3713 (Py_ssize_t)error_pos,
3714 (Py_ssize_t)(error_pos+1),
3715 reason);
3716 if (exc != NULL) {
3717 PyCodec_StrictErrors(exc);
3718 Py_DECREF(exc);
3719 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003720 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003721 else if (res == -3) {
3722 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3723 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003724 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003725 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003726 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003727 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003728 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003729
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003730 PyObject *bytes = PyBytes_FromString(str);
3731 PyMem_RawFree(str);
3732 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003733}
3734
Victor Stinnerad158722010-10-27 00:25:46 +00003735PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003736PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3737{
Victor Stinner709d23d2019-05-02 14:56:30 -04003738 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3739 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003740}
3741
3742PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003743PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003744{
Victor Stinner81a7be32020-04-14 15:14:01 +02003745 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003746 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3747 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003748 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003749 fs_codec->error_handler,
3750 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003751 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003752#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003753 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003754 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003755 fs_codec->encoding,
3756 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003757 }
Victor Stinnerad158722010-10-27 00:25:46 +00003758#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003759 else {
3760 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3761 machinery is not ready and so cannot be used:
3762 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003763 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3764 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003765 assert(filesystem_errors != NULL);
3766 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3767 assert(errors != _Py_ERROR_UNKNOWN);
3768#ifdef _Py_FORCE_UTF8_FS_ENCODING
3769 return unicode_encode_utf8(unicode, errors, NULL);
3770#else
3771 return unicode_encode_locale(unicode, errors, 0);
3772#endif
3773 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003774}
3775
Alexander Belopolsky40018472011-02-26 01:02:56 +00003776PyObject *
3777PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003778 const char *encoding,
3779 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780{
3781 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003782 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003783
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 if (!PyUnicode_Check(unicode)) {
3785 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003786 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003787 }
Fred Drakee4315f52000-05-09 19:53:39 +00003788
Victor Stinner22eb6892019-06-26 00:51:05 +02003789 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3790 return NULL;
3791 }
3792
Victor Stinner942889a2016-09-05 15:40:10 -07003793 if (encoding == NULL) {
3794 return _PyUnicode_AsUTF8String(unicode, errors);
3795 }
3796
Fred Drakee4315f52000-05-09 19:53:39 +00003797 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003798 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3799 char *lower = buflower;
3800
3801 /* Fast paths */
3802 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3803 lower += 3;
3804 if (*lower == '_') {
3805 /* Match "utf8" and "utf_8" */
3806 lower++;
3807 }
3808
3809 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003811 }
3812 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3813 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3814 }
3815 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3816 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3817 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003818 }
Victor Stinner942889a2016-09-05 15:40:10 -07003819 else {
3820 if (strcmp(lower, "ascii") == 0
3821 || strcmp(lower, "us_ascii") == 0) {
3822 return _PyUnicode_AsASCIIString(unicode, errors);
3823 }
Steve Dowercc16be82016-09-08 10:35:16 -07003824#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003825 else if (strcmp(lower, "mbcs") == 0) {
3826 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3827 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003828#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003829 else if (strcmp(lower, "latin1") == 0 ||
3830 strcmp(lower, "latin_1") == 0 ||
3831 strcmp(lower, "iso_8859_1") == 0 ||
3832 strcmp(lower, "iso8859_1") == 0) {
3833 return _PyUnicode_AsLatin1String(unicode, errors);
3834 }
3835 }
Victor Stinner37296e82010-06-10 13:36:23 +00003836 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837
3838 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003839 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003841 return NULL;
3842
3843 /* The normal path */
3844 if (PyBytes_Check(v))
3845 return v;
3846
3847 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003848 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003849 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003850 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003851
3852 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003853 "encoder %s returned bytearray instead of bytes; "
3854 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003855 encoding);
3856 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003857 Py_DECREF(v);
3858 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003859 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003860
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003861 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3862 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003863 Py_DECREF(v);
3864 return b;
3865 }
3866
3867 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003868 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003869 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003870 encoding,
3871 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003872 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003873 return NULL;
3874}
3875
Alexander Belopolsky40018472011-02-26 01:02:56 +00003876PyObject *
3877PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003878 const char *encoding,
3879 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003880{
3881 PyObject *v;
3882
3883 if (!PyUnicode_Check(unicode)) {
3884 PyErr_BadArgument();
3885 goto onError;
3886 }
3887
Serhiy Storchaka00939072016-10-27 21:05:49 +03003888 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3889 "PyUnicode_AsEncodedUnicode() is deprecated; "
3890 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3891 return NULL;
3892
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003893 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003894 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003895
3896 /* Encode via the codec registry */
3897 v = PyCodec_Encode(unicode, encoding, errors);
3898 if (v == NULL)
3899 goto onError;
3900 if (!PyUnicode_Check(v)) {
3901 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003902 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003903 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003904 encoding,
3905 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003906 Py_DECREF(v);
3907 goto onError;
3908 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003910
Benjamin Peterson29060642009-01-31 22:14:21 +00003911 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912 return NULL;
3913}
3914
Victor Stinner2cba6b82018-01-10 22:46:15 +01003915static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003916unicode_decode_locale(const char *str, Py_ssize_t len,
3917 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003918{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003919 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3920 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003921 return NULL;
3922 }
3923
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003924 wchar_t *wstr;
3925 size_t wlen;
3926 const char *reason;
3927 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003928 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003929 if (res != 0) {
3930 if (res == -2) {
3931 PyObject *exc;
3932 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3933 "locale", str, len,
3934 (Py_ssize_t)wlen,
3935 (Py_ssize_t)(wlen + 1),
3936 reason);
3937 if (exc != NULL) {
3938 PyCodec_StrictErrors(exc);
3939 Py_DECREF(exc);
3940 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003941 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003942 else if (res == -3) {
3943 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3944 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003945 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003946 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003947 }
Victor Stinner2f197072011-12-17 07:08:30 +01003948 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003949 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003950
3951 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3952 PyMem_RawFree(wstr);
3953 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003954}
3955
3956PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003957PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3958 const char *errors)
3959{
Victor Stinner709d23d2019-05-02 14:56:30 -04003960 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3961 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003962}
3963
3964PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003965PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003966{
3967 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003968 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3969 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003970}
3971
3972
3973PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003974PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003975 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003976 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3977}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003978
Christian Heimes5894ba72007-11-04 11:43:14 +00003979PyObject*
3980PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3981{
Victor Stinner81a7be32020-04-14 15:14:01 +02003982 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003983 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3984 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003985 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003986 fs_codec->error_handler,
3987 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04003988 NULL);
3989 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003990#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003991 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003992 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003993 fs_codec->encoding,
3994 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003995 }
Victor Stinnerad158722010-10-27 00:25:46 +00003996#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003997 else {
3998 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3999 machinery is not ready and so cannot be used:
4000 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004001 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4002 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004003 assert(filesystem_errors != NULL);
4004 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4005 assert(errors != _Py_ERROR_UNKNOWN);
4006#ifdef _Py_FORCE_UTF8_FS_ENCODING
4007 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4008#else
4009 return unicode_decode_locale(s, size, errors, 0);
4010#endif
4011 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004012}
4013
Martin v. Löwis011e8422009-05-05 04:43:17 +00004014
4015int
4016PyUnicode_FSConverter(PyObject* arg, void* addr)
4017{
Brett Cannonec6ce872016-09-06 15:50:29 -07004018 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004019 PyObject *output = NULL;
4020 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004021 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004022 if (arg == NULL) {
4023 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004024 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004025 return 1;
4026 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004027 path = PyOS_FSPath(arg);
4028 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004029 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004030 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004031 if (PyBytes_Check(path)) {
4032 output = path;
4033 }
4034 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4035 output = PyUnicode_EncodeFSDefault(path);
4036 Py_DECREF(path);
4037 if (!output) {
4038 return 0;
4039 }
4040 assert(PyBytes_Check(output));
4041 }
4042
Victor Stinner0ea2a462010-04-30 00:22:08 +00004043 size = PyBytes_GET_SIZE(output);
4044 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004045 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004046 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004047 Py_DECREF(output);
4048 return 0;
4049 }
4050 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004051 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004052}
4053
4054
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004055int
4056PyUnicode_FSDecoder(PyObject* arg, void* addr)
4057{
Brett Cannona5711202016-09-06 19:36:01 -07004058 int is_buffer = 0;
4059 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004060 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004061 if (arg == NULL) {
4062 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004063 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004064 return 1;
4065 }
Brett Cannona5711202016-09-06 19:36:01 -07004066
4067 is_buffer = PyObject_CheckBuffer(arg);
4068 if (!is_buffer) {
4069 path = PyOS_FSPath(arg);
4070 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004071 return 0;
4072 }
Brett Cannona5711202016-09-06 19:36:01 -07004073 }
4074 else {
4075 path = arg;
4076 Py_INCREF(arg);
4077 }
4078
4079 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004080 output = path;
4081 }
4082 else if (PyBytes_Check(path) || is_buffer) {
4083 PyObject *path_bytes = NULL;
4084
4085 if (!PyBytes_Check(path) &&
4086 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004087 "path should be string, bytes, or os.PathLike, not %.200s",
4088 Py_TYPE(arg)->tp_name)) {
4089 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004090 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004091 }
4092 path_bytes = PyBytes_FromObject(path);
4093 Py_DECREF(path);
4094 if (!path_bytes) {
4095 return 0;
4096 }
4097 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4098 PyBytes_GET_SIZE(path_bytes));
4099 Py_DECREF(path_bytes);
4100 if (!output) {
4101 return 0;
4102 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004103 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004104 else {
4105 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004106 "path should be string, bytes, or os.PathLike, not %.200s",
4107 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004108 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004109 return 0;
4110 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004111 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004112 Py_DECREF(output);
4113 return 0;
4114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004116 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004117 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004118 Py_DECREF(output);
4119 return 0;
4120 }
4121 *(PyObject**)addr = output;
4122 return Py_CLEANUP_SUPPORTED;
4123}
4124
4125
Inada Naoki02a4d572020-02-27 13:48:59 +09004126static int unicode_fill_utf8(PyObject *unicode);
4127
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004128const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004129PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004130{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004131 if (!PyUnicode_Check(unicode)) {
4132 PyErr_BadArgument();
4133 return NULL;
4134 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004135 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004136 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004137
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004138 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004139 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004140 return NULL;
4141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004142 }
4143
4144 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004145 *psize = PyUnicode_UTF8_LENGTH(unicode);
4146 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004147}
4148
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004149const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004150PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004151{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004152 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4153}
4154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004155Py_UNICODE *
4156PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4157{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004158 if (!PyUnicode_Check(unicode)) {
4159 PyErr_BadArgument();
4160 return NULL;
4161 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004162 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4163 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004164 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004165 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004166 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167
Serhiy Storchakac46db922018-10-23 22:58:24 +03004168 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4169 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4170 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004171 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004172 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004173 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4174 if (w == NULL) {
4175 PyErr_NoMemory();
4176 return NULL;
4177 }
4178 unicode_copy_as_widechar(unicode, w, wlen + 1);
4179 _PyUnicode_WSTR(unicode) = w;
4180 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4181 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004182 }
4183 }
4184 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004185 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004186 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004187}
4188
Inada Naoki2c4928d2020-06-17 20:09:44 +09004189/* Deprecated APIs */
4190
4191_Py_COMP_DIAG_PUSH
4192_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4193
Alexander Belopolsky40018472011-02-26 01:02:56 +00004194Py_UNICODE *
4195PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004197 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198}
4199
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004200const Py_UNICODE *
4201_PyUnicode_AsUnicode(PyObject *unicode)
4202{
4203 Py_ssize_t size;
4204 const Py_UNICODE *wstr;
4205
4206 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4207 if (wstr && wcslen(wstr) != (size_t)size) {
4208 PyErr_SetString(PyExc_ValueError, "embedded null character");
4209 return NULL;
4210 }
4211 return wstr;
4212}
4213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004214
Alexander Belopolsky40018472011-02-26 01:02:56 +00004215Py_ssize_t
4216PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004217{
4218 if (!PyUnicode_Check(unicode)) {
4219 PyErr_BadArgument();
4220 goto onError;
4221 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004222 if (_PyUnicode_WSTR(unicode) == NULL) {
4223 if (PyUnicode_AsUnicode(unicode) == NULL)
4224 goto onError;
4225 }
4226 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 return -1;
4230}
4231
Inada Naoki2c4928d2020-06-17 20:09:44 +09004232_Py_COMP_DIAG_POP
4233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004234Py_ssize_t
4235PyUnicode_GetLength(PyObject *unicode)
4236{
Victor Stinner07621332012-06-16 04:53:46 +02004237 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 PyErr_BadArgument();
4239 return -1;
4240 }
Victor Stinner07621332012-06-16 04:53:46 +02004241 if (PyUnicode_READY(unicode) == -1)
4242 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004243 return PyUnicode_GET_LENGTH(unicode);
4244}
4245
4246Py_UCS4
4247PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4248{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004249 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004250 int kind;
4251
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004252 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004253 PyErr_BadArgument();
4254 return (Py_UCS4)-1;
4255 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004256 if (PyUnicode_READY(unicode) == -1) {
4257 return (Py_UCS4)-1;
4258 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004259 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004260 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004261 return (Py_UCS4)-1;
4262 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004263 data = PyUnicode_DATA(unicode);
4264 kind = PyUnicode_KIND(unicode);
4265 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004266}
4267
4268int
4269PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4270{
4271 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004272 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004273 return -1;
4274 }
Victor Stinner488fa492011-12-12 00:01:39 +01004275 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004276 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004277 PyErr_SetString(PyExc_IndexError, "string index out of range");
4278 return -1;
4279 }
Victor Stinner488fa492011-12-12 00:01:39 +01004280 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004281 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004282 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4283 PyErr_SetString(PyExc_ValueError, "character out of range");
4284 return -1;
4285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004286 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4287 index, ch);
4288 return 0;
4289}
4290
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4296
Victor Stinner554f3f02010-06-16 23:33:54 +00004297/* create or adjust a UnicodeDecodeError */
4298static void
4299make_decode_exception(PyObject **exceptionObject,
4300 const char *encoding,
4301 const char *input, Py_ssize_t length,
4302 Py_ssize_t startpos, Py_ssize_t endpos,
4303 const char *reason)
4304{
4305 if (*exceptionObject == NULL) {
4306 *exceptionObject = PyUnicodeDecodeError_Create(
4307 encoding, input, length, startpos, endpos, reason);
4308 }
4309 else {
4310 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4311 goto onError;
4312 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4313 goto onError;
4314 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4315 goto onError;
4316 }
4317 return;
4318
4319onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004320 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004321}
4322
Steve Dowercc16be82016-09-08 10:35:16 -07004323#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004324static int
4325widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4326{
4327 if (newsize > *size) {
4328 wchar_t *newbuf = *buf;
4329 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4330 PyErr_NoMemory();
4331 return -1;
4332 }
4333 *buf = newbuf;
4334 }
4335 *size = newsize;
4336 return 0;
4337}
4338
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004339/* error handling callback helper:
4340 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004341 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004342 and adjust various state variables.
4343 return 0 on success, -1 on error
4344*/
4345
Alexander Belopolsky40018472011-02-26 01:02:56 +00004346static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004347unicode_decode_call_errorhandler_wchar(
4348 const char *errors, PyObject **errorHandler,
4349 const char *encoding, const char *reason,
4350 const char **input, const char **inend, Py_ssize_t *startinpos,
4351 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004352 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004354 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355
4356 PyObject *restuple = NULL;
4357 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004358 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004359 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004360 Py_ssize_t requiredsize;
4361 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004362 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004363 wchar_t *repwstr;
4364 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004365
4366 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 *errorHandler = PyCodec_LookupError(errors);
4368 if (*errorHandler == NULL)
4369 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370 }
4371
Victor Stinner554f3f02010-06-16 23:33:54 +00004372 make_decode_exception(exceptionObject,
4373 encoding,
4374 *input, *inend - *input,
4375 *startinpos, *endinpos,
4376 reason);
4377 if (*exceptionObject == NULL)
4378 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379
Petr Viktorinffd97532020-02-11 17:46:57 +01004380 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004382 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004383 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004384 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004385 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004387 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004389
4390 /* Copy back the bytes variables, which might have been modified by the
4391 callback */
4392 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4393 if (!inputobj)
4394 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004395 *input = PyBytes_AS_STRING(inputobj);
4396 insize = PyBytes_GET_SIZE(inputobj);
4397 *inend = *input + insize;
4398 /* we can DECREF safely, as the exception has another reference,
4399 so the object won't go away. */
4400 Py_DECREF(inputobj);
4401
4402 if (newpos<0)
4403 newpos = insize+newpos;
4404 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004405 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004406 goto onError;
4407 }
4408
4409 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4410 if (repwstr == NULL)
4411 goto onError;
4412 /* need more space? (at least enough for what we
4413 have+the replacement+the rest of the string (starting
4414 at the new input position), so we won't have to check space
4415 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004416 requiredsize = *outpos;
4417 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4418 goto overflow;
4419 requiredsize += repwlen;
4420 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4421 goto overflow;
4422 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004423 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004425 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004426 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004427 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004428 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004429 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004430 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004431 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004432 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004433 *endinpos = newpos;
4434 *inptr = *input + newpos;
4435
4436 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004437 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004438 return 0;
4439
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004440 overflow:
4441 PyErr_SetString(PyExc_OverflowError,
4442 "decoded result is too long for a Python string");
4443
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004444 onError:
4445 Py_XDECREF(restuple);
4446 return -1;
4447}
Steve Dowercc16be82016-09-08 10:35:16 -07004448#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004449
4450static int
4451unicode_decode_call_errorhandler_writer(
4452 const char *errors, PyObject **errorHandler,
4453 const char *encoding, const char *reason,
4454 const char **input, const char **inend, Py_ssize_t *startinpos,
4455 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4456 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4457{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004458 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004459
4460 PyObject *restuple = NULL;
4461 PyObject *repunicode = NULL;
4462 Py_ssize_t insize;
4463 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004464 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004465 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004466 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004467 int need_to_grow = 0;
4468 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004469
4470 if (*errorHandler == NULL) {
4471 *errorHandler = PyCodec_LookupError(errors);
4472 if (*errorHandler == NULL)
4473 goto onError;
4474 }
4475
4476 make_decode_exception(exceptionObject,
4477 encoding,
4478 *input, *inend - *input,
4479 *startinpos, *endinpos,
4480 reason);
4481 if (*exceptionObject == NULL)
4482 goto onError;
4483
Petr Viktorinffd97532020-02-11 17:46:57 +01004484 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004485 if (restuple == NULL)
4486 goto onError;
4487 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004488 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004489 goto onError;
4490 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004491 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004492 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004493
4494 /* Copy back the bytes variables, which might have been modified by the
4495 callback */
4496 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4497 if (!inputobj)
4498 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004499 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004500 *input = PyBytes_AS_STRING(inputobj);
4501 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004502 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004503 /* we can DECREF safely, as the exception has another reference,
4504 so the object won't go away. */
4505 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004506
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004509 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004510 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004512 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513
Victor Stinner170ca6f2013-04-18 00:25:28 +02004514 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004515 if (replen > 1) {
4516 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004517 need_to_grow = 1;
4518 }
4519 new_inptr = *input + newpos;
4520 if (*inend - new_inptr > remain) {
4521 /* We don't know the decoding algorithm here so we make the worst
4522 assumption that one byte decodes to one unicode character.
4523 If unfortunately one byte could decode to more unicode characters,
4524 the decoder may write out-of-bound then. Is it possible for the
4525 algorithms using this function? */
4526 writer->min_length += *inend - new_inptr - remain;
4527 need_to_grow = 1;
4528 }
4529 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004530 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004531 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004532 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4533 goto onError;
4534 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004535 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004536 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004537
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004539 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004542 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004543 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004547 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548}
4549
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550/* --- UTF-7 Codec -------------------------------------------------------- */
4551
Antoine Pitrou244651a2009-05-04 18:56:13 +00004552/* See RFC2152 for details. We encode conservatively and decode liberally. */
4553
/* Three small macros implementing the base-64 alphabet.  All of them may
   evaluate their argument more than once, so pass side-effect-free
   expressions only. */

/* True if c is one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* Map a base-64 character to its 6-bit value.  The caller must have checked
   IS_BASE64(c); anything not matched by the first four tests yields 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n (higher bits are masked off). */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, found in a UTF-7 string outside a
 * base-64 section, decodes as itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
4584
4585/* The UTF-7 encoder treats ASCII characters differently according to
4586 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4587 * the above). See RFC2152. This array identifies these different
4588 * sets:
4589 * 0 : "Set D"
4590 * alphanumeric and '(),-./:?
4591 * 1 : "Set O"
4592 * !"#$%&*;<=>@[]^_`{|}
4593 * 2 : "whitespace"
4594 * ht nl cr sp
4595 * 3 : special (must be base64 encoded)
4596 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4597 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598
/* UTF-7 encoder category of each ASCII code point, indexed by character
   value: 0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be base-64
   encoded).  The table is read-only lookup data, hence const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4618
/* ENCODE_DIRECT: true if character c should be written to the UTF-7 output
 * as itself rather than base-64 encoded.  Only non-null ASCII can qualify:
 * Set D characters always do, Set O characters only when directO is true,
 * and whitespace only when directWS is true.  RFC2152 leaves the last two
 * choices to the application, which is why they are parameters.
 * c is evaluated several times; pass a side-effect-free expression. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004630
Alexander Belopolsky40018472011-02-26 01:02:56 +00004631PyObject *
4632PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004633 Py_ssize_t size,
4634 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004635{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004636 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4637}
4638
Antoine Pitrou244651a2009-05-04 18:56:13 +00004639/* The decoder. The only state we preserve is our read position,
4640 * i.e. how many characters we have consumed. So if we end in the
4641 * middle of a shift sequence we have to back off the read position
4642 * and the output to the beginning of the sequence, otherwise we lose
4643 * all the shift state (seen bits, number of bits seen, high
4644 * surrogate). */
4645
Alexander Belopolsky40018472011-02-26 01:02:56 +00004646PyObject *
4647PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004648 Py_ssize_t size,
4649 const char *errors,
4650 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004651{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004653 Py_ssize_t startinpos;
4654 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004655 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004656 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004657 const char *errmsg = "";
4658 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004659 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004660 unsigned int base64bits = 0;
4661 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004662 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663 PyObject *errorHandler = NULL;
4664 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004665
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004666 if (size == 0) {
4667 if (consumed)
4668 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004669 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004670 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004671
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004672 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004673 _PyUnicodeWriter_Init(&writer);
4674 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004675
4676 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004677 e = s + size;
4678
4679 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004680 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004682 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004683
Antoine Pitrou244651a2009-05-04 18:56:13 +00004684 if (inShift) { /* in a base-64 section */
4685 if (IS_BASE64(ch)) { /* consume a base-64 character */
4686 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4687 base64bits += 6;
4688 s++;
4689 if (base64bits >= 16) {
4690 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004691 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004692 base64bits -= 16;
4693 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004694 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004695 if (surrogate) {
4696 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004697 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4698 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004699 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004700 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004701 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004702 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004703 }
4704 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004705 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004706 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004707 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708 }
4709 }
Victor Stinner551ac952011-11-29 22:58:13 +01004710 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004711 /* first surrogate */
4712 surrogate = outCh;
4713 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004714 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004715 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004716 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004717 }
4718 }
4719 }
4720 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004721 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004722 if (base64bits > 0) { /* left-over bits */
4723 if (base64bits >= 6) {
4724 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004725 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 errmsg = "partial character in shift sequence";
4727 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004728 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004729 else {
4730 /* Some bits remain; they should be zero */
4731 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004732 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004733 errmsg = "non-zero padding bits in shift sequence";
4734 goto utf7Error;
4735 }
4736 }
4737 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004738 if (surrogate && DECODE_DIRECT(ch)) {
4739 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4740 goto onError;
4741 }
4742 surrogate = 0;
4743 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004744 /* '-' is absorbed; other terminating
4745 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004746 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004748 }
4749 }
4750 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004751 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004752 s++; /* consume '+' */
4753 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004754 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004755 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004756 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004757 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004758 else if (s < e && !IS_BASE64(*s)) {
4759 s++;
4760 errmsg = "ill-formed sequence";
4761 goto utf7Error;
4762 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004763 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004764 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004765 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004766 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004767 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004768 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004769 }
4770 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004771 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004772 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004773 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004774 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004775 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004776 else {
4777 startinpos = s-starts;
4778 s++;
4779 errmsg = "unexpected special character";
4780 goto utf7Error;
4781 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004782 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004783utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004784 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004785 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004786 errors, &errorHandler,
4787 "utf7", errmsg,
4788 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004789 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004790 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004791 }
4792
Antoine Pitrou244651a2009-05-04 18:56:13 +00004793 /* end of string */
4794
4795 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4796 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004797 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004798 if (surrogate ||
4799 (base64bits >= 6) ||
4800 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004802 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004803 errors, &errorHandler,
4804 "utf7", "unterminated shift sequence",
4805 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004806 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004807 goto onError;
4808 if (s < e)
4809 goto restart;
4810 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004811 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004812
4813 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004814 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004815 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004816 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004817 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004818 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004819 writer.kind, writer.data, shiftOutStart);
4820 Py_XDECREF(errorHandler);
4821 Py_XDECREF(exc);
4822 _PyUnicodeWriter_Dealloc(&writer);
4823 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004824 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004825 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004826 }
4827 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004828 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004829 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004830 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004831
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 Py_XDECREF(errorHandler);
4833 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004834 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004835
Benjamin Peterson29060642009-01-31 22:14:21 +00004836 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004837 Py_XDECREF(errorHandler);
4838 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004839 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004840 return NULL;
4841}
4842
4843
Alexander Belopolsky40018472011-02-26 01:02:56 +00004844PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004845_PyUnicode_EncodeUTF7(PyObject *str,
4846 int base64SetO,
4847 int base64WhiteSpace,
4848 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004849{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004850 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004851 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004852 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004853 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004854 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004855 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004856 unsigned int base64bits = 0;
4857 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004858 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004859 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004860
Benjamin Petersonbac79492012-01-14 13:34:47 -05004861 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004862 return NULL;
4863 kind = PyUnicode_KIND(str);
4864 data = PyUnicode_DATA(str);
4865 len = PyUnicode_GET_LENGTH(str);
4866
4867 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004868 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004869
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004870 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004871 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004872 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004873 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004874 if (v == NULL)
4875 return NULL;
4876
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004877 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004878 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004879 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004880
Antoine Pitrou244651a2009-05-04 18:56:13 +00004881 if (inShift) {
4882 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4883 /* shifting out */
4884 if (base64bits) { /* output remaining bits */
4885 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4886 base64buffer = 0;
4887 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004888 }
4889 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004890 /* Characters not in the BASE64 set implicitly unshift the sequence
4891 so no '-' is required, except if the character is itself a '-' */
4892 if (IS_BASE64(ch) || ch == '-') {
4893 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004894 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004895 *out++ = (char) ch;
4896 }
4897 else {
4898 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004899 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004900 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004901 else { /* not in a shift sequence */
4902 if (ch == '+') {
4903 *out++ = '+';
4904 *out++ = '-';
4905 }
4906 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4907 *out++ = (char) ch;
4908 }
4909 else {
4910 *out++ = '+';
4911 inShift = 1;
4912 goto encode_char;
4913 }
4914 }
4915 continue;
4916encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004917 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004918 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004919
Antoine Pitrou244651a2009-05-04 18:56:13 +00004920 /* code first surrogate */
4921 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004922 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004923 while (base64bits >= 6) {
4924 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4925 base64bits -= 6;
4926 }
4927 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004928 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004929 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004930 base64bits += 16;
4931 base64buffer = (base64buffer << 16) | ch;
4932 while (base64bits >= 6) {
4933 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4934 base64bits -= 6;
4935 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004936 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004937 if (base64bits)
4938 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4939 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004940 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004941 if (_PyBytes_Resize(&v, out - start) < 0)
4942 return NULL;
4943 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004944}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004945PyObject *
4946PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4947 Py_ssize_t size,
4948 int base64SetO,
4949 int base64WhiteSpace,
4950 const char *errors)
4951{
4952 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004953 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004954 if (tmp == NULL)
4955 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004956 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004957 base64WhiteSpace, errors);
4958 Py_DECREF(tmp);
4959 return result;
4960}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004961
Antoine Pitrou244651a2009-05-04 18:56:13 +00004962#undef IS_BASE64
4963#undef FROM_BASE64
4964#undef TO_BASE64
4965#undef DECODE_DIRECT
4966#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004967
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968/* --- UTF-8 Codec -------------------------------------------------------- */
4969
Alexander Belopolsky40018472011-02-26 01:02:56 +00004970PyObject *
4971PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004972 Py_ssize_t size,
4973 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974{
Walter Dörwald69652032004-09-07 20:24:22 +00004975 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4976}
4977
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004978#include "stringlib/asciilib.h"
4979#include "stringlib/codecs.h"
4980#include "stringlib/undef.h"
4981
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004982#include "stringlib/ucs1lib.h"
4983#include "stringlib/codecs.h"
4984#include "stringlib/undef.h"
4985
4986#include "stringlib/ucs2lib.h"
4987#include "stringlib/codecs.h"
4988#include "stringlib/undef.h"
4989
4990#include "stringlib/ucs4lib.h"
4991#include "stringlib/codecs.h"
4992#include "stringlib/undef.h"
4993
Antoine Pitrouab868312009-01-10 15:40:25 +00004994/* Mask to quickly check whether a C 'long' contains a
4995 non-ASCII, UTF8-encoded char. */
4996#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004997# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004998#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004999# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00005000#else
5001# error C 'long' size should be either 4 or 8!
5002#endif
5003
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005004static Py_ssize_t
5005ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005006{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005008 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005009
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005010 /*
5011 * Issue #17237: m68k is a bit different from most architectures in
5012 * that objects do not use "natural alignment" - for example, int and
5013 * long are only aligned at 2-byte boundaries. Therefore the assert()
5014 * won't work; also, tests have shown that skipping the "optimised
5015 * version" will even speed up m68k.
5016 */
5017#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005018#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005019 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
5020 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 /* Fast path, see in STRINGLIB(utf8_decode) for
5022 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005023 /* Help allocation */
5024 const char *_p = p;
5025 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005026 while (_p < aligned_end) {
5027 unsigned long value = *(const unsigned long *) _p;
5028 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005030 *((unsigned long *)q) = value;
5031 _p += SIZEOF_LONG;
5032 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005033 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005034 p = _p;
5035 while (p < end) {
5036 if ((unsigned char)*p & 0x80)
5037 break;
5038 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005040 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005042#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005043#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005044 while (p < end) {
5045 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5046 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005047 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005048 /* Help allocation */
5049 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005050 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06005051 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005052 if (value & ASCII_CHAR_MASK)
5053 break;
5054 _p += SIZEOF_LONG;
5055 }
5056 p = _p;
5057 if (_p == end)
5058 break;
5059 }
5060 if ((unsigned char)*p & 0x80)
5061 break;
5062 ++p;
5063 }
5064 memcpy(dest, start, p - start);
5065 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066}
Antoine Pitrouab868312009-01-10 15:40:25 +00005067
Victor Stinner709d23d2019-05-02 14:56:30 -04005068static PyObject *
5069unicode_decode_utf8(const char *s, Py_ssize_t size,
5070 _Py_error_handler error_handler, const char *errors,
5071 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005072{
Victor Stinner785938e2011-12-11 20:09:03 +01005073 if (size == 0) {
5074 if (consumed)
5075 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005076 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005077 }
5078
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5080 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005081 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005082 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005083 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005085 }
5086
Inada Naoki770847a2019-06-24 12:30:24 +09005087 const char *starts = s;
5088 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005089
Inada Naoki770847a2019-06-24 12:30:24 +09005090 // fast path: try ASCII string.
5091 PyObject *u = PyUnicode_New(size, 127);
5092 if (u == NULL) {
5093 return NULL;
5094 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005095 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005096 if (s == end) {
5097 return u;
5098 }
5099
5100 // Use _PyUnicodeWriter after fast path is failed.
5101 _PyUnicodeWriter writer;
5102 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5103 writer.pos = s - starts;
5104
5105 Py_ssize_t startinpos, endinpos;
5106 const char *errmsg = "";
5107 PyObject *error_handler_obj = NULL;
5108 PyObject *exc = NULL;
5109
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005110 while (s < end) {
5111 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005112 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005113
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005114 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005115 if (PyUnicode_IS_ASCII(writer.buffer))
5116 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005117 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005118 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005119 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005120 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005121 } else {
5122 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005123 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005124 }
5125
5126 switch (ch) {
5127 case 0:
5128 if (s == end || consumed)
5129 goto End;
5130 errmsg = "unexpected end of data";
5131 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005132 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005133 break;
5134 case 1:
5135 errmsg = "invalid start byte";
5136 startinpos = s - starts;
5137 endinpos = startinpos + 1;
5138 break;
5139 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005140 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5141 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5142 {
5143 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005144 goto End;
5145 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005146 /* fall through */
5147 case 3:
5148 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005149 errmsg = "invalid continuation byte";
5150 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005151 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005152 break;
5153 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005154 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005155 goto onError;
5156 continue;
5157 }
5158
Victor Stinner1d65d912015-10-05 13:43:50 +02005159 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005160 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005161
5162 switch (error_handler) {
5163 case _Py_ERROR_IGNORE:
5164 s += (endinpos - startinpos);
5165 break;
5166
5167 case _Py_ERROR_REPLACE:
5168 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5169 goto onError;
5170 s += (endinpos - startinpos);
5171 break;
5172
5173 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005174 {
5175 Py_ssize_t i;
5176
Victor Stinner1d65d912015-10-05 13:43:50 +02005177 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5178 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005179 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005180 ch = (Py_UCS4)(unsigned char)(starts[i]);
5181 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5182 ch + 0xdc00);
5183 writer.pos++;
5184 }
5185 s += (endinpos - startinpos);
5186 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005187 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005188
5189 default:
5190 if (unicode_decode_call_errorhandler_writer(
5191 errors, &error_handler_obj,
5192 "utf-8", errmsg,
5193 &starts, &end, &startinpos, &endinpos, &exc, &s,
5194 &writer))
5195 goto onError;
5196 }
Victor Stinner785938e2011-12-11 20:09:03 +01005197 }
5198
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005199End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005200 if (consumed)
5201 *consumed = s - starts;
5202
Victor Stinner1d65d912015-10-05 13:43:50 +02005203 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005204 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005205 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005206
5207onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005208 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005209 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005210 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005211 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005212}
5213
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005214
Victor Stinner709d23d2019-05-02 14:56:30 -04005215PyObject *
5216PyUnicode_DecodeUTF8Stateful(const char *s,
5217 Py_ssize_t size,
5218 const char *errors,
5219 Py_ssize_t *consumed)
5220{
5221 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5222}
5223
5224
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005225/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5226 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005227
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005228 On success, write a pointer to a newly allocated wide character string into
5229 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5230 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005231
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005232 On memory allocation failure, return -1.
5233
5234 On decoding error (if surrogateescape is zero), return -2. If wlen is
5235 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5236 is not NULL, write the decoding error message into *reason. */
5237int
5238_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005239 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005240{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005241 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005242 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005243 wchar_t *unicode;
5244 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005245
Victor Stinner3d4226a2018-08-29 22:21:32 +02005246 int surrogateescape = 0;
5247 int surrogatepass = 0;
5248 switch (errors)
5249 {
5250 case _Py_ERROR_STRICT:
5251 break;
5252 case _Py_ERROR_SURROGATEESCAPE:
5253 surrogateescape = 1;
5254 break;
5255 case _Py_ERROR_SURROGATEPASS:
5256 surrogatepass = 1;
5257 break;
5258 default:
5259 return -3;
5260 }
5261
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005262 /* Note: size will always be longer than the resulting Unicode
5263 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005264 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005265 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005266 }
5267
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005268 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005269 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005270 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005271 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005272
5273 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005274 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005275 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005276 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005277 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005278#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005279 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005280#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005281 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005282#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005283 if (ch > 0xFF) {
5284#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005285 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005286#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005287 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005288 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005289 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5290 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5291#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005292 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005293 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005294 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005295 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005296 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005297
5298 if (surrogateescape) {
5299 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5300 }
5301 else {
5302 /* Is it a valid three-byte code? */
5303 if (surrogatepass
5304 && (e - s) >= 3
5305 && (s[0] & 0xf0) == 0xe0
5306 && (s[1] & 0xc0) == 0x80
5307 && (s[2] & 0xc0) == 0x80)
5308 {
5309 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5310 s += 3;
5311 unicode[outpos++] = ch;
5312 }
5313 else {
5314 PyMem_RawFree(unicode );
5315 if (reason != NULL) {
5316 switch (ch) {
5317 case 0:
5318 *reason = "unexpected end of data";
5319 break;
5320 case 1:
5321 *reason = "invalid start byte";
5322 break;
5323 /* 2, 3, 4 */
5324 default:
5325 *reason = "invalid continuation byte";
5326 break;
5327 }
5328 }
5329 if (wlen != NULL) {
5330 *wlen = s - orig_s;
5331 }
5332 return -2;
5333 }
5334 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005335 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005336 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005337 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005338 if (wlen) {
5339 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005340 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005341 *wstr = unicode;
5342 return 0;
5343}
5344
Victor Stinner5f9cf232019-03-19 01:46:25 +01005345
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005346wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005347_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5348 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005349{
5350 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005351 int res = _Py_DecodeUTF8Ex(arg, arglen,
5352 &wstr, wlen,
5353 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005354 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005355 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5356 assert(res != -3);
5357 if (wlen) {
5358 *wlen = (size_t)res;
5359 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005360 return NULL;
5361 }
5362 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005363}
5364
Antoine Pitrouab868312009-01-10 15:40:25 +00005365
Victor Stinnere47e6982017-12-21 15:45:16 +01005366/* UTF-8 encoder using the surrogateescape error handler .
5367
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005368 On success, return 0 and write the newly allocated character string (use
5369 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005370
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005371 On encoding failure, return -2 and write the position of the invalid
5372 surrogate character into *error_pos (if error_pos is set) and the decoding
5373 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005374
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005375 On memory allocation failure, return -1. */
5376int
5377_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005378 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005379{
5380 const Py_ssize_t max_char_size = 4;
5381 Py_ssize_t len = wcslen(text);
5382
5383 assert(len >= 0);
5384
Victor Stinner3d4226a2018-08-29 22:21:32 +02005385 int surrogateescape = 0;
5386 int surrogatepass = 0;
5387 switch (errors)
5388 {
5389 case _Py_ERROR_STRICT:
5390 break;
5391 case _Py_ERROR_SURROGATEESCAPE:
5392 surrogateescape = 1;
5393 break;
5394 case _Py_ERROR_SURROGATEPASS:
5395 surrogatepass = 1;
5396 break;
5397 default:
5398 return -3;
5399 }
5400
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005401 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5402 return -1;
5403 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005404 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005405 if (raw_malloc) {
5406 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005407 }
5408 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005409 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005410 }
5411 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005412 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005413 }
5414
5415 char *p = bytes;
5416 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005417 for (i = 0; i < len; ) {
5418 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005419 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005420 i++;
5421#if Py_UNICODE_SIZE == 2
5422 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5423 && i < len
5424 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5425 {
5426 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5427 i++;
5428 }
5429#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005430
5431 if (ch < 0x80) {
5432 /* Encode ASCII */
5433 *p++ = (char) ch;
5434
5435 }
5436 else if (ch < 0x0800) {
5437 /* Encode Latin-1 */
5438 *p++ = (char)(0xc0 | (ch >> 6));
5439 *p++ = (char)(0x80 | (ch & 0x3f));
5440 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005441 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005442 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005443 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005444 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005445 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005446 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005447 if (reason != NULL) {
5448 *reason = "encoding error";
5449 }
5450 if (raw_malloc) {
5451 PyMem_RawFree(bytes);
5452 }
5453 else {
5454 PyMem_Free(bytes);
5455 }
5456 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005457 }
5458 *p++ = (char)(ch & 0xff);
5459 }
5460 else if (ch < 0x10000) {
5461 *p++ = (char)(0xe0 | (ch >> 12));
5462 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5463 *p++ = (char)(0x80 | (ch & 0x3f));
5464 }
5465 else { /* ch >= 0x10000 */
5466 assert(ch <= MAX_UNICODE);
5467 /* Encode UCS4 Unicode ordinals */
5468 *p++ = (char)(0xf0 | (ch >> 18));
5469 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5470 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5471 *p++ = (char)(0x80 | (ch & 0x3f));
5472 }
5473 }
5474 *p++ = '\0';
5475
5476 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005477 char *bytes2;
5478 if (raw_malloc) {
5479 bytes2 = PyMem_RawRealloc(bytes, final_size);
5480 }
5481 else {
5482 bytes2 = PyMem_Realloc(bytes, final_size);
5483 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005484 if (bytes2 == NULL) {
5485 if (error_pos != NULL) {
5486 *error_pos = (size_t)-1;
5487 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005488 if (raw_malloc) {
5489 PyMem_RawFree(bytes);
5490 }
5491 else {
5492 PyMem_Free(bytes);
5493 }
5494 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005495 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005496 *str = bytes2;
5497 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005498}
5499
5500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005501/* Primary internal function which creates utf8 encoded bytes objects.
5502
5503 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005504 and allocate exactly as much space needed at the end. Else allocate the
5505 maximum possible needed (4 result bytes per Unicode character), and return
5506 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005507*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005508static PyObject *
5509unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5510 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005512 if (!PyUnicode_Check(unicode)) {
5513 PyErr_BadArgument();
5514 return NULL;
5515 }
5516
5517 if (PyUnicode_READY(unicode) == -1)
5518 return NULL;
5519
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005520 if (PyUnicode_UTF8(unicode))
5521 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5522 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005523
Inada Naoki02a4d572020-02-27 13:48:59 +09005524 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005525 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005526 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5527
5528 _PyBytesWriter writer;
5529 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005530
Benjamin Petersonead6b532011-12-20 17:23:42 -06005531 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005532 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005533 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005534 case PyUnicode_1BYTE_KIND:
5535 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5536 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005537 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5538 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005539 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005540 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5541 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005542 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005543 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5544 break;
Tim Peters602f7402002-04-27 18:03:26 +00005545 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005546
5547 if (end == NULL) {
5548 _PyBytesWriter_Dealloc(&writer);
5549 return NULL;
5550 }
5551 return _PyBytesWriter_Finish(&writer, end);
5552}
5553
5554static int
5555unicode_fill_utf8(PyObject *unicode)
5556{
5557 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5558 assert(!PyUnicode_IS_ASCII(unicode));
5559
5560 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005561 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005562 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5563
5564 _PyBytesWriter writer;
5565 char *end;
5566
5567 switch (kind) {
5568 default:
5569 Py_UNREACHABLE();
5570 case PyUnicode_1BYTE_KIND:
5571 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5572 _Py_ERROR_STRICT, NULL);
5573 break;
5574 case PyUnicode_2BYTE_KIND:
5575 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5576 _Py_ERROR_STRICT, NULL);
5577 break;
5578 case PyUnicode_4BYTE_KIND:
5579 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5580 _Py_ERROR_STRICT, NULL);
5581 break;
5582 }
5583 if (end == NULL) {
5584 _PyBytesWriter_Dealloc(&writer);
5585 return -1;
5586 }
5587
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005588 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005589 PyBytes_AS_STRING(writer.buffer);
5590 Py_ssize_t len = end - start;
5591
5592 char *cache = PyObject_MALLOC(len + 1);
5593 if (cache == NULL) {
5594 _PyBytesWriter_Dealloc(&writer);
5595 PyErr_NoMemory();
5596 return -1;
5597 }
5598 _PyUnicode_UTF8(unicode) = cache;
5599 _PyUnicode_UTF8_LENGTH(unicode) = len;
5600 memcpy(cache, start, len);
5601 cache[len] = '\0';
5602 _PyBytesWriter_Dealloc(&writer);
5603 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604}
5605
Alexander Belopolsky40018472011-02-26 01:02:56 +00005606PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005607_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5608{
5609 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5610}
5611
5612
5613PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005614PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5615 Py_ssize_t size,
5616 const char *errors)
5617{
5618 PyObject *v, *unicode;
5619
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005620 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005621 if (unicode == NULL)
5622 return NULL;
5623 v = _PyUnicode_AsUTF8String(unicode, errors);
5624 Py_DECREF(unicode);
5625 return v;
5626}
5627
5628PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005629PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632}
5633
Walter Dörwald41980ca2007-08-16 21:55:45 +00005634/* --- UTF-32 Codec ------------------------------------------------------- */
5635
5636PyObject *
5637PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005638 Py_ssize_t size,
5639 const char *errors,
5640 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005641{
5642 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5643}
5644
5645PyObject *
5646PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 Py_ssize_t size,
5648 const char *errors,
5649 int *byteorder,
5650 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005651{
5652 const char *starts = s;
5653 Py_ssize_t startinpos;
5654 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005655 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005656 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005657 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005658 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005659 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005660 PyObject *errorHandler = NULL;
5661 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005662
Andy Lestere6be9b52020-02-11 20:28:35 -06005663 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005664 e = q + size;
5665
5666 if (byteorder)
5667 bo = *byteorder;
5668
5669 /* Check for BOM marks (U+FEFF) in the input and adjust current
5670 byte order setting accordingly. In native mode, the leading BOM
5671 mark is skipped, in all other modes, it is copied to the output
5672 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005673 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005674 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005675 if (bom == 0x0000FEFF) {
5676 bo = -1;
5677 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005679 else if (bom == 0xFFFE0000) {
5680 bo = 1;
5681 q += 4;
5682 }
5683 if (byteorder)
5684 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005685 }
5686
Victor Stinnere64322e2012-10-30 23:12:47 +01005687 if (q == e) {
5688 if (consumed)
5689 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005690 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005691 }
5692
Victor Stinnere64322e2012-10-30 23:12:47 +01005693#ifdef WORDS_BIGENDIAN
5694 le = bo < 0;
5695#else
5696 le = bo <= 0;
5697#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005698 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005699
Victor Stinner8f674cc2013-04-17 23:02:17 +02005700 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005701 writer.min_length = (e - q + 3) / 4;
5702 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005703 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005704
Victor Stinnere64322e2012-10-30 23:12:47 +01005705 while (1) {
5706 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005707 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005708
Victor Stinnere64322e2012-10-30 23:12:47 +01005709 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005710 enum PyUnicode_Kind kind = writer.kind;
5711 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005712 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005713 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005714 if (le) {
5715 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005716 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005717 if (ch > maxch)
5718 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005719 if (kind != PyUnicode_1BYTE_KIND &&
5720 Py_UNICODE_IS_SURROGATE(ch))
5721 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005722 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005723 q += 4;
5724 } while (q <= last);
5725 }
5726 else {
5727 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005728 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005729 if (ch > maxch)
5730 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005731 if (kind != PyUnicode_1BYTE_KIND &&
5732 Py_UNICODE_IS_SURROGATE(ch))
5733 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005734 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005735 q += 4;
5736 } while (q <= last);
5737 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005738 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005739 }
5740
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005741 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005742 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005743 startinpos = ((const char *)q) - starts;
5744 endinpos = startinpos + 4;
5745 }
5746 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005747 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005749 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005751 startinpos = ((const char *)q) - starts;
5752 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005753 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005754 else {
5755 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005756 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005757 goto onError;
5758 q += 4;
5759 continue;
5760 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005761 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005762 startinpos = ((const char *)q) - starts;
5763 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005765
5766 /* The remaining input chars are ignored if the callback
5767 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005768 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005770 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005772 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005774 }
5775
Walter Dörwald41980ca2007-08-16 21:55:45 +00005776 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005778
Walter Dörwald41980ca2007-08-16 21:55:45 +00005779 Py_XDECREF(errorHandler);
5780 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005781 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005782
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005784 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005785 Py_XDECREF(errorHandler);
5786 Py_XDECREF(exc);
5787 return NULL;
5788}
5789
5790PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005791_PyUnicode_EncodeUTF32(PyObject *str,
5792 const char *errors,
5793 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005794{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005795 enum PyUnicode_Kind kind;
5796 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005797 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005798 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005799 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005800#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005801 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005802#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005803 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005804#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005805 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005806 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005807 PyObject *errorHandler = NULL;
5808 PyObject *exc = NULL;
5809 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005810
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005811 if (!PyUnicode_Check(str)) {
5812 PyErr_BadArgument();
5813 return NULL;
5814 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005815 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005816 return NULL;
5817 kind = PyUnicode_KIND(str);
5818 data = PyUnicode_DATA(str);
5819 len = PyUnicode_GET_LENGTH(str);
5820
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005821 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005822 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005823 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005824 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005825 if (v == NULL)
5826 return NULL;
5827
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005828 /* output buffer is 4-bytes aligned */
5829 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005830 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005831 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005832 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005833 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005834 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005835
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005836 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005837 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005838 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005839 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005840 else
5841 encoding = "utf-32";
5842
5843 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005844 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5845 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005846 }
5847
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005848 pos = 0;
5849 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005850 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005851
5852 if (kind == PyUnicode_2BYTE_KIND) {
5853 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5854 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005855 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005856 else {
5857 assert(kind == PyUnicode_4BYTE_KIND);
5858 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5859 &out, native_ordering);
5860 }
5861 if (pos == len)
5862 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005863
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005864 rep = unicode_encode_call_errorhandler(
5865 errors, &errorHandler,
5866 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005867 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005868 if (!rep)
5869 goto error;
5870
5871 if (PyBytes_Check(rep)) {
5872 repsize = PyBytes_GET_SIZE(rep);
5873 if (repsize & 3) {
5874 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005875 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005876 "surrogates not allowed");
5877 goto error;
5878 }
5879 moreunits = repsize / 4;
5880 }
5881 else {
5882 assert(PyUnicode_Check(rep));
5883 if (PyUnicode_READY(rep) < 0)
5884 goto error;
5885 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5886 if (!PyUnicode_IS_ASCII(rep)) {
5887 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005888 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005889 "surrogates not allowed");
5890 goto error;
5891 }
5892 }
5893
5894 /* four bytes are reserved for each surrogate */
5895 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005896 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005897 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005898 /* integer overflow */
5899 PyErr_NoMemory();
5900 goto error;
5901 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005902 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005903 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005904 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005905 }
5906
5907 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005908 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005909 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005910 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005911 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005912 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5913 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005914 }
5915
5916 Py_CLEAR(rep);
5917 }
5918
5919 /* Cut back to size actually needed. This is necessary for, for example,
5920 encoding of a string containing isolated surrogates and the 'ignore'
5921 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005922 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005923 if (nsize != PyBytes_GET_SIZE(v))
5924 _PyBytes_Resize(&v, nsize);
5925 Py_XDECREF(errorHandler);
5926 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005927 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005928 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005929 error:
5930 Py_XDECREF(rep);
5931 Py_XDECREF(errorHandler);
5932 Py_XDECREF(exc);
5933 Py_XDECREF(v);
5934 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005935}
5936
Alexander Belopolsky40018472011-02-26 01:02:56 +00005937PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005938PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5939 Py_ssize_t size,
5940 const char *errors,
5941 int byteorder)
5942{
5943 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005944 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005945 if (tmp == NULL)
5946 return NULL;
5947 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5948 Py_DECREF(tmp);
5949 return result;
5950}
5951
5952PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005953PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005954{
Victor Stinnerb960b342011-11-20 19:12:52 +01005955 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005956}
5957
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958/* --- UTF-16 Codec ------------------------------------------------------- */
5959
Tim Peters772747b2001-08-09 22:21:55 +00005960PyObject *
5961PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 Py_ssize_t size,
5963 const char *errors,
5964 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965{
Walter Dörwald69652032004-09-07 20:24:22 +00005966 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5967}
5968
5969PyObject *
5970PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 Py_ssize_t size,
5972 const char *errors,
5973 int *byteorder,
5974 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005975{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005976 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005977 Py_ssize_t startinpos;
5978 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005979 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005980 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005981 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005982 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005983 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005984 PyObject *errorHandler = NULL;
5985 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005986 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987
Andy Lestere6be9b52020-02-11 20:28:35 -06005988 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005989 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990
5991 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005992 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005994 /* Check for BOM marks (U+FEFF) in the input and adjust current
5995 byte order setting accordingly. In native mode, the leading BOM
5996 mark is skipped, in all other modes, it is copied to the output
5997 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005998 if (bo == 0 && size >= 2) {
5999 const Py_UCS4 bom = (q[1] << 8) | q[0];
6000 if (bom == 0xFEFF) {
6001 q += 2;
6002 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006004 else if (bom == 0xFFFE) {
6005 q += 2;
6006 bo = 1;
6007 }
6008 if (byteorder)
6009 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011
Antoine Pitrou63065d72012-05-15 23:48:04 +02006012 if (q == e) {
6013 if (consumed)
6014 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006015 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006016 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006017
Christian Heimes743e0cd2012-10-17 23:52:17 +02006018#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006019 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006020 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006021#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006022 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006023 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006024#endif
Tim Peters772747b2001-08-09 22:21:55 +00006025
Antoine Pitrou63065d72012-05-15 23:48:04 +02006026 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006027 character count normally. Error handler will take care of
6028 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006029 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006030 writer.min_length = (e - q + 1) / 2;
6031 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006032 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006033
Antoine Pitrou63065d72012-05-15 23:48:04 +02006034 while (1) {
6035 Py_UCS4 ch = 0;
6036 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006037 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006038 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006039 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006040 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006041 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006042 native_ordering);
6043 else
6044 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006045 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006046 native_ordering);
6047 } else if (kind == PyUnicode_2BYTE_KIND) {
6048 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006049 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006050 native_ordering);
6051 } else {
6052 assert(kind == PyUnicode_4BYTE_KIND);
6053 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006054 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006055 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006056 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006057 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006058
Antoine Pitrou63065d72012-05-15 23:48:04 +02006059 switch (ch)
6060 {
6061 case 0:
6062 /* remaining byte at the end? (size should be even) */
6063 if (q == e || consumed)
6064 goto End;
6065 errmsg = "truncated data";
6066 startinpos = ((const char *)q) - starts;
6067 endinpos = ((const char *)e) - starts;
6068 break;
6069 /* The remaining input chars are ignored if the callback
6070 chooses to skip the input */
6071 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006072 q -= 2;
6073 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006074 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006075 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006076 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006077 endinpos = ((const char *)e) - starts;
6078 break;
6079 case 2:
6080 errmsg = "illegal encoding";
6081 startinpos = ((const char *)q) - 2 - starts;
6082 endinpos = startinpos + 2;
6083 break;
6084 case 3:
6085 errmsg = "illegal UTF-16 surrogate";
6086 startinpos = ((const char *)q) - 4 - starts;
6087 endinpos = startinpos + 2;
6088 break;
6089 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006090 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006091 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 continue;
6093 }
6094
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006095 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006096 errors,
6097 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006098 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006099 &starts,
6100 (const char **)&e,
6101 &startinpos,
6102 &endinpos,
6103 &exc,
6104 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006105 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 }
6108
Antoine Pitrou63065d72012-05-15 23:48:04 +02006109End:
Walter Dörwald69652032004-09-07 20:24:22 +00006110 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006112
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006113 Py_XDECREF(errorHandler);
6114 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006115 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006118 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119 Py_XDECREF(errorHandler);
6120 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 return NULL;
6122}
6123
Tim Peters772747b2001-08-09 22:21:55 +00006124PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006125_PyUnicode_EncodeUTF16(PyObject *str,
6126 const char *errors,
6127 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006129 enum PyUnicode_Kind kind;
6130 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006131 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006132 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006133 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006134 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006135#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006136 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006137#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006138 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006139#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006140 const char *encoding;
6141 Py_ssize_t nsize, pos;
6142 PyObject *errorHandler = NULL;
6143 PyObject *exc = NULL;
6144 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006145
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146 if (!PyUnicode_Check(str)) {
6147 PyErr_BadArgument();
6148 return NULL;
6149 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006150 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006151 return NULL;
6152 kind = PyUnicode_KIND(str);
6153 data = PyUnicode_DATA(str);
6154 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006155
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006157 if (kind == PyUnicode_4BYTE_KIND) {
6158 const Py_UCS4 *in = (const Py_UCS4 *)data;
6159 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006160 while (in < end) {
6161 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006163 }
6164 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006165 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006166 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006168 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006169 nsize = len + pairs + (byteorder == 0);
6170 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006171 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006175 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006176 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006177 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006178 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006179 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006180 }
6181 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006182 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006183 }
Tim Peters772747b2001-08-09 22:21:55 +00006184
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006185 if (kind == PyUnicode_1BYTE_KIND) {
6186 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6187 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006188 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006189
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006190 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006191 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006192 }
6193 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006194 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006195 }
6196 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006197 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006198 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006199
6200 pos = 0;
6201 while (pos < len) {
6202 Py_ssize_t repsize, moreunits;
6203
6204 if (kind == PyUnicode_2BYTE_KIND) {
6205 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6206 &out, native_ordering);
6207 }
6208 else {
6209 assert(kind == PyUnicode_4BYTE_KIND);
6210 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6211 &out, native_ordering);
6212 }
6213 if (pos == len)
6214 break;
6215
6216 rep = unicode_encode_call_errorhandler(
6217 errors, &errorHandler,
6218 encoding, "surrogates not allowed",
6219 str, &exc, pos, pos + 1, &pos);
6220 if (!rep)
6221 goto error;
6222
6223 if (PyBytes_Check(rep)) {
6224 repsize = PyBytes_GET_SIZE(rep);
6225 if (repsize & 1) {
6226 raise_encode_exception(&exc, encoding,
6227 str, pos - 1, pos,
6228 "surrogates not allowed");
6229 goto error;
6230 }
6231 moreunits = repsize / 2;
6232 }
6233 else {
6234 assert(PyUnicode_Check(rep));
6235 if (PyUnicode_READY(rep) < 0)
6236 goto error;
6237 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6238 if (!PyUnicode_IS_ASCII(rep)) {
6239 raise_encode_exception(&exc, encoding,
6240 str, pos - 1, pos,
6241 "surrogates not allowed");
6242 goto error;
6243 }
6244 }
6245
6246 /* two bytes are reserved for each surrogate */
6247 if (moreunits > 1) {
6248 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006249 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006250 /* integer overflow */
6251 PyErr_NoMemory();
6252 goto error;
6253 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006254 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006255 goto error;
6256 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6257 }
6258
6259 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006260 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006261 out += moreunits;
6262 } else /* rep is unicode */ {
6263 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6264 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6265 &out, native_ordering);
6266 }
6267
6268 Py_CLEAR(rep);
6269 }
6270
6271 /* Cut back to size actually needed. This is necessary for, for example,
6272 encoding of a string containing isolated surrogates and the 'ignore' handler
6273 is used. */
6274 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6275 if (nsize != PyBytes_GET_SIZE(v))
6276 _PyBytes_Resize(&v, nsize);
6277 Py_XDECREF(errorHandler);
6278 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006279 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006280 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006281 error:
6282 Py_XDECREF(rep);
6283 Py_XDECREF(errorHandler);
6284 Py_XDECREF(exc);
6285 Py_XDECREF(v);
6286 return NULL;
6287#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288}
6289
Alexander Belopolsky40018472011-02-26 01:02:56 +00006290PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006291PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6292 Py_ssize_t size,
6293 const char *errors,
6294 int byteorder)
6295{
6296 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006297 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006298 if (tmp == NULL)
6299 return NULL;
6300 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6301 Py_DECREF(tmp);
6302 return result;
6303}
6304
6305PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006306PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006308 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309}
6310
6311/* --- Unicode Escape Codec ----------------------------------------------- */
6312
Fredrik Lundh06d12682001-01-24 07:59:11 +00006313static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006314
Alexander Belopolsky40018472011-02-26 01:02:56 +00006315PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006316_PyUnicode_DecodeUnicodeEscape(const char *s,
6317 Py_ssize_t size,
6318 const char *errors,
6319 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006321 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006322 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006324 PyObject *errorHandler = NULL;
6325 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006326
Eric V. Smith42454af2016-10-31 09:22:08 -04006327 // so we can remember if we've seen an invalid escape char or not
6328 *first_invalid_escape = NULL;
6329
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006331 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 }
6333 /* Escaped strings will always be longer than the resulting
6334 Unicode string, so we start with size here and then reduce the
6335 length after conversion to the true value.
6336 (but if the error callback returns a long replacement string
6337 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006338 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006339 writer.min_length = size;
6340 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6341 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006342 }
6343
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 end = s + size;
6345 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006346 unsigned char c = (unsigned char) *s++;
6347 Py_UCS4 ch;
6348 int count;
6349 Py_ssize_t startinpos;
6350 Py_ssize_t endinpos;
6351 const char *message;
6352
6353#define WRITE_ASCII_CHAR(ch) \
6354 do { \
6355 assert(ch <= 127); \
6356 assert(writer.pos < writer.size); \
6357 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6358 } while(0)
6359
6360#define WRITE_CHAR(ch) \
6361 do { \
6362 if (ch <= writer.maxchar) { \
6363 assert(writer.pos < writer.size); \
6364 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6365 } \
6366 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6367 goto onError; \
6368 } \
6369 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370
6371 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006372 if (c != '\\') {
6373 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 continue;
6375 }
6376
Victor Stinner62ec3312016-09-06 17:04:34 -07006377 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006379 if (s >= end) {
6380 message = "\\ at end of string";
6381 goto error;
6382 }
6383 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006384
Victor Stinner62ec3312016-09-06 17:04:34 -07006385 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006386 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006389 case '\n': continue;
6390 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6391 case '\'': WRITE_ASCII_CHAR('\''); continue;
6392 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6393 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006394 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006395 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6396 case 't': WRITE_ASCII_CHAR('\t'); continue;
6397 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6398 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006399 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006400 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006401 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006402 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 case '0': case '1': case '2': case '3':
6406 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006407 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006408 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006409 ch = (ch<<3) + *s++ - '0';
6410 if (s < end && '0' <= *s && *s <= '7') {
6411 ch = (ch<<3) + *s++ - '0';
6412 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006414 WRITE_CHAR(ch);
6415 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 /* hex escapes */
6418 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006420 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006421 message = "truncated \\xXX escape";
6422 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006427 message = "truncated \\uXXXX escape";
6428 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006431 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006432 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006433 message = "truncated \\UXXXXXXXX escape";
6434 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006435 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006436 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006437 ch <<= 4;
6438 if (c >= '0' && c <= '9') {
6439 ch += c - '0';
6440 }
6441 else if (c >= 'a' && c <= 'f') {
6442 ch += c - ('a' - 10);
6443 }
6444 else if (c >= 'A' && c <= 'F') {
6445 ch += c - ('A' - 10);
6446 }
6447 else {
6448 break;
6449 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006450 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006451 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006452 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006453 }
6454
6455 /* when we get here, ch is a 32-bit unicode character */
6456 if (ch > MAX_UNICODE) {
6457 message = "illegal Unicode character";
6458 goto error;
6459 }
6460
6461 WRITE_CHAR(ch);
6462 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006463
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006465 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006466 if (ucnhash_CAPI == NULL) {
6467 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006468 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6469 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006470 if (ucnhash_CAPI == NULL) {
6471 PyErr_SetString(
6472 PyExc_UnicodeError,
6473 "\\N escapes not supported (can't load unicodedata module)"
6474 );
6475 goto onError;
6476 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006477 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006478
6479 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006480 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006481 const char *start = ++s;
6482 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006483 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006484 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006485 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006486 namelen = s - start;
6487 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006488 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006489 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 ch = 0xffffffff; /* in case 'getcode' messes up */
6491 if (namelen <= INT_MAX &&
6492 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6493 &ch, 0)) {
6494 assert(ch <= MAX_UNICODE);
6495 WRITE_CHAR(ch);
6496 continue;
6497 }
6498 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006499 }
6500 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006501 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006502
6503 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006504 if (*first_invalid_escape == NULL) {
6505 *first_invalid_escape = s-1; /* Back up one char, since we've
6506 already incremented s. */
6507 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006508 WRITE_ASCII_CHAR('\\');
6509 WRITE_CHAR(c);
6510 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006512
6513 error:
6514 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006515 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006516 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006517 errors, &errorHandler,
6518 "unicodeescape", message,
6519 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006520 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006521 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006522 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006523 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006524
6525#undef WRITE_ASCII_CHAR
6526#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006528
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006529 Py_XDECREF(errorHandler);
6530 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006531 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006532
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006534 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006535 Py_XDECREF(errorHandler);
6536 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 return NULL;
6538}
6539
Eric V. Smith42454af2016-10-31 09:22:08 -04006540PyObject *
6541PyUnicode_DecodeUnicodeEscape(const char *s,
6542 Py_ssize_t size,
6543 const char *errors)
6544{
6545 const char *first_invalid_escape;
6546 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6547 &first_invalid_escape);
6548 if (result == NULL)
6549 return NULL;
6550 if (first_invalid_escape != NULL) {
6551 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6552 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006553 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006554 Py_DECREF(result);
6555 return NULL;
6556 }
6557 }
6558 return result;
6559}
6560
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006561/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562
Alexander Belopolsky40018472011-02-26 01:02:56 +00006563PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006564PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006566 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006567 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006569 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006570 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006571 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
Ezio Melottie7f90372012-10-05 03:33:31 +03006573 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006574 escape.
6575
Ezio Melottie7f90372012-10-05 03:33:31 +03006576 For UCS1 strings it's '\xxx', 4 bytes per source character.
6577 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6578 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006579 */
6580
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006581 if (!PyUnicode_Check(unicode)) {
6582 PyErr_BadArgument();
6583 return NULL;
6584 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006585 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006586 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006587 }
Victor Stinner358af132015-10-12 22:36:57 +02006588
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006589 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006590 if (len == 0) {
6591 return PyBytes_FromStringAndSize(NULL, 0);
6592 }
6593
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006594 kind = PyUnicode_KIND(unicode);
6595 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006596 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6597 bytes, and 1 byte characters 4. */
6598 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006599 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006600 return PyErr_NoMemory();
6601 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006602 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006603 if (repr == NULL) {
6604 return NULL;
6605 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006606
Victor Stinner62ec3312016-09-06 17:04:34 -07006607 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006608 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006609 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006610
Victor Stinner62ec3312016-09-06 17:04:34 -07006611 /* U+0000-U+00ff range */
6612 if (ch < 0x100) {
6613 if (ch >= ' ' && ch < 127) {
6614 if (ch != '\\') {
6615 /* Copy printable US ASCII as-is */
6616 *p++ = (char) ch;
6617 }
6618 /* Escape backslashes */
6619 else {
6620 *p++ = '\\';
6621 *p++ = '\\';
6622 }
6623 }
Victor Stinner358af132015-10-12 22:36:57 +02006624
Victor Stinner62ec3312016-09-06 17:04:34 -07006625 /* Map special whitespace to '\t', \n', '\r' */
6626 else if (ch == '\t') {
6627 *p++ = '\\';
6628 *p++ = 't';
6629 }
6630 else if (ch == '\n') {
6631 *p++ = '\\';
6632 *p++ = 'n';
6633 }
6634 else if (ch == '\r') {
6635 *p++ = '\\';
6636 *p++ = 'r';
6637 }
6638
6639 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6640 else {
6641 *p++ = '\\';
6642 *p++ = 'x';
6643 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6644 *p++ = Py_hexdigits[ch & 0x000F];
6645 }
Tim Petersced69f82003-09-16 20:30:58 +00006646 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006647 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006648 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 *p++ = '\\';
6650 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006651 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6652 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6653 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6654 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006656 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6657 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006658
Victor Stinner62ec3312016-09-06 17:04:34 -07006659 /* Make sure that the first two digits are zero */
6660 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006661 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006662 *p++ = 'U';
6663 *p++ = '0';
6664 *p++ = '0';
6665 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6666 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6667 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6668 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6669 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6670 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006671 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673
Victor Stinner62ec3312016-09-06 17:04:34 -07006674 assert(p - PyBytes_AS_STRING(repr) > 0);
6675 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6676 return NULL;
6677 }
6678 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679}
6680
Alexander Belopolsky40018472011-02-26 01:02:56 +00006681PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006682PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6683 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006685 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006686 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006687 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006689 }
6690
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006691 result = PyUnicode_AsUnicodeEscapeString(tmp);
6692 Py_DECREF(tmp);
6693 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694}
6695
6696/* --- Raw Unicode Escape Codec ------------------------------------------- */
6697
Alexander Belopolsky40018472011-02-26 01:02:56 +00006698PyObject *
6699PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006700 Py_ssize_t size,
6701 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006704 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006706 PyObject *errorHandler = NULL;
6707 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006708
Victor Stinner62ec3312016-09-06 17:04:34 -07006709 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006710 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006711 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006712
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 /* Escaped strings will always be longer than the resulting
6714 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006715 length after conversion to the true value. (But decoding error
6716 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006717 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006718 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006719 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6720 goto onError;
6721 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006722
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 end = s + size;
6724 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006725 unsigned char c = (unsigned char) *s++;
6726 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006727 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006728 Py_ssize_t startinpos;
6729 Py_ssize_t endinpos;
6730 const char *message;
6731
6732#define WRITE_CHAR(ch) \
6733 do { \
6734 if (ch <= writer.maxchar) { \
6735 assert(writer.pos < writer.size); \
6736 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6737 } \
6738 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6739 goto onError; \
6740 } \
6741 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006744 if (c != '\\' || s >= end) {
6745 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006747 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006748
Victor Stinner62ec3312016-09-06 17:04:34 -07006749 c = (unsigned char) *s++;
6750 if (c == 'u') {
6751 count = 4;
6752 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006754 else if (c == 'U') {
6755 count = 8;
6756 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006757 }
6758 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006759 assert(writer.pos < writer.size);
6760 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6761 WRITE_CHAR(c);
6762 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006763 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006764 startinpos = s - starts - 2;
6765
6766 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6767 for (ch = 0; count && s < end; ++s, --count) {
6768 c = (unsigned char)*s;
6769 ch <<= 4;
6770 if (c >= '0' && c <= '9') {
6771 ch += c - '0';
6772 }
6773 else if (c >= 'a' && c <= 'f') {
6774 ch += c - ('a' - 10);
6775 }
6776 else if (c >= 'A' && c <= 'F') {
6777 ch += c - ('A' - 10);
6778 }
6779 else {
6780 break;
6781 }
6782 }
6783 if (!count) {
6784 if (ch <= MAX_UNICODE) {
6785 WRITE_CHAR(ch);
6786 continue;
6787 }
6788 message = "\\Uxxxxxxxx out of range";
6789 }
6790
6791 endinpos = s-starts;
6792 writer.min_length = end - s + writer.pos;
6793 if (unicode_decode_call_errorhandler_writer(
6794 errors, &errorHandler,
6795 "rawunicodeescape", message,
6796 &starts, &end, &startinpos, &endinpos, &exc, &s,
6797 &writer)) {
6798 goto onError;
6799 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006800 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006801
6802#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006804 Py_XDECREF(errorHandler);
6805 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006806 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006807
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006809 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006810 Py_XDECREF(errorHandler);
6811 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006813
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814}
6815
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006816
Alexander Belopolsky40018472011-02-26 01:02:56 +00006817PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006818PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819{
Victor Stinner62ec3312016-09-06 17:04:34 -07006820 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006822 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006823 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006824 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006825 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006827 if (!PyUnicode_Check(unicode)) {
6828 PyErr_BadArgument();
6829 return NULL;
6830 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006831 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006832 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006833 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006834 kind = PyUnicode_KIND(unicode);
6835 data = PyUnicode_DATA(unicode);
6836 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006837 if (kind == PyUnicode_1BYTE_KIND) {
6838 return PyBytes_FromStringAndSize(data, len);
6839 }
Victor Stinner0e368262011-11-10 20:12:49 +01006840
Victor Stinner62ec3312016-09-06 17:04:34 -07006841 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6842 bytes, and 1 byte characters 4. */
6843 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006844
Victor Stinner62ec3312016-09-06 17:04:34 -07006845 if (len > PY_SSIZE_T_MAX / expandsize) {
6846 return PyErr_NoMemory();
6847 }
6848 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6849 if (repr == NULL) {
6850 return NULL;
6851 }
6852 if (len == 0) {
6853 return repr;
6854 }
6855
6856 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006857 for (pos = 0; pos < len; pos++) {
6858 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006859
Victor Stinner62ec3312016-09-06 17:04:34 -07006860 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6861 if (ch < 0x100) {
6862 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006863 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006864 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006865 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 *p++ = '\\';
6867 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006868 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6869 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6870 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6871 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006873 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6874 else {
6875 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6876 *p++ = '\\';
6877 *p++ = 'U';
6878 *p++ = '0';
6879 *p++ = '0';
6880 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6881 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6882 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6883 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6884 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6885 *p++ = Py_hexdigits[ch & 15];
6886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006888
Victor Stinner62ec3312016-09-06 17:04:34 -07006889 assert(p > PyBytes_AS_STRING(repr));
6890 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6891 return NULL;
6892 }
6893 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894}
6895
Alexander Belopolsky40018472011-02-26 01:02:56 +00006896PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006897PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6898 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006900 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006901 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006902 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006903 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006904 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6905 Py_DECREF(tmp);
6906 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907}
6908
6909/* --- Latin-1 Codec ------------------------------------------------------ */
6910
Alexander Belopolsky40018472011-02-26 01:02:56 +00006911PyObject *
6912PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006913 Py_ssize_t size,
6914 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006917 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918}
6919
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006920/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006921static void
6922make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006923 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006924 PyObject *unicode,
6925 Py_ssize_t startpos, Py_ssize_t endpos,
6926 const char *reason)
6927{
6928 if (*exceptionObject == NULL) {
6929 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006930 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006931 encoding, unicode, startpos, endpos, reason);
6932 }
6933 else {
6934 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6935 goto onError;
6936 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6937 goto onError;
6938 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6939 goto onError;
6940 return;
6941 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006942 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006943 }
6944}
6945
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006947static void
6948raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006949 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006950 PyObject *unicode,
6951 Py_ssize_t startpos, Py_ssize_t endpos,
6952 const char *reason)
6953{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006954 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006955 encoding, unicode, startpos, endpos, reason);
6956 if (*exceptionObject != NULL)
6957 PyCodec_StrictErrors(*exceptionObject);
6958}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006959
6960/* error handling callback helper:
6961 build arguments, call the callback and check the arguments,
6962 put the result into newpos and return the replacement string, which
6963 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006964static PyObject *
6965unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006966 PyObject **errorHandler,
6967 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006968 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006969 Py_ssize_t startpos, Py_ssize_t endpos,
6970 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006971{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006972 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006973 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006974 PyObject *restuple;
6975 PyObject *resunicode;
6976
6977 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006979 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006980 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006981 }
6982
Benjamin Petersonbac79492012-01-14 13:34:47 -05006983 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006984 return NULL;
6985 len = PyUnicode_GET_LENGTH(unicode);
6986
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006987 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006988 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006989 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006990 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006991
Petr Viktorinffd97532020-02-11 17:46:57 +01006992 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006993 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006995 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006996 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 Py_DECREF(restuple);
6998 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006999 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007000 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007001 &resunicode, newpos)) {
7002 Py_DECREF(restuple);
7003 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007004 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007005 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7006 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7007 Py_DECREF(restuple);
7008 return NULL;
7009 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007010 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007011 *newpos = len + *newpos;
7012 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007013 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 Py_DECREF(restuple);
7015 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007016 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007017 Py_INCREF(resunicode);
7018 Py_DECREF(restuple);
7019 return resunicode;
7020}
7021
Alexander Belopolsky40018472011-02-26 01:02:56 +00007022static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007023unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007024 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007025 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007026{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007027 /* input state */
7028 Py_ssize_t pos=0, size;
7029 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007030 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007031 /* pointer into the output */
7032 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007033 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7034 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007035 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007036 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007037 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007038 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007039 /* output object */
7040 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007041
Benjamin Petersonbac79492012-01-14 13:34:47 -05007042 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007043 return NULL;
7044 size = PyUnicode_GET_LENGTH(unicode);
7045 kind = PyUnicode_KIND(unicode);
7046 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007047 /* allocate enough for a simple encoding without
7048 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007049 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007050 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007051
7052 _PyBytesWriter_Init(&writer);
7053 str = _PyBytesWriter_Alloc(&writer, size);
7054 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007055 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007056
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007057 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007058 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007059
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007061 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007063 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007064 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007065 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007067 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007069 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007070 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007071 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007072
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007073 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007074 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007075
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007076 /* Only overallocate the buffer if it's not the last write */
7077 writer.overallocate = (collend < size);
7078
Benjamin Peterson29060642009-01-31 22:14:21 +00007079 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007080 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007081 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007082
7083 switch (error_handler) {
7084 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007085 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007087
7088 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007089 memset(str, '?', collend - collstart);
7090 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007091 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007092 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007093 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 break;
Victor Stinner50149202015-09-22 00:26:54 +02007095
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007096 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007097 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007098 writer.min_size -= (collend - collstart);
7099 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007100 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007101 if (str == NULL)
7102 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007103 pos = collend;
7104 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007105
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007106 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007107 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007108 writer.min_size -= (collend - collstart);
7109 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007110 unicode, collstart, collend);
7111 if (str == NULL)
7112 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007113 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 break;
Victor Stinner50149202015-09-22 00:26:54 +02007115
Victor Stinnerc3713e92015-09-29 12:32:13 +02007116 case _Py_ERROR_SURROGATEESCAPE:
7117 for (i = collstart; i < collend; ++i) {
7118 ch = PyUnicode_READ(kind, data, i);
7119 if (ch < 0xdc80 || 0xdcff < ch) {
7120 /* Not a UTF-8b surrogate */
7121 break;
7122 }
7123 *str++ = (char)(ch - 0xdc00);
7124 ++pos;
7125 }
7126 if (i >= collend)
7127 break;
7128 collstart = pos;
7129 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007130 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007131
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007133 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7134 encoding, reason, unicode, &exc,
7135 collstart, collend, &newpos);
7136 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007138
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007139 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007140 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007141
Victor Stinner6bd525b2015-10-09 13:10:05 +02007142 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007143 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007144 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007145 PyBytes_AS_STRING(rep),
7146 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007147 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007148 else {
7149 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007150
Victor Stinner6bd525b2015-10-09 13:10:05 +02007151 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007152 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007153
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007154 if (limit == 256 ?
7155 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7156 !PyUnicode_IS_ASCII(rep))
7157 {
7158 /* Not all characters are smaller than limit */
7159 raise_encode_exception(&exc, encoding, unicode,
7160 collstart, collend, reason);
7161 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007163 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7164 str = _PyBytesWriter_WriteBytes(&writer, str,
7165 PyUnicode_DATA(rep),
7166 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007167 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007168 if (str == NULL)
7169 goto onError;
7170
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007171 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007172 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007173 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007174
7175 /* If overallocation was disabled, ensure that it was the last
7176 write. Otherwise, we missed an optimization */
7177 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007178 }
7179 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007180
Victor Stinner50149202015-09-22 00:26:54 +02007181 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007182 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007183 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007184
7185 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007186 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007187 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007188 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007189 Py_XDECREF(exc);
7190 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007191}
7192
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007193/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007194PyObject *
7195PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007196 Py_ssize_t size,
7197 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007199 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007200 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007201 if (unicode == NULL)
7202 return NULL;
7203 result = unicode_encode_ucs1(unicode, errors, 256);
7204 Py_DECREF(unicode);
7205 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206}
7207
Alexander Belopolsky40018472011-02-26 01:02:56 +00007208PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007209_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210{
7211 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 PyErr_BadArgument();
7213 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007215 if (PyUnicode_READY(unicode) == -1)
7216 return NULL;
7217 /* Fast path: if it is a one-byte string, construct
7218 bytes object directly. */
7219 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7220 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7221 PyUnicode_GET_LENGTH(unicode));
7222 /* Non-Latin-1 characters present. Defer to above function to
7223 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007224 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007225}
7226
7227PyObject*
7228PyUnicode_AsLatin1String(PyObject *unicode)
7229{
7230 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231}
7232
7233/* --- 7-bit ASCII Codec -------------------------------------------------- */
7234
Alexander Belopolsky40018472011-02-26 01:02:56 +00007235PyObject *
7236PyUnicode_DecodeASCII(const char *s,
7237 Py_ssize_t size,
7238 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007240 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007241 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007242 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007243 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007244 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007245
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007247 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007248
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007250 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007251 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007253
Inada Naoki770847a2019-06-24 12:30:24 +09007254 // Shortcut for simple case
7255 PyObject *u = PyUnicode_New(size, 127);
7256 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007257 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007258 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007259 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007260 if (outpos == size) {
7261 return u;
7262 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007263
Inada Naoki770847a2019-06-24 12:30:24 +09007264 _PyUnicodeWriter writer;
7265 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007266 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007267
Inada Naoki770847a2019-06-24 12:30:24 +09007268 s += outpos;
7269 int kind = writer.kind;
7270 void *data = writer.data;
7271 Py_ssize_t startinpos, endinpos;
7272
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007273 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007274 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007275 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007276 PyUnicode_WRITE(kind, data, writer.pos, c);
7277 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007278 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007279 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007280 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007281
7282 /* byte outsize range 0x00..0x7f: call the error handler */
7283
7284 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007285 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007286
7287 switch (error_handler)
7288 {
7289 case _Py_ERROR_REPLACE:
7290 case _Py_ERROR_SURROGATEESCAPE:
7291 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007292 but we may switch to UCS2 at the first write */
7293 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7294 goto onError;
7295 kind = writer.kind;
7296 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007297
7298 if (error_handler == _Py_ERROR_REPLACE)
7299 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7300 else
7301 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7302 writer.pos++;
7303 ++s;
7304 break;
7305
7306 case _Py_ERROR_IGNORE:
7307 ++s;
7308 break;
7309
7310 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 startinpos = s-starts;
7312 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007313 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007314 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 "ascii", "ordinal not in range(128)",
7316 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007317 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007319 kind = writer.kind;
7320 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007323 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007324 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007325 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007326
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007328 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007329 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007330 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331 return NULL;
7332}
7333
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007334/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007335PyObject *
7336PyUnicode_EncodeASCII(const Py_UNICODE *p,
7337 Py_ssize_t size,
7338 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007340 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007341 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007342 if (unicode == NULL)
7343 return NULL;
7344 result = unicode_encode_ucs1(unicode, errors, 128);
7345 Py_DECREF(unicode);
7346 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347}
7348
Alexander Belopolsky40018472011-02-26 01:02:56 +00007349PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007350_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351{
7352 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007353 PyErr_BadArgument();
7354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007356 if (PyUnicode_READY(unicode) == -1)
7357 return NULL;
7358 /* Fast path: if it is an ASCII-only string, construct bytes object
7359 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007360 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007361 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7362 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007363 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007364}
7365
7366PyObject *
7367PyUnicode_AsASCIIString(PyObject *unicode)
7368{
7369 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370}
7371
Steve Dowercc16be82016-09-08 10:35:16 -07007372#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007373
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007374/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007375
/* NEED_RETRY is defined when size_t is wider than int. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
#define DECODING_CHUNK_SIZE (INT_MAX / 4)

/* Fallback definition for headers that do not provide the flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7389
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007390static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007391code_page_name(UINT code_page, PyObject **obj)
7392{
7393 *obj = NULL;
7394 if (code_page == CP_ACP)
7395 return "mbcs";
7396 if (code_page == CP_UTF7)
7397 return "CP_UTF7";
7398 if (code_page == CP_UTF8)
7399 return "CP_UTF8";
7400
7401 *obj = PyBytes_FromFormat("cp%u", code_page);
7402 if (*obj == NULL)
7403 return NULL;
7404 return PyBytes_AS_STRING(*obj);
7405}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007406
Victor Stinner3a50e702011-10-18 21:21:00 +02007407static DWORD
7408decode_code_page_flags(UINT code_page)
7409{
7410 if (code_page == CP_UTF7) {
7411 /* The CP_UTF7 decoder only supports flags=0 */
7412 return 0;
7413 }
7414 else
7415 return MB_ERR_INVALID_CHARS;
7416}
7417
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007418/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 * Decode a byte string from a Windows code page into unicode object in strict
7420 * mode.
7421 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007422 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7423 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007424 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007425static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007426decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007427 wchar_t **buf,
7428 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 const char *in,
7430 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007431{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007432 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007433 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007435
7436 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007438 while ((outsize = MultiByteToWideChar(code_page, flags,
7439 in, insize, NULL, 0)) <= 0)
7440 {
7441 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7442 goto error;
7443 }
7444 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7445 flags = 0;
7446 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007447
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007448 /* Extend a wchar_t* buffer */
7449 Py_ssize_t n = *bufsize; /* Get the current length */
7450 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7451 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007452 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007453 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007454
7455 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7457 if (outsize <= 0)
7458 goto error;
7459 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007460
Victor Stinner3a50e702011-10-18 21:21:00 +02007461error:
7462 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7463 return -2;
7464 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007465 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007466}
7467
Victor Stinner3a50e702011-10-18 21:21:00 +02007468/*
7469 * Decode a byte string from a code page into unicode object with an error
7470 * handler.
7471 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007472 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 * UnicodeDecodeError exception and returns -1 on error.
7474 */
7475static int
7476decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007477 wchar_t **buf,
7478 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007479 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007480 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007481{
7482 const char *startin = in;
7483 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007484 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 /* Ideally, we should get reason from FormatMessage. This is the Windows
7486 2000 English version of the message. */
7487 const char *reason = "No mapping for the Unicode character exists "
7488 "in the target code page.";
7489 /* each step cannot decode more than 1 character, but a character can be
7490 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007491 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007492 int insize;
7493 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 PyObject *errorHandler = NULL;
7495 PyObject *exc = NULL;
7496 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007497 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007498 DWORD err;
7499 int ret = -1;
7500
7501 assert(size > 0);
7502
7503 encoding = code_page_name(code_page, &encoding_obj);
7504 if (encoding == NULL)
7505 return -1;
7506
Victor Stinner7d00cc12014-03-17 23:08:06 +01007507 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7509 UnicodeDecodeError. */
7510 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7511 if (exc != NULL) {
7512 PyCodec_StrictErrors(exc);
7513 Py_CLEAR(exc);
7514 }
7515 goto error;
7516 }
7517
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007518 /* Extend a wchar_t* buffer */
7519 Py_ssize_t n = *bufsize; /* Get the current length */
7520 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7521 PyErr_NoMemory();
7522 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007524 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7525 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007527 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007528
7529 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 while (in < endin)
7531 {
7532 /* Decode a character */
7533 insize = 1;
7534 do
7535 {
7536 outsize = MultiByteToWideChar(code_page, flags,
7537 in, insize,
7538 buffer, Py_ARRAY_LENGTH(buffer));
7539 if (outsize > 0)
7540 break;
7541 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007542 if (err == ERROR_INVALID_FLAGS && flags) {
7543 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7544 flags = 0;
7545 continue;
7546 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 if (err != ERROR_NO_UNICODE_TRANSLATION
7548 && err != ERROR_INSUFFICIENT_BUFFER)
7549 {
7550 PyErr_SetFromWindowsErr(0);
7551 goto error;
7552 }
7553 insize++;
7554 }
7555 /* 4=maximum length of a UTF-8 sequence */
7556 while (insize <= 4 && (in + insize) <= endin);
7557
7558 if (outsize <= 0) {
7559 Py_ssize_t startinpos, endinpos, outpos;
7560
Victor Stinner7d00cc12014-03-17 23:08:06 +01007561 /* last character in partial decode? */
7562 if (in + insize >= endin && !final)
7563 break;
7564
Victor Stinner3a50e702011-10-18 21:21:00 +02007565 startinpos = in - startin;
7566 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007567 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007568 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007569 errors, &errorHandler,
7570 encoding, reason,
7571 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007572 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007573 {
7574 goto error;
7575 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007576 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007577 }
7578 else {
7579 in += insize;
7580 memcpy(out, buffer, outsize * sizeof(wchar_t));
7581 out += outsize;
7582 }
7583 }
7584
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007585 /* Shrink the buffer */
7586 assert(out - *buf <= *bufsize);
7587 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007588 /* (in - startin) <= size and size is an int */
7589 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007590
7591error:
7592 Py_XDECREF(encoding_obj);
7593 Py_XDECREF(errorHandler);
7594 Py_XDECREF(exc);
7595 return ret;
7596}
7597
Victor Stinner3a50e702011-10-18 21:21:00 +02007598static PyObject *
7599decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007600 const char *s, Py_ssize_t size,
7601 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007602{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007603 wchar_t *buf = NULL;
7604 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007605 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007606
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 if (code_page < 0) {
7608 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7609 return NULL;
7610 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007611 if (size < 0) {
7612 PyErr_BadInternalCall();
7613 return NULL;
7614 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007615
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007616 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007618
Victor Stinner76a31a62011-11-04 00:05:13 +01007619 do
7620 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007621#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007622 if (size > DECODING_CHUNK_SIZE) {
7623 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007624 final = 0;
7625 done = 0;
7626 }
7627 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007628#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007629 {
7630 chunk_size = (int)size;
7631 final = (consumed == NULL);
7632 done = 1;
7633 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007634
Victor Stinner76a31a62011-11-04 00:05:13 +01007635 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007636 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007637 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007638 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007639 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007640
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007641 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007642 s, chunk_size);
7643 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007644 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007645 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007646 errors, final);
7647 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007648
7649 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007650 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007651 return NULL;
7652 }
7653
7654 if (consumed)
7655 *consumed += converted;
7656
7657 s += converted;
7658 size -= converted;
7659 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007660
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007661 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7662 PyMem_Free(buf);
7663 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007664}
7665
Alexander Belopolsky40018472011-02-26 01:02:56 +00007666PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007667PyUnicode_DecodeCodePageStateful(int code_page,
7668 const char *s,
7669 Py_ssize_t size,
7670 const char *errors,
7671 Py_ssize_t *consumed)
7672{
7673 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7674}
7675
7676PyObject *
7677PyUnicode_DecodeMBCSStateful(const char *s,
7678 Py_ssize_t size,
7679 const char *errors,
7680 Py_ssize_t *consumed)
7681{
7682 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7683}
7684
7685PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007686PyUnicode_DecodeMBCS(const char *s,
7687 Py_ssize_t size,
7688 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007689{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007690 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7691}
7692
Victor Stinner3a50e702011-10-18 21:21:00 +02007693static DWORD
7694encode_code_page_flags(UINT code_page, const char *errors)
7695{
7696 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007697 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007698 }
7699 else if (code_page == CP_UTF7) {
7700 /* CP_UTF7 only supports flags=0 */
7701 return 0;
7702 }
7703 else {
7704 if (errors != NULL && strcmp(errors, "replace") == 0)
7705 return 0;
7706 else
7707 return WC_NO_BEST_FIT_CHARS;
7708 }
7709}
7710
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007711/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007712 * Encode a Unicode string to a Windows code page into a byte string in strict
7713 * mode.
7714 *
7715 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007716 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007717 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007718static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007719encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007720 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007721 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007722{
Victor Stinner554f3f02010-06-16 23:33:54 +00007723 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007724 BOOL *pusedDefaultChar = &usedDefaultChar;
7725 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007726 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007727 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007728 const DWORD flags = encode_code_page_flags(code_page, NULL);
7729 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007730 /* Create a substring so that we can get the UTF-16 representation
7731 of just the slice under consideration. */
7732 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007733
Martin v. Löwis3d325192011-11-04 18:23:06 +01007734 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007735
Victor Stinner3a50e702011-10-18 21:21:00 +02007736 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007737 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007738 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007739 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007740
Victor Stinner2fc507f2011-11-04 20:06:39 +01007741 substring = PyUnicode_Substring(unicode, offset, offset+len);
7742 if (substring == NULL)
7743 return -1;
7744 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7745 if (p == NULL) {
7746 Py_DECREF(substring);
7747 return -1;
7748 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007749 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007750
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007751 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007752 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007753 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007754 NULL, 0,
7755 NULL, pusedDefaultChar);
7756 if (outsize <= 0)
7757 goto error;
7758 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007759 if (pusedDefaultChar && *pusedDefaultChar) {
7760 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007761 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007762 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007763
Victor Stinner3a50e702011-10-18 21:21:00 +02007764 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007766 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007767 if (*outbytes == NULL) {
7768 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007770 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007771 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007772 }
7773 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007775 const Py_ssize_t n = PyBytes_Size(*outbytes);
7776 if (outsize > PY_SSIZE_T_MAX - n) {
7777 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007778 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007780 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007781 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7782 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007783 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007784 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007785 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007786 }
7787
7788 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007789 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007790 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007791 out, outsize,
7792 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007793 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007794 if (outsize <= 0)
7795 goto error;
7796 if (pusedDefaultChar && *pusedDefaultChar)
7797 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007798 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007799
Victor Stinner3a50e702011-10-18 21:21:00 +02007800error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007801 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007802 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7803 return -2;
7804 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007805 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007806}
7807
Victor Stinner3a50e702011-10-18 21:21:00 +02007808/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007809 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007810 * error handler.
7811 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007812 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007813 * -1 on other error.
7814 */
7815static int
7816encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007817 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007818 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007819{
Victor Stinner3a50e702011-10-18 21:21:00 +02007820 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007821 Py_ssize_t pos = unicode_offset;
7822 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007823 /* Ideally, we should get reason from FormatMessage. This is the Windows
7824 2000 English version of the message. */
7825 const char *reason = "invalid character";
7826 /* 4=maximum length of a UTF-8 sequence */
7827 char buffer[4];
7828 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7829 Py_ssize_t outsize;
7830 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007831 PyObject *errorHandler = NULL;
7832 PyObject *exc = NULL;
7833 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007834 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007835 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007836 PyObject *rep;
7837 int ret = -1;
7838
7839 assert(insize > 0);
7840
7841 encoding = code_page_name(code_page, &encoding_obj);
7842 if (encoding == NULL)
7843 return -1;
7844
7845 if (errors == NULL || strcmp(errors, "strict") == 0) {
7846 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7847 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007848 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007849 if (exc != NULL) {
7850 PyCodec_StrictErrors(exc);
7851 Py_DECREF(exc);
7852 }
7853 Py_XDECREF(encoding_obj);
7854 return -1;
7855 }
7856
7857 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7858 pusedDefaultChar = &usedDefaultChar;
7859 else
7860 pusedDefaultChar = NULL;
7861
7862 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7863 PyErr_NoMemory();
7864 goto error;
7865 }
7866 outsize = insize * Py_ARRAY_LENGTH(buffer);
7867
7868 if (*outbytes == NULL) {
7869 /* Create string object */
7870 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7871 if (*outbytes == NULL)
7872 goto error;
7873 out = PyBytes_AS_STRING(*outbytes);
7874 }
7875 else {
7876 /* Extend string object */
7877 Py_ssize_t n = PyBytes_Size(*outbytes);
7878 if (n > PY_SSIZE_T_MAX - outsize) {
7879 PyErr_NoMemory();
7880 goto error;
7881 }
7882 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7883 goto error;
7884 out = PyBytes_AS_STRING(*outbytes) + n;
7885 }
7886
7887 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007888 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007889 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007890 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7891 wchar_t chars[2];
7892 int charsize;
7893 if (ch < 0x10000) {
7894 chars[0] = (wchar_t)ch;
7895 charsize = 1;
7896 }
7897 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007898 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7899 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007900 charsize = 2;
7901 }
7902
Victor Stinner3a50e702011-10-18 21:21:00 +02007903 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007904 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007905 buffer, Py_ARRAY_LENGTH(buffer),
7906 NULL, pusedDefaultChar);
7907 if (outsize > 0) {
7908 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7909 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007910 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007911 memcpy(out, buffer, outsize);
7912 out += outsize;
7913 continue;
7914 }
7915 }
7916 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7917 PyErr_SetFromWindowsErr(0);
7918 goto error;
7919 }
7920
Victor Stinner3a50e702011-10-18 21:21:00 +02007921 rep = unicode_encode_call_errorhandler(
7922 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007923 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007924 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007925 if (rep == NULL)
7926 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007927 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007928
7929 if (PyBytes_Check(rep)) {
7930 outsize = PyBytes_GET_SIZE(rep);
7931 if (outsize != 1) {
7932 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7933 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7934 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7935 Py_DECREF(rep);
7936 goto error;
7937 }
7938 out = PyBytes_AS_STRING(*outbytes) + offset;
7939 }
7940 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7941 out += outsize;
7942 }
7943 else {
7944 Py_ssize_t i;
7945 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007946 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007947
Benjamin Petersonbac79492012-01-14 13:34:47 -05007948 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007949 Py_DECREF(rep);
7950 goto error;
7951 }
7952
7953 outsize = PyUnicode_GET_LENGTH(rep);
7954 if (outsize != 1) {
7955 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7956 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7957 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7958 Py_DECREF(rep);
7959 goto error;
7960 }
7961 out = PyBytes_AS_STRING(*outbytes) + offset;
7962 }
7963 kind = PyUnicode_KIND(rep);
7964 data = PyUnicode_DATA(rep);
7965 for (i=0; i < outsize; i++) {
7966 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7967 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007968 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007969 encoding, unicode,
7970 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007971 "unable to encode error handler result to ASCII");
7972 Py_DECREF(rep);
7973 goto error;
7974 }
7975 *out = (unsigned char)ch;
7976 out++;
7977 }
7978 }
7979 Py_DECREF(rep);
7980 }
7981 /* write a NUL byte */
7982 *out = 0;
7983 outsize = out - PyBytes_AS_STRING(*outbytes);
7984 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7985 if (_PyBytes_Resize(outbytes, outsize) < 0)
7986 goto error;
7987 ret = 0;
7988
7989error:
7990 Py_XDECREF(encoding_obj);
7991 Py_XDECREF(errorHandler);
7992 Py_XDECREF(exc);
7993 return ret;
7994}
7995
Victor Stinner3a50e702011-10-18 21:21:00 +02007996static PyObject *
7997encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007998 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007999 const char *errors)
8000{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008001 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008002 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008003 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008004 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008005
Victor Stinner29dacf22015-01-26 16:41:32 +01008006 if (!PyUnicode_Check(unicode)) {
8007 PyErr_BadArgument();
8008 return NULL;
8009 }
8010
Benjamin Petersonbac79492012-01-14 13:34:47 -05008011 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008012 return NULL;
8013 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008014
Victor Stinner3a50e702011-10-18 21:21:00 +02008015 if (code_page < 0) {
8016 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8017 return NULL;
8018 }
8019
Martin v. Löwis3d325192011-11-04 18:23:06 +01008020 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008021 return PyBytes_FromStringAndSize(NULL, 0);
8022
Victor Stinner7581cef2011-11-03 22:32:33 +01008023 offset = 0;
8024 do
8025 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008026#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008027 if (len > DECODING_CHUNK_SIZE) {
8028 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008029 done = 0;
8030 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008031 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008032#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008033 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008034 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008035 done = 1;
8036 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008037
Victor Stinner76a31a62011-11-04 00:05:13 +01008038 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008039 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008040 errors);
8041 if (ret == -2)
8042 ret = encode_code_page_errors(code_page, &outbytes,
8043 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008044 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008045 if (ret < 0) {
8046 Py_XDECREF(outbytes);
8047 return NULL;
8048 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008049
Victor Stinner7581cef2011-11-03 22:32:33 +01008050 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008051 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008052 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008053
Victor Stinner3a50e702011-10-18 21:21:00 +02008054 return outbytes;
8055}
8056
8057PyObject *
8058PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8059 Py_ssize_t size,
8060 const char *errors)
8061{
Victor Stinner7581cef2011-11-03 22:32:33 +01008062 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008063 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008064 if (unicode == NULL)
8065 return NULL;
8066 res = encode_code_page(CP_ACP, unicode, errors);
8067 Py_DECREF(unicode);
8068 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008069}
8070
8071PyObject *
8072PyUnicode_EncodeCodePage(int code_page,
8073 PyObject *unicode,
8074 const char *errors)
8075{
Victor Stinner7581cef2011-11-03 22:32:33 +01008076 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008077}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008078
Alexander Belopolsky40018472011-02-26 01:02:56 +00008079PyObject *
8080PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008081{
Victor Stinner7581cef2011-11-03 22:32:33 +01008082 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008083}
8084
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008085#undef NEED_RETRY
8086
Steve Dowercc16be82016-09-08 10:35:16 -07008087#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008088
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089/* --- Character Mapping Codec -------------------------------------------- */
8090
Victor Stinnerfb161b12013-04-18 01:44:27 +02008091static int
8092charmap_decode_string(const char *s,
8093 Py_ssize_t size,
8094 PyObject *mapping,
8095 const char *errors,
8096 _PyUnicodeWriter *writer)
8097{
8098 const char *starts = s;
8099 const char *e;
8100 Py_ssize_t startinpos, endinpos;
8101 PyObject *errorHandler = NULL, *exc = NULL;
8102 Py_ssize_t maplen;
8103 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008104 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008105 Py_UCS4 x;
8106 unsigned char ch;
8107
8108 if (PyUnicode_READY(mapping) == -1)
8109 return -1;
8110
8111 maplen = PyUnicode_GET_LENGTH(mapping);
8112 mapdata = PyUnicode_DATA(mapping);
8113 mapkind = PyUnicode_KIND(mapping);
8114
8115 e = s + size;
8116
8117 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8118 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8119 * is disabled in encoding aliases, latin1 is preferred because
8120 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008121 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008122 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8123 Py_UCS4 maxchar = writer->maxchar;
8124
8125 assert (writer->kind == PyUnicode_1BYTE_KIND);
8126 while (s < e) {
8127 ch = *s;
8128 x = mapdata_ucs1[ch];
8129 if (x > maxchar) {
8130 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8131 goto onError;
8132 maxchar = writer->maxchar;
8133 outdata = (Py_UCS1 *)writer->data;
8134 }
8135 outdata[writer->pos] = x;
8136 writer->pos++;
8137 ++s;
8138 }
8139 return 0;
8140 }
8141
8142 while (s < e) {
8143 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8144 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008145 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008146 if (outkind == PyUnicode_1BYTE_KIND) {
8147 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8148 Py_UCS4 maxchar = writer->maxchar;
8149 while (s < e) {
8150 ch = *s;
8151 x = mapdata_ucs2[ch];
8152 if (x > maxchar)
8153 goto Error;
8154 outdata[writer->pos] = x;
8155 writer->pos++;
8156 ++s;
8157 }
8158 break;
8159 }
8160 else if (outkind == PyUnicode_2BYTE_KIND) {
8161 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8162 while (s < e) {
8163 ch = *s;
8164 x = mapdata_ucs2[ch];
8165 if (x == 0xFFFE)
8166 goto Error;
8167 outdata[writer->pos] = x;
8168 writer->pos++;
8169 ++s;
8170 }
8171 break;
8172 }
8173 }
8174 ch = *s;
8175
8176 if (ch < maplen)
8177 x = PyUnicode_READ(mapkind, mapdata, ch);
8178 else
8179 x = 0xfffe; /* invalid value */
8180Error:
8181 if (x == 0xfffe)
8182 {
8183 /* undefined mapping */
8184 startinpos = s-starts;
8185 endinpos = startinpos+1;
8186 if (unicode_decode_call_errorhandler_writer(
8187 errors, &errorHandler,
8188 "charmap", "character maps to <undefined>",
8189 &starts, &e, &startinpos, &endinpos, &exc, &s,
8190 writer)) {
8191 goto onError;
8192 }
8193 continue;
8194 }
8195
8196 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8197 goto onError;
8198 ++s;
8199 }
8200 Py_XDECREF(errorHandler);
8201 Py_XDECREF(exc);
8202 return 0;
8203
8204onError:
8205 Py_XDECREF(errorHandler);
8206 Py_XDECREF(exc);
8207 return -1;
8208}
8209
8210static int
8211charmap_decode_mapping(const char *s,
8212 Py_ssize_t size,
8213 PyObject *mapping,
8214 const char *errors,
8215 _PyUnicodeWriter *writer)
8216{
8217 const char *starts = s;
8218 const char *e;
8219 Py_ssize_t startinpos, endinpos;
8220 PyObject *errorHandler = NULL, *exc = NULL;
8221 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008222 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008223
8224 e = s + size;
8225
8226 while (s < e) {
8227 ch = *s;
8228
8229 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8230 key = PyLong_FromLong((long)ch);
8231 if (key == NULL)
8232 goto onError;
8233
8234 item = PyObject_GetItem(mapping, key);
8235 Py_DECREF(key);
8236 if (item == NULL) {
8237 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8238 /* No mapping found means: mapping is undefined. */
8239 PyErr_Clear();
8240 goto Undefined;
8241 } else
8242 goto onError;
8243 }
8244
8245 /* Apply mapping */
8246 if (item == Py_None)
8247 goto Undefined;
8248 if (PyLong_Check(item)) {
8249 long value = PyLong_AS_LONG(item);
8250 if (value == 0xFFFE)
8251 goto Undefined;
8252 if (value < 0 || value > MAX_UNICODE) {
8253 PyErr_Format(PyExc_TypeError,
8254 "character mapping must be in range(0x%lx)",
8255 (unsigned long)MAX_UNICODE + 1);
8256 goto onError;
8257 }
8258
8259 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8260 goto onError;
8261 }
8262 else if (PyUnicode_Check(item)) {
8263 if (PyUnicode_READY(item) == -1)
8264 goto onError;
8265 if (PyUnicode_GET_LENGTH(item) == 1) {
8266 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8267 if (value == 0xFFFE)
8268 goto Undefined;
8269 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8270 goto onError;
8271 }
8272 else {
8273 writer->overallocate = 1;
8274 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8275 goto onError;
8276 }
8277 }
8278 else {
8279 /* wrong return value */
8280 PyErr_SetString(PyExc_TypeError,
8281 "character mapping must return integer, None or str");
8282 goto onError;
8283 }
8284 Py_CLEAR(item);
8285 ++s;
8286 continue;
8287
8288Undefined:
8289 /* undefined mapping */
8290 Py_CLEAR(item);
8291 startinpos = s-starts;
8292 endinpos = startinpos+1;
8293 if (unicode_decode_call_errorhandler_writer(
8294 errors, &errorHandler,
8295 "charmap", "character maps to <undefined>",
8296 &starts, &e, &startinpos, &endinpos, &exc, &s,
8297 writer)) {
8298 goto onError;
8299 }
8300 }
8301 Py_XDECREF(errorHandler);
8302 Py_XDECREF(exc);
8303 return 0;
8304
8305onError:
8306 Py_XDECREF(item);
8307 Py_XDECREF(errorHandler);
8308 Py_XDECREF(exc);
8309 return -1;
8310}
8311
Alexander Belopolsky40018472011-02-26 01:02:56 +00008312PyObject *
8313PyUnicode_DecodeCharmap(const char *s,
8314 Py_ssize_t size,
8315 PyObject *mapping,
8316 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008318 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008319
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320 /* Default to Latin-1 */
8321 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008325 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008326 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008327 writer.min_length = size;
8328 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008330
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008331 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008332 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8333 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008334 }
8335 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008336 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8337 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008339 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008340
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008342 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343 return NULL;
8344}
8345
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008346/* Charmap encoding: the lookup table */
8347
Alexander Belopolsky40018472011-02-26 01:02:56 +00008348struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 PyObject_HEAD
8350 unsigned char level1[32];
8351 int count2, count3;
8352 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008353};
8354
8355static PyObject*
8356encoding_map_size(PyObject *obj, PyObject* args)
8357{
8358 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008359 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008361}
8362
8363static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008364 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 PyDoc_STR("Return the size (in bytes) of this object") },
8366 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008367};
8368
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008369static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008370 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 "EncodingMap", /*tp_name*/
8372 sizeof(struct encoding_map), /*tp_basicsize*/
8373 0, /*tp_itemsize*/
8374 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008375 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008376 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 0, /*tp_getattr*/
8378 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008379 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 0, /*tp_repr*/
8381 0, /*tp_as_number*/
8382 0, /*tp_as_sequence*/
8383 0, /*tp_as_mapping*/
8384 0, /*tp_hash*/
8385 0, /*tp_call*/
8386 0, /*tp_str*/
8387 0, /*tp_getattro*/
8388 0, /*tp_setattro*/
8389 0, /*tp_as_buffer*/
8390 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8391 0, /*tp_doc*/
8392 0, /*tp_traverse*/
8393 0, /*tp_clear*/
8394 0, /*tp_richcompare*/
8395 0, /*tp_weaklistoffset*/
8396 0, /*tp_iter*/
8397 0, /*tp_iternext*/
8398 encoding_map_methods, /*tp_methods*/
8399 0, /*tp_members*/
8400 0, /*tp_getset*/
8401 0, /*tp_base*/
8402 0, /*tp_dict*/
8403 0, /*tp_descr_get*/
8404 0, /*tp_descr_set*/
8405 0, /*tp_dictoffset*/
8406 0, /*tp_init*/
8407 0, /*tp_alloc*/
8408 0, /*tp_new*/
8409 0, /*tp_free*/
8410 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008411};
8412
8413PyObject*
8414PyUnicode_BuildEncodingMap(PyObject* string)
8415{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008416 PyObject *result;
8417 struct encoding_map *mresult;
8418 int i;
8419 int need_dict = 0;
8420 unsigned char level1[32];
8421 unsigned char level2[512];
8422 unsigned char *mlevel1, *mlevel2, *mlevel3;
8423 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008425 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008426 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008428
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008429 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008430 PyErr_BadArgument();
8431 return NULL;
8432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 kind = PyUnicode_KIND(string);
8434 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008435 length = PyUnicode_GET_LENGTH(string);
8436 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008437 memset(level1, 0xFF, sizeof level1);
8438 memset(level2, 0xFF, sizeof level2);
8439
8440 /* If there isn't a one-to-one mapping of NULL to \0,
8441 or if there are non-BMP characters, we need to use
8442 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008443 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008444 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008445 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008446 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008447 ch = PyUnicode_READ(kind, data, i);
8448 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008449 need_dict = 1;
8450 break;
8451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008453 /* unmapped character */
8454 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 l1 = ch >> 11;
8456 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008457 if (level1[l1] == 0xFF)
8458 level1[l1] = count2++;
8459 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008460 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008461 }
8462
8463 if (count2 >= 0xFF || count3 >= 0xFF)
8464 need_dict = 1;
8465
8466 if (need_dict) {
8467 PyObject *result = PyDict_New();
8468 PyObject *key, *value;
8469 if (!result)
8470 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008471 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008473 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008474 if (!key || !value)
8475 goto failed1;
8476 if (PyDict_SetItem(result, key, value) == -1)
8477 goto failed1;
8478 Py_DECREF(key);
8479 Py_DECREF(value);
8480 }
8481 return result;
8482 failed1:
8483 Py_XDECREF(key);
8484 Py_XDECREF(value);
8485 Py_DECREF(result);
8486 return NULL;
8487 }
8488
8489 /* Create a three-level trie */
8490 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8491 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008492 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008493 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008494 }
8495
8496 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008497 mresult = (struct encoding_map*)result;
8498 mresult->count2 = count2;
8499 mresult->count3 = count3;
8500 mlevel1 = mresult->level1;
8501 mlevel2 = mresult->level23;
8502 mlevel3 = mresult->level23 + 16*count2;
8503 memcpy(mlevel1, level1, 32);
8504 memset(mlevel2, 0xFF, 16*count2);
8505 memset(mlevel3, 0, 128*count3);
8506 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008507 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008508 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008509 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8510 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008511 /* unmapped character */
8512 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008513 o1 = ch>>11;
8514 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008515 i2 = 16*mlevel1[o1] + o2;
8516 if (mlevel2[i2] == 0xFF)
8517 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008518 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008519 i3 = 128*mlevel2[i2] + o3;
8520 mlevel3[i3] = i;
8521 }
8522 return result;
8523}
8524
8525static int
Victor Stinner22168992011-11-20 17:09:18 +01008526encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008527{
8528 struct encoding_map *map = (struct encoding_map*)mapping;
8529 int l1 = c>>11;
8530 int l2 = (c>>7) & 0xF;
8531 int l3 = c & 0x7F;
8532 int i;
8533
Victor Stinner22168992011-11-20 17:09:18 +01008534 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008536 if (c == 0)
8537 return 0;
8538 /* level 1*/
8539 i = map->level1[l1];
8540 if (i == 0xFF) {
8541 return -1;
8542 }
8543 /* level 2*/
8544 i = map->level23[16*i+l2];
8545 if (i == 0xFF) {
8546 return -1;
8547 }
8548 /* level 3 */
8549 i = map->level23[16*map->count2 + 128*i + l3];
8550 if (i == 0) {
8551 return -1;
8552 }
8553 return i;
8554}
8555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556/* Lookup the character ch in the mapping. If the character
8557 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008558 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008559static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008560charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561{
Christian Heimes217cfd12007-12-02 14:31:20 +00008562 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 PyObject *x;
8564
8565 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 x = PyObject_GetItem(mapping, w);
8568 Py_DECREF(w);
8569 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8571 /* No mapping found means: mapping is undefined. */
8572 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008573 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 } else
8575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008577 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008579 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 long value = PyLong_AS_LONG(x);
8581 if (value < 0 || value > 255) {
8582 PyErr_SetString(PyExc_TypeError,
8583 "character mapping must be in range(256)");
8584 Py_DECREF(x);
8585 return NULL;
8586 }
8587 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008589 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 /* wrong return value */
8593 PyErr_Format(PyExc_TypeError,
8594 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008595 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 Py_DECREF(x);
8597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598 }
8599}
8600
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008601static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008602charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008603{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008604 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8605 /* exponentially overallocate to minimize reallocations */
8606 if (requiredsize < 2*outsize)
8607 requiredsize = 2*outsize;
8608 if (_PyBytes_Resize(outobj, requiredsize))
8609 return -1;
8610 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008611}
8612
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008617 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618 space is available. Return a new reference to the object that
8619 was put in the output buffer, or Py_None, if the mapping was undefined
8620 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008621 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008622static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008623charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008624 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008626 PyObject *rep;
8627 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008628 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629
Andy Lesterdffe4c02020-03-04 07:15:20 -06008630 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008631 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008633 if (res == -1)
8634 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 if (outsize<requiredsize)
8636 if (charmapencode_resize(outobj, outpos, requiredsize))
8637 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008638 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 outstart[(*outpos)++] = (char)res;
8640 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008641 }
8642
8643 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008646 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 Py_DECREF(rep);
8648 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008649 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 if (PyLong_Check(rep)) {
8651 Py_ssize_t requiredsize = *outpos+1;
8652 if (outsize<requiredsize)
8653 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8654 Py_DECREF(rep);
8655 return enc_EXCEPTION;
8656 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008657 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008659 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 else {
8661 const char *repchars = PyBytes_AS_STRING(rep);
8662 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8663 Py_ssize_t requiredsize = *outpos+repsize;
8664 if (outsize<requiredsize)
8665 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8666 Py_DECREF(rep);
8667 return enc_EXCEPTION;
8668 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008669 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 memcpy(outstart + *outpos, repchars, repsize);
8671 *outpos += repsize;
8672 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008674 Py_DECREF(rep);
8675 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676}
8677
8678/* handle an error in PyUnicode_EncodeCharmap
8679 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008680static int
8681charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008682 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008684 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008685 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686{
8687 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008688 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008689 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008690 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008691 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008692 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008694 Py_ssize_t collstartpos = *inpos;
8695 Py_ssize_t collendpos = *inpos+1;
8696 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008697 const char *encoding = "charmap";
8698 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008699 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008700 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008701 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702
Benjamin Petersonbac79492012-01-14 13:34:47 -05008703 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008704 return -1;
8705 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706 /* find all unencodable characters */
8707 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008708 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008709 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008710 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008711 val = encoding_map_lookup(ch, mapping);
8712 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 break;
8714 ++collendpos;
8715 continue;
8716 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008717
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008718 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8719 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 if (rep==NULL)
8721 return -1;
8722 else if (rep!=Py_None) {
8723 Py_DECREF(rep);
8724 break;
8725 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008726 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008728 }
8729 /* cache callback name lookup
8730 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008731 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008732 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008733
8734 switch (*error_handler) {
8735 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008736 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008737 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008738
8739 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008740 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 x = charmapencode_output('?', mapping, res, respos);
8742 if (x==enc_EXCEPTION) {
8743 return -1;
8744 }
8745 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008746 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 return -1;
8748 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008749 }
8750 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008751 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008752 *inpos = collendpos;
8753 break;
Victor Stinner50149202015-09-22 00:26:54 +02008754
8755 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008756 /* generate replacement (temporarily (mis)uses p) */
8757 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 char buffer[2+29+1+1];
8759 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008760 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 for (cp = buffer; *cp; ++cp) {
8762 x = charmapencode_output(*cp, mapping, res, respos);
8763 if (x==enc_EXCEPTION)
8764 return -1;
8765 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008766 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 return -1;
8768 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008769 }
8770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008771 *inpos = collendpos;
8772 break;
Victor Stinner50149202015-09-22 00:26:54 +02008773
Benjamin Peterson14339b62009-01-31 16:36:08 +00008774 default:
Victor Stinner50149202015-09-22 00:26:54 +02008775 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008776 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008778 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008780 if (PyBytes_Check(repunicode)) {
8781 /* Directly copy bytes result to output. */
8782 Py_ssize_t outsize = PyBytes_Size(*res);
8783 Py_ssize_t requiredsize;
8784 repsize = PyBytes_Size(repunicode);
8785 requiredsize = *respos + repsize;
8786 if (requiredsize > outsize)
8787 /* Make room for all additional bytes. */
8788 if (charmapencode_resize(res, respos, requiredsize)) {
8789 Py_DECREF(repunicode);
8790 return -1;
8791 }
8792 memcpy(PyBytes_AsString(*res) + *respos,
8793 PyBytes_AsString(repunicode), repsize);
8794 *respos += repsize;
8795 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008796 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008797 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008798 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008799 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008800 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008801 Py_DECREF(repunicode);
8802 return -1;
8803 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008804 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008805 data = PyUnicode_DATA(repunicode);
8806 kind = PyUnicode_KIND(repunicode);
8807 for (index = 0; index < repsize; index++) {
8808 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8809 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008810 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008811 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008812 return -1;
8813 }
8814 else if (x==enc_FAILED) {
8815 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008816 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008817 return -1;
8818 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008819 }
8820 *inpos = newpos;
8821 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008822 }
8823 return 0;
8824}
8825
Alexander Belopolsky40018472011-02-26 01:02:56 +00008826PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008827_PyUnicode_EncodeCharmap(PyObject *unicode,
8828 PyObject *mapping,
8829 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008830{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008831 /* output object */
8832 PyObject *res = NULL;
8833 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008834 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008835 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008836 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008837 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008838 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008839 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008840 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008841 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008842 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843
Benjamin Petersonbac79492012-01-14 13:34:47 -05008844 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008845 return NULL;
8846 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008847 data = PyUnicode_DATA(unicode);
8848 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008849
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850 /* Default to Latin-1 */
8851 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008852 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008854 /* allocate enough for a simple encoding without
8855 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008856 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008857 if (res == NULL)
8858 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008859 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008860 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008862 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008863 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008865 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008866 if (x==enc_EXCEPTION) /* error */
8867 goto onError;
8868 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008869 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008870 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008871 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008872 &res, &respos)) {
8873 goto onError;
8874 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008875 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 else
8877 /* done with this character => adjust input position */
8878 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008881 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008882 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008883 if (_PyBytes_Resize(&res, respos) < 0)
8884 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008885
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008886 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008887 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008888 return res;
8889
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008891 Py_XDECREF(res);
8892 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008893 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894 return NULL;
8895}
8896
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008897/* Deprecated */
8898PyObject *
8899PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8900 Py_ssize_t size,
8901 PyObject *mapping,
8902 const char *errors)
8903{
8904 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008905 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008906 if (unicode == NULL)
8907 return NULL;
8908 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8909 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008910 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008911}
8912
Alexander Belopolsky40018472011-02-26 01:02:56 +00008913PyObject *
8914PyUnicode_AsCharmapString(PyObject *unicode,
8915 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916{
8917 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 PyErr_BadArgument();
8919 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008921 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922}
8923
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008924/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008925static void
8926make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008928 Py_ssize_t startpos, Py_ssize_t endpos,
8929 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008931 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 *exceptionObject = _PyUnicodeTranslateError_Create(
8933 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934 }
8935 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008936 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8937 goto onError;
8938 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8939 goto onError;
8940 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8941 goto onError;
8942 return;
8943 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008944 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945 }
8946}
8947
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008948/* error handling callback helper:
8949 build arguments, call the callback and check the arguments,
8950 put the result into newpos and return the replacement string, which
8951 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008952static PyObject *
8953unicode_translate_call_errorhandler(const char *errors,
8954 PyObject **errorHandler,
8955 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008957 Py_ssize_t startpos, Py_ssize_t endpos,
8958 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008959{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008960 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008961
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008962 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008963 PyObject *restuple;
8964 PyObject *resunicode;
8965
8966 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008967 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008968 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008970 }
8971
8972 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008974 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008976
Petr Viktorinffd97532020-02-11 17:46:57 +01008977 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008978 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008979 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008980 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008981 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 Py_DECREF(restuple);
8983 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008984 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008985 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008986 &resunicode, &i_newpos)) {
8987 Py_DECREF(restuple);
8988 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008989 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008990 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008992 else
8993 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008995 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008996 Py_DECREF(restuple);
8997 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008998 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008999 Py_INCREF(resunicode);
9000 Py_DECREF(restuple);
9001 return resunicode;
9002}
9003
9004/* Lookup the character ch in the mapping and put the result in result,
9005 which must be decrefed by the caller.
9006 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009007static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009009{
Christian Heimes217cfd12007-12-02 14:31:20 +00009010 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009011 PyObject *x;
9012
9013 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009015 x = PyObject_GetItem(mapping, w);
9016 Py_DECREF(w);
9017 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009018 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9019 /* No mapping found means: use 1:1 mapping. */
9020 PyErr_Clear();
9021 *result = NULL;
9022 return 0;
9023 } else
9024 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009025 }
9026 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009027 *result = x;
9028 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009029 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009030 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009032 if (value < 0 || value > MAX_UNICODE) {
9033 PyErr_Format(PyExc_ValueError,
9034 "character mapping must be in range(0x%x)",
9035 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 Py_DECREF(x);
9037 return -1;
9038 }
9039 *result = x;
9040 return 0;
9041 }
9042 else if (PyUnicode_Check(x)) {
9043 *result = x;
9044 return 0;
9045 }
9046 else {
9047 /* wrong return value */
9048 PyErr_SetString(PyExc_TypeError,
9049 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009050 Py_DECREF(x);
9051 return -1;
9052 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009053}
Victor Stinner1194ea02014-04-04 19:37:40 +02009054
9055/* lookup the character, write the result into the writer.
9056 Return 1 if the result was written into the writer, return 0 if the mapping
9057 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009058static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009059charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9060 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009061{
Victor Stinner1194ea02014-04-04 19:37:40 +02009062 PyObject *item;
9063
9064 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009065 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009066
9067 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009069 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009071 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009072 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009073 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009074
9075 if (item == Py_None) {
9076 Py_DECREF(item);
9077 return 0;
9078 }
9079
9080 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009081 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9082 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9083 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009084 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9085 Py_DECREF(item);
9086 return -1;
9087 }
9088 Py_DECREF(item);
9089 return 1;
9090 }
9091
9092 if (!PyUnicode_Check(item)) {
9093 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009095 }
9096
9097 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9098 Py_DECREF(item);
9099 return -1;
9100 }
9101
9102 Py_DECREF(item);
9103 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009104}
9105
Victor Stinner89a76ab2014-04-05 11:44:04 +02009106static int
9107unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9108 Py_UCS1 *translate)
9109{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009110 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009111 int ret = 0;
9112
Victor Stinner89a76ab2014-04-05 11:44:04 +02009113 if (charmaptranslate_lookup(ch, mapping, &item)) {
9114 return -1;
9115 }
9116
9117 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009118 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009119 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009120 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009121 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009122 /* not found => default to 1:1 mapping */
9123 translate[ch] = ch;
9124 return 1;
9125 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009126 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009127 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009128 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9129 used it */
9130 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009131 /* invalid character or character outside ASCII:
9132 skip the fast translate */
9133 goto exit;
9134 }
9135 translate[ch] = (Py_UCS1)replace;
9136 }
9137 else if (PyUnicode_Check(item)) {
9138 Py_UCS4 replace;
9139
9140 if (PyUnicode_READY(item) == -1) {
9141 Py_DECREF(item);
9142 return -1;
9143 }
9144 if (PyUnicode_GET_LENGTH(item) != 1)
9145 goto exit;
9146
9147 replace = PyUnicode_READ_CHAR(item, 0);
9148 if (replace > 127)
9149 goto exit;
9150 translate[ch] = (Py_UCS1)replace;
9151 }
9152 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009153 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009154 goto exit;
9155 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009156 ret = 1;
9157
Benjamin Peterson1365de72014-04-07 20:15:41 -04009158 exit:
9159 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009160 return ret;
9161}
9162
9163/* Fast path for ascii => ascii translation. Return 1 if the whole string
9164 was translated into writer, return 0 if the input string was partially
9165 translated into writer, raise an exception and return -1 on error. */
9166static int
9167unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009168 _PyUnicodeWriter *writer, int ignore,
9169 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009170{
Victor Stinner872b2912014-04-05 14:27:07 +02009171 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009172 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009173 const Py_UCS1 *in, *end;
9174 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009175 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009176
Victor Stinner89a76ab2014-04-05 11:44:04 +02009177 len = PyUnicode_GET_LENGTH(input);
9178
Victor Stinner872b2912014-04-05 14:27:07 +02009179 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009180
9181 in = PyUnicode_1BYTE_DATA(input);
9182 end = in + len;
9183
9184 assert(PyUnicode_IS_ASCII(writer->buffer));
9185 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9186 out = PyUnicode_1BYTE_DATA(writer->buffer);
9187
Victor Stinner872b2912014-04-05 14:27:07 +02009188 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009189 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009190 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009191 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009192 int translate = unicode_fast_translate_lookup(mapping, ch,
9193 ascii_table);
9194 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009195 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009196 if (translate == 0)
9197 goto exit;
9198 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009199 }
Victor Stinner872b2912014-04-05 14:27:07 +02009200 if (ch2 == 0xfe) {
9201 if (ignore)
9202 continue;
9203 goto exit;
9204 }
9205 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009206 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009207 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009208 }
Victor Stinner872b2912014-04-05 14:27:07 +02009209 res = 1;
9210
9211exit:
9212 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009213 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009214 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009215}
9216
Victor Stinner3222da22015-10-01 22:07:32 +02009217static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218_PyUnicode_TranslateCharmap(PyObject *input,
9219 PyObject *mapping,
9220 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009223 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 Py_ssize_t size, i;
9225 int kind;
9226 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009227 _PyUnicodeWriter writer;
9228 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009229 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009230 PyObject *errorHandler = NULL;
9231 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009232 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009233 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009234
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009236 PyErr_BadArgument();
9237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 if (PyUnicode_READY(input) == -1)
9241 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009242 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243 kind = PyUnicode_KIND(input);
9244 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009246 if (size == 0)
9247 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009249 /* allocate enough for a simple 1:1 translation without
9250 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009251 _PyUnicodeWriter_Init(&writer);
9252 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009253 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254
Victor Stinner872b2912014-04-05 14:27:07 +02009255 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9256
Victor Stinner33798672016-03-01 21:59:58 +01009257 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009258 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009259 if (PyUnicode_IS_ASCII(input)) {
9260 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9261 if (res < 0) {
9262 _PyUnicodeWriter_Dealloc(&writer);
9263 return NULL;
9264 }
9265 if (res == 1)
9266 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009267 }
Victor Stinner33798672016-03-01 21:59:58 +01009268 else {
9269 i = 0;
9270 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009273 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009274 int translate;
9275 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9276 Py_ssize_t newpos;
9277 /* startpos for collecting untranslatable chars */
9278 Py_ssize_t collstart;
9279 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009280 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281
Victor Stinner1194ea02014-04-04 19:37:40 +02009282 ch = PyUnicode_READ(kind, data, i);
9283 translate = charmaptranslate_output(ch, mapping, &writer);
9284 if (translate < 0)
9285 goto onError;
9286
9287 if (translate != 0) {
9288 /* it worked => adjust input pointer */
9289 ++i;
9290 continue;
9291 }
9292
9293 /* untranslatable character */
9294 collstart = i;
9295 collend = i+1;
9296
9297 /* find all untranslatable characters */
9298 while (collend < size) {
9299 PyObject *x;
9300 ch = PyUnicode_READ(kind, data, collend);
9301 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009302 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009303 Py_XDECREF(x);
9304 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009305 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009306 ++collend;
9307 }
9308
9309 if (ignore) {
9310 i = collend;
9311 }
9312 else {
9313 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9314 reason, input, &exc,
9315 collstart, collend, &newpos);
9316 if (repunicode == NULL)
9317 goto onError;
9318 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009319 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009320 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009321 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009322 Py_DECREF(repunicode);
9323 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009324 }
9325 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009326 Py_XDECREF(exc);
9327 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009328 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329
Benjamin Peterson29060642009-01-31 22:14:21 +00009330 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009331 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009332 Py_XDECREF(exc);
9333 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009334 return NULL;
9335}
9336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337/* Deprecated. Use PyUnicode_Translate instead. */
9338PyObject *
9339PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9340 Py_ssize_t size,
9341 PyObject *mapping,
9342 const char *errors)
9343{
Christian Heimes5f520f42012-09-11 14:03:25 +02009344 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009345 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 if (!unicode)
9347 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009348 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9349 Py_DECREF(unicode);
9350 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351}
9352
Alexander Belopolsky40018472011-02-26 01:02:56 +00009353PyObject *
9354PyUnicode_Translate(PyObject *str,
9355 PyObject *mapping,
9356 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009358 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009359 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009360 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361}
Tim Petersced69f82003-09-16 20:30:58 +00009362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363PyObject *
9364_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9365{
9366 if (!PyUnicode_Check(unicode)) {
9367 PyErr_BadInternalCall();
9368 return NULL;
9369 }
9370 if (PyUnicode_READY(unicode) == -1)
9371 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009372 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373 /* If the string is already ASCII, just return the same string */
9374 Py_INCREF(unicode);
9375 return unicode;
9376 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009377
9378 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9379 PyObject *result = PyUnicode_New(len, 127);
9380 if (result == NULL) {
9381 return NULL;
9382 }
9383
9384 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9385 int kind = PyUnicode_KIND(unicode);
9386 const void *data = PyUnicode_DATA(unicode);
9387 Py_ssize_t i;
9388 for (i = 0; i < len; ++i) {
9389 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9390 if (ch < 127) {
9391 out[i] = ch;
9392 }
9393 else if (Py_UNICODE_ISSPACE(ch)) {
9394 out[i] = ' ';
9395 }
9396 else {
9397 int decimal = Py_UNICODE_TODECIMAL(ch);
9398 if (decimal < 0) {
9399 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009400 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009401 _PyUnicode_LENGTH(result) = i + 1;
9402 break;
9403 }
9404 out[i] = '0' + decimal;
9405 }
9406 }
9407
INADA Naoki16dfca42018-07-14 12:06:43 +09009408 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009409 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410}
9411
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009412PyObject *
9413PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9414 Py_ssize_t length)
9415{
Victor Stinnerf0124502011-11-21 23:12:56 +01009416 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009417 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009418 Py_UCS4 maxchar;
9419 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009420 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009421
Victor Stinner99d7ad02012-02-22 13:37:39 +01009422 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009423 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009424 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009425 if (ch > 127) {
9426 int decimal = Py_UNICODE_TODECIMAL(ch);
9427 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009428 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009429 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009430 }
9431 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009432
9433 /* Copy to a new string */
9434 decimal = PyUnicode_New(length, maxchar);
9435 if (decimal == NULL)
9436 return decimal;
9437 kind = PyUnicode_KIND(decimal);
9438 data = PyUnicode_DATA(decimal);
9439 /* Iterate over code points */
9440 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009441 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009442 if (ch > 127) {
9443 int decimal = Py_UNICODE_TODECIMAL(ch);
9444 if (decimal >= 0)
9445 ch = '0' + decimal;
9446 }
9447 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009449 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009450}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009451/* --- Decimal Encoder ---------------------------------------------------- */
9452
Alexander Belopolsky40018472011-02-26 01:02:56 +00009453int
9454PyUnicode_EncodeDecimal(Py_UNICODE *s,
9455 Py_ssize_t length,
9456 char *output,
9457 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009458{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009459 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009460 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009461 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009462 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009463
9464 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009465 PyErr_BadArgument();
9466 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009467 }
9468
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009469 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009470 if (unicode == NULL)
9471 return -1;
9472
Victor Stinner42bf7752011-11-21 22:52:58 +01009473 kind = PyUnicode_KIND(unicode);
9474 data = PyUnicode_DATA(unicode);
9475
Victor Stinnerb84d7232011-11-22 01:50:07 +01009476 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009477 PyObject *exc;
9478 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009479 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009480 Py_ssize_t startpos;
9481
9482 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009483
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009485 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009486 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009487 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009488 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009489 decimal = Py_UNICODE_TODECIMAL(ch);
9490 if (decimal >= 0) {
9491 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009492 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009493 continue;
9494 }
9495 if (0 < ch && ch < 256) {
9496 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009497 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009498 continue;
9499 }
Victor Stinner6345be92011-11-25 20:09:01 +01009500
Victor Stinner42bf7752011-11-21 22:52:58 +01009501 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009502 exc = NULL;
9503 raise_encode_exception(&exc, "decimal", unicode,
9504 startpos, startpos+1,
9505 "invalid decimal Unicode string");
9506 Py_XDECREF(exc);
9507 Py_DECREF(unicode);
9508 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009509 }
9510 /* 0-terminate the output string */
9511 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009512 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009513 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009514}
9515
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516/* --- Helpers ------------------------------------------------------------ */
9517
/* Helper macro to fix up start/end slice values against a length `len`:
   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and then clamped to 0.  (`start` is not clamped against `len`.)

   `start` and `end` must be modifiable lvalues, and every argument may be
   evaluated more than once, so pass plain variables.  The do/while(0)
   wrapper makes the expansion a single statement: write it followed by a
   semicolon, and it is then safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009534any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009536 Py_ssize_t end,
9537 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009539 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009540 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 Py_ssize_t len1, len2, result;
9542
9543 kind1 = PyUnicode_KIND(s1);
9544 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009545 if (kind1 < kind2)
9546 return -1;
9547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009548 len1 = PyUnicode_GET_LENGTH(s1);
9549 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009550 ADJUST_INDICES(start, end, len1);
9551 if (end - start < len2)
9552 return -1;
9553
9554 buf1 = PyUnicode_DATA(s1);
9555 buf2 = PyUnicode_DATA(s2);
9556 if (len2 == 1) {
9557 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9558 result = findchar((const char *)buf1 + kind1*start,
9559 kind1, end - start, ch, direction);
9560 if (result == -1)
9561 return -1;
9562 else
9563 return start + result;
9564 }
9565
9566 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009567 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009568 if (!buf2)
9569 return -2;
9570 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571
Victor Stinner794d5672011-10-10 03:21:36 +02009572 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009573 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009574 case PyUnicode_1BYTE_KIND:
9575 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9576 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9577 else
9578 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9579 break;
9580 case PyUnicode_2BYTE_KIND:
9581 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9582 break;
9583 case PyUnicode_4BYTE_KIND:
9584 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9585 break;
9586 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009587 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009588 }
9589 }
9590 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009591 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009592 case PyUnicode_1BYTE_KIND:
9593 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9594 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9595 else
9596 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9597 break;
9598 case PyUnicode_2BYTE_KIND:
9599 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9600 break;
9601 case PyUnicode_4BYTE_KIND:
9602 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9603 break;
9604 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009605 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 }
9608
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009609 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009610 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009611 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612
9613 return result;
9614}
9615
Victor Stinner59423e32018-11-26 13:40:01 +01009616/* _PyUnicode_InsertThousandsGrouping() helper functions */
9617#include "stringlib/localeutil.h"
9618
9619/**
9620 * InsertThousandsGrouping:
9621 * @writer: Unicode writer.
9622 * @n_buffer: Number of characters in @buffer.
9623 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9624 * @d_pos: Start of digits string.
9625 * @n_digits: The number of digits in the string, in which we want
9626 * to put the grouping chars.
9627 * @min_width: The minimum width of the digits in the output string.
9628 * Output will be zero-padded on the left to fill.
9629 * @grouping: see definition in localeconv().
9630 * @thousands_sep: see definition in localeconv().
9631 *
9632 * There are 2 modes: counting and filling. If @writer is NULL,
9633 * we are in counting mode, else filling mode.
9634 * If counting, the required buffer size is returned.
9635 * If filling, we know the buffer will be large enough, so we don't
9636 * need to pass in the buffer size.
9637 * Inserts thousand grouping characters (as defined by grouping and
9638 * thousands_sep) into @writer.
9639 *
9640 * Return value: -1 on error, number of characters otherwise.
9641 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009643_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009644 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009645 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009646 PyObject *digits,
9647 Py_ssize_t d_pos,
9648 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009649 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009650 const char *grouping,
9651 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009652 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653{
Xtreak3f7983a2019-01-07 20:39:14 +05309654 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009655 if (writer) {
9656 assert(digits != NULL);
9657 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009658 }
9659 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009660 assert(digits == NULL);
9661 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009662 }
Victor Stinner59423e32018-11-26 13:40:01 +01009663 assert(0 <= d_pos);
9664 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009665 assert(grouping != NULL);
9666
9667 if (digits != NULL) {
9668 if (PyUnicode_READY(digits) == -1) {
9669 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009670 }
Victor Stinner59423e32018-11-26 13:40:01 +01009671 }
9672 if (PyUnicode_READY(thousands_sep) == -1) {
9673 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009674 }
9675
Victor Stinner59423e32018-11-26 13:40:01 +01009676 Py_ssize_t count = 0;
9677 Py_ssize_t n_zeros;
9678 int loop_broken = 0;
9679 int use_separator = 0; /* First time through, don't append the
9680 separator. They only go between
9681 groups. */
9682 Py_ssize_t buffer_pos;
9683 Py_ssize_t digits_pos;
9684 Py_ssize_t len;
9685 Py_ssize_t n_chars;
9686 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9687 be looked at */
9688 /* A generator that returns all of the grouping widths, until it
9689 returns 0. */
9690 GroupGenerator groupgen;
9691 GroupGenerator_init(&groupgen, grouping);
9692 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9693
9694 /* if digits are not grouped, thousands separator
9695 should be an empty string */
9696 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9697
9698 digits_pos = d_pos + n_digits;
9699 if (writer) {
9700 buffer_pos = writer->pos + n_buffer;
9701 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9702 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 }
Victor Stinner59423e32018-11-26 13:40:01 +01009704 else {
9705 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009706 }
Victor Stinner59423e32018-11-26 13:40:01 +01009707
9708 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009709 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009710 }
Victor Stinner59423e32018-11-26 13:40:01 +01009711
9712 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9713 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9714 n_zeros = Py_MAX(0, len - remaining);
9715 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9716
9717 /* Use n_zero zero's and n_chars chars */
9718
9719 /* Count only, don't do anything. */
9720 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9721
9722 /* Copy into the writer. */
9723 InsertThousandsGrouping_fill(writer, &buffer_pos,
9724 digits, &digits_pos,
9725 n_chars, n_zeros,
9726 use_separator ? thousands_sep : NULL,
9727 thousands_sep_len, maxchar);
9728
9729 /* Use a separator next time. */
9730 use_separator = 1;
9731
9732 remaining -= n_chars;
9733 min_width -= len;
9734
9735 if (remaining <= 0 && min_width <= 0) {
9736 loop_broken = 1;
9737 break;
9738 }
9739 min_width -= thousands_sep_len;
9740 }
9741 if (!loop_broken) {
9742 /* We left the loop without using a break statement. */
9743
9744 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9745 n_zeros = Py_MAX(0, len - remaining);
9746 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9747
9748 /* Use n_zero zero's and n_chars chars */
9749 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9750
9751 /* Copy into the writer. */
9752 InsertThousandsGrouping_fill(writer, &buffer_pos,
9753 digits, &digits_pos,
9754 n_chars, n_zeros,
9755 use_separator ? thousands_sep : NULL,
9756 thousands_sep_len, maxchar);
9757 }
9758 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009759}
9760
9761
Alexander Belopolsky40018472011-02-26 01:02:56 +00009762Py_ssize_t
9763PyUnicode_Count(PyObject *str,
9764 PyObject *substr,
9765 Py_ssize_t start,
9766 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009767{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009768 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009769 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009770 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009771 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009772
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009773 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009774 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009775
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009776 kind1 = PyUnicode_KIND(str);
9777 kind2 = PyUnicode_KIND(substr);
9778 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009779 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009780
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009781 len1 = PyUnicode_GET_LENGTH(str);
9782 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009784 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009785 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009786
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009787 buf1 = PyUnicode_DATA(str);
9788 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009789 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009790 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009791 if (!buf2)
9792 goto onError;
9793 }
9794
9795 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009797 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009798 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009799 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009800 buf2, len2, PY_SSIZE_T_MAX
9801 );
9802 else
9803 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009804 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009805 buf2, len2, PY_SSIZE_T_MAX
9806 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 break;
9808 case PyUnicode_2BYTE_KIND:
9809 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009810 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 buf2, len2, PY_SSIZE_T_MAX
9812 );
9813 break;
9814 case PyUnicode_4BYTE_KIND:
9815 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009816 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 buf2, len2, PY_SSIZE_T_MAX
9818 );
9819 break;
9820 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009821 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009823
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009824 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009825 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009826 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009830 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9831 if (kind2 != kind1)
9832 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834}
9835
Alexander Belopolsky40018472011-02-26 01:02:56 +00009836Py_ssize_t
9837PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009838 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009839 Py_ssize_t start,
9840 Py_ssize_t end,
9841 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009842{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009843 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009844 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009845
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009846 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847}
9848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849Py_ssize_t
9850PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9851 Py_ssize_t start, Py_ssize_t end,
9852 int direction)
9853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009855 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 if (PyUnicode_READY(str) == -1)
9857 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009858 len = PyUnicode_GET_LENGTH(str);
9859 ADJUST_INDICES(start, end, len);
9860 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009861 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009863 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9864 kind, end-start, ch, direction);
9865 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009867 else
9868 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869}
9870
Alexander Belopolsky40018472011-02-26 01:02:56 +00009871static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009872tailmatch(PyObject *self,
9873 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009874 Py_ssize_t start,
9875 Py_ssize_t end,
9876 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 int kind_self;
9879 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009880 const void *data_self;
9881 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009882 Py_ssize_t offset;
9883 Py_ssize_t i;
9884 Py_ssize_t end_sub;
9885
9886 if (PyUnicode_READY(self) == -1 ||
9887 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009888 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009890 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9891 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009893 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009895 if (PyUnicode_GET_LENGTH(substring) == 0)
9896 return 1;
9897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 kind_self = PyUnicode_KIND(self);
9899 data_self = PyUnicode_DATA(self);
9900 kind_sub = PyUnicode_KIND(substring);
9901 data_sub = PyUnicode_DATA(substring);
9902 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9903
9904 if (direction > 0)
9905 offset = end;
9906 else
9907 offset = start;
9908
9909 if (PyUnicode_READ(kind_self, data_self, offset) ==
9910 PyUnicode_READ(kind_sub, data_sub, 0) &&
9911 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9912 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9913 /* If both are of the same kind, memcmp is sufficient */
9914 if (kind_self == kind_sub) {
9915 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009916 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 data_sub,
9918 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009919 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009921 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922 else {
9923 /* We do not need to compare 0 and len(substring)-1 because
9924 the if statement above ensured already that they are equal
9925 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 for (i = 1; i < end_sub; ++i) {
9927 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9928 PyUnicode_READ(kind_sub, data_sub, i))
9929 return 0;
9930 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009931 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933 }
9934
9935 return 0;
9936}
9937
Alexander Belopolsky40018472011-02-26 01:02:56 +00009938Py_ssize_t
9939PyUnicode_Tailmatch(PyObject *str,
9940 PyObject *substr,
9941 Py_ssize_t start,
9942 Py_ssize_t end,
9943 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009944{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009945 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009946 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009947
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009948 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949}
9950
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009951static PyObject *
9952ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009954 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009955 const char *data = PyUnicode_DATA(self);
9956 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009957 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009958
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009959 res = PyUnicode_New(len, 127);
9960 if (res == NULL)
9961 return NULL;
9962 resdata = PyUnicode_DATA(res);
9963 if (lower)
9964 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009966 _Py_bytes_upper(resdata, data, len);
9967 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968}
9969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009971handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009972{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009973 Py_ssize_t j;
9974 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009975 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009976 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009977
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009978 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9979
9980 where ! is a negation and \p{xxx} is a character with property xxx.
9981 */
9982 for (j = i - 1; j >= 0; j--) {
9983 c = PyUnicode_READ(kind, data, j);
9984 if (!_PyUnicode_IsCaseIgnorable(c))
9985 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009987 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9988 if (final_sigma) {
9989 for (j = i + 1; j < length; j++) {
9990 c = PyUnicode_READ(kind, data, j);
9991 if (!_PyUnicode_IsCaseIgnorable(c))
9992 break;
9993 }
9994 final_sigma = j == length || !_PyUnicode_IsCased(c);
9995 }
9996 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997}
9998
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009999static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010000lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010001 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010003 /* Obscure special case. */
10004 if (c == 0x3A3) {
10005 mapped[0] = handle_capital_sigma(kind, data, length, i);
10006 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010008 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010009}
10010
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010011static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010012do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010013{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010014 Py_ssize_t i, k = 0;
10015 int n_res, j;
10016 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010017
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010018 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010019 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010020 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010021 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010022 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010024 for (i = 1; i < length; i++) {
10025 c = PyUnicode_READ(kind, data, i);
10026 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10027 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010028 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010029 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010030 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010031 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010032 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033}
10034
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010035static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010036do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010037 Py_ssize_t i, k = 0;
10038
10039 for (i = 0; i < length; i++) {
10040 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10041 int n_res, j;
10042 if (Py_UNICODE_ISUPPER(c)) {
10043 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10044 }
10045 else if (Py_UNICODE_ISLOWER(c)) {
10046 n_res = _PyUnicode_ToUpperFull(c, mapped);
10047 }
10048 else {
10049 n_res = 1;
10050 mapped[0] = c;
10051 }
10052 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010053 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010054 res[k++] = mapped[j];
10055 }
10056 }
10057 return k;
10058}
10059
10060static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010061do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010062 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010063{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010064 Py_ssize_t i, k = 0;
10065
10066 for (i = 0; i < length; i++) {
10067 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10068 int n_res, j;
10069 if (lower)
10070 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10071 else
10072 n_res = _PyUnicode_ToUpperFull(c, mapped);
10073 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010074 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010075 res[k++] = mapped[j];
10076 }
10077 }
10078 return k;
10079}
10080
10081static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010082do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010083{
10084 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10085}
10086
10087static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010088do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010089{
10090 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10091}
10092
Benjamin Petersone51757f2012-01-12 21:10:29 -050010093static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010094do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010095{
10096 Py_ssize_t i, k = 0;
10097
10098 for (i = 0; i < length; i++) {
10099 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10100 Py_UCS4 mapped[3];
10101 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10102 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010103 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010104 res[k++] = mapped[j];
10105 }
10106 }
10107 return k;
10108}
10109
10110static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010111do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010112{
10113 Py_ssize_t i, k = 0;
10114 int previous_is_cased;
10115
10116 previous_is_cased = 0;
10117 for (i = 0; i < length; i++) {
10118 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10119 Py_UCS4 mapped[3];
10120 int n_res, j;
10121
10122 if (previous_is_cased)
10123 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10124 else
10125 n_res = _PyUnicode_ToTitleFull(c, mapped);
10126
10127 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010128 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010129 res[k++] = mapped[j];
10130 }
10131
10132 previous_is_cased = _PyUnicode_IsCased(c);
10133 }
10134 return k;
10135}
10136
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010137static PyObject *
10138case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010139 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010140{
10141 PyObject *res = NULL;
10142 Py_ssize_t length, newlength = 0;
10143 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010144 const void *data;
10145 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010146 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10147
Benjamin Petersoneea48462012-01-16 14:28:50 -050010148 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010149
10150 kind = PyUnicode_KIND(self);
10151 data = PyUnicode_DATA(self);
10152 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010153 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010154 PyErr_SetString(PyExc_OverflowError, "string is too long");
10155 return NULL;
10156 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010157 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010158 if (tmp == NULL)
10159 return PyErr_NoMemory();
10160 newlength = perform(kind, data, length, tmp, &maxchar);
10161 res = PyUnicode_New(newlength, maxchar);
10162 if (res == NULL)
10163 goto leave;
10164 tmpend = tmp + newlength;
10165 outdata = PyUnicode_DATA(res);
10166 outkind = PyUnicode_KIND(res);
10167 switch (outkind) {
10168 case PyUnicode_1BYTE_KIND:
10169 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10170 break;
10171 case PyUnicode_2BYTE_KIND:
10172 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10173 break;
10174 case PyUnicode_4BYTE_KIND:
10175 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10176 break;
10177 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010178 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010179 }
10180 leave:
10181 PyMem_FREE(tmp);
10182 return res;
10183}
10184
Tim Peters8ce9f162004-08-27 01:49:32 +000010185PyObject *
10186PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010188 PyObject *res;
10189 PyObject *fseq;
10190 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010191 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010193 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010194 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010195 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010196 }
10197
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010198 /* NOTE: the following code can't call back into Python code,
10199 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010200 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010201
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010202 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010203 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010204 res = _PyUnicode_JoinArray(separator, items, seqlen);
10205 Py_DECREF(fseq);
10206 return res;
10207}
10208
10209PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010210_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010211{
10212 PyObject *res = NULL; /* the result */
10213 PyObject *sep = NULL;
10214 Py_ssize_t seplen;
10215 PyObject *item;
10216 Py_ssize_t sz, i, res_offset;
10217 Py_UCS4 maxchar;
10218 Py_UCS4 item_maxchar;
10219 int use_memcpy;
10220 unsigned char *res_data = NULL, *sep_data = NULL;
10221 PyObject *last_obj;
10222 unsigned int kind = 0;
10223
Tim Peters05eba1f2004-08-27 21:32:02 +000010224 /* If empty sequence, return u"". */
10225 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010226 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010227 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010228
Tim Peters05eba1f2004-08-27 21:32:02 +000010229 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010230 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010231 if (seqlen == 1) {
10232 if (PyUnicode_CheckExact(items[0])) {
10233 res = items[0];
10234 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010235 return res;
10236 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010237 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010238 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010239 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010240 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010241 /* Set up sep and seplen */
10242 if (separator == NULL) {
10243 /* fall back to a blank space separator */
10244 sep = PyUnicode_FromOrdinal(' ');
10245 if (!sep)
10246 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010247 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010248 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010249 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010250 else {
10251 if (!PyUnicode_Check(separator)) {
10252 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010253 "separator: expected str instance,"
10254 " %.80s found",
10255 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010256 goto onError;
10257 }
10258 if (PyUnicode_READY(separator))
10259 goto onError;
10260 sep = separator;
10261 seplen = PyUnicode_GET_LENGTH(separator);
10262 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10263 /* inc refcount to keep this code path symmetric with the
10264 above case of a blank separator */
10265 Py_INCREF(sep);
10266 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010267 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010268 }
10269
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010270 /* There are at least two things to join, or else we have a subclass
10271 * of str in the sequence.
10272 * Do a pre-pass to figure out the total amount of space we'll
10273 * need (sz), and see whether all argument are strings.
10274 */
10275 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010276#ifdef Py_DEBUG
10277 use_memcpy = 0;
10278#else
10279 use_memcpy = 1;
10280#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010281 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010282 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010283 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010284 if (!PyUnicode_Check(item)) {
10285 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010286 "sequence item %zd: expected str instance,"
10287 " %.80s found",
10288 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010289 goto onError;
10290 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 if (PyUnicode_READY(item) == -1)
10292 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010293 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010295 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010296 if (i != 0) {
10297 add_sz += seplen;
10298 }
10299 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010300 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010301 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010302 goto onError;
10303 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010304 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010305 if (use_memcpy && last_obj != NULL) {
10306 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10307 use_memcpy = 0;
10308 }
10309 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010310 }
Tim Petersced69f82003-09-16 20:30:58 +000010311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010313 if (res == NULL)
10314 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010315
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010316 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010317#ifdef Py_DEBUG
10318 use_memcpy = 0;
10319#else
10320 if (use_memcpy) {
10321 res_data = PyUnicode_1BYTE_DATA(res);
10322 kind = PyUnicode_KIND(res);
10323 if (seplen != 0)
10324 sep_data = PyUnicode_1BYTE_DATA(sep);
10325 }
10326#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010327 if (use_memcpy) {
10328 for (i = 0; i < seqlen; ++i) {
10329 Py_ssize_t itemlen;
10330 item = items[i];
10331
10332 /* Copy item, and maybe the separator. */
10333 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010334 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010335 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010336 kind * seplen);
10337 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010338 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010339
10340 itemlen = PyUnicode_GET_LENGTH(item);
10341 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010342 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010343 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010344 kind * itemlen);
10345 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010346 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010347 }
10348 assert(res_data == PyUnicode_1BYTE_DATA(res)
10349 + kind * PyUnicode_GET_LENGTH(res));
10350 }
10351 else {
10352 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10353 Py_ssize_t itemlen;
10354 item = items[i];
10355
10356 /* Copy item, and maybe the separator. */
10357 if (i && seplen != 0) {
10358 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10359 res_offset += seplen;
10360 }
10361
10362 itemlen = PyUnicode_GET_LENGTH(item);
10363 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010364 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010365 res_offset += itemlen;
10366 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010367 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010368 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010369 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010372 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374
Benjamin Peterson29060642009-01-31 22:14:21 +000010375 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010377 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378 return NULL;
10379}
10380
Victor Stinnerd3f08822012-05-29 12:57:52 +020010381void
10382_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10383 Py_UCS4 fill_char)
10384{
10385 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010386 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010387 assert(PyUnicode_IS_READY(unicode));
10388 assert(unicode_modifiable(unicode));
10389 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10390 assert(start >= 0);
10391 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010392 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010393}
10394
Victor Stinner3fe55312012-01-04 00:33:50 +010010395Py_ssize_t
10396PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10397 Py_UCS4 fill_char)
10398{
10399 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010400
10401 if (!PyUnicode_Check(unicode)) {
10402 PyErr_BadInternalCall();
10403 return -1;
10404 }
10405 if (PyUnicode_READY(unicode) == -1)
10406 return -1;
10407 if (unicode_check_modifiable(unicode))
10408 return -1;
10409
Victor Stinnerd3f08822012-05-29 12:57:52 +020010410 if (start < 0) {
10411 PyErr_SetString(PyExc_IndexError, "string index out of range");
10412 return -1;
10413 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010414 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10415 PyErr_SetString(PyExc_ValueError,
10416 "fill character is bigger than "
10417 "the string maximum character");
10418 return -1;
10419 }
10420
10421 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10422 length = Py_MIN(maxlen, length);
10423 if (length <= 0)
10424 return 0;
10425
Victor Stinnerd3f08822012-05-29 12:57:52 +020010426 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010427 return length;
10428}
10429
Victor Stinner9310abb2011-10-05 00:59:23 +020010430static PyObject *
10431pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010432 Py_ssize_t left,
10433 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 PyObject *u;
10437 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010438 int kind;
10439 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440
10441 if (left < 0)
10442 left = 0;
10443 if (right < 0)
10444 right = 0;
10445
Victor Stinnerc4b49542011-12-11 22:44:26 +010010446 if (left == 0 && right == 0)
10447 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10450 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010451 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10452 return NULL;
10453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010455 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010457 if (!u)
10458 return NULL;
10459
10460 kind = PyUnicode_KIND(u);
10461 data = PyUnicode_DATA(u);
10462 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010463 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010464 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010465 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010466 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010467 assert(_PyUnicode_CheckConsistency(u, 1));
10468 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010469}
10470
Alexander Belopolsky40018472011-02-26 01:02:56 +000010471PyObject *
10472PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010474 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010476 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010478
Benjamin Petersonead6b532011-12-20 17:23:42 -060010479 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010481 if (PyUnicode_IS_ASCII(string))
10482 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010483 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010484 PyUnicode_GET_LENGTH(string), keepends);
10485 else
10486 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010487 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010488 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 break;
10490 case PyUnicode_2BYTE_KIND:
10491 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010492 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 PyUnicode_GET_LENGTH(string), keepends);
10494 break;
10495 case PyUnicode_4BYTE_KIND:
10496 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010497 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 PyUnicode_GET_LENGTH(string), keepends);
10499 break;
10500 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010501 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010504}
10505
Alexander Belopolsky40018472011-02-26 01:02:56 +000010506static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010507split(PyObject *self,
10508 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010509 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010510{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010511 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010512 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 Py_ssize_t len1, len2;
10514 PyObject* out;
10515
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010517 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 if (PyUnicode_READY(self) == -1)
10520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010523 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010525 if (PyUnicode_IS_ASCII(self))
10526 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010527 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010528 PyUnicode_GET_LENGTH(self), maxcount
10529 );
10530 else
10531 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010532 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010533 PyUnicode_GET_LENGTH(self), maxcount
10534 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 case PyUnicode_2BYTE_KIND:
10536 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010537 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 PyUnicode_GET_LENGTH(self), maxcount
10539 );
10540 case PyUnicode_4BYTE_KIND:
10541 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010542 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 PyUnicode_GET_LENGTH(self), maxcount
10544 );
10545 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010546 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 }
10548
10549 if (PyUnicode_READY(substring) == -1)
10550 return NULL;
10551
10552 kind1 = PyUnicode_KIND(self);
10553 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 len1 = PyUnicode_GET_LENGTH(self);
10555 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010556 if (kind1 < kind2 || len1 < len2) {
10557 out = PyList_New(1);
10558 if (out == NULL)
10559 return NULL;
10560 Py_INCREF(self);
10561 PyList_SET_ITEM(out, 0, self);
10562 return out;
10563 }
10564 buf1 = PyUnicode_DATA(self);
10565 buf2 = PyUnicode_DATA(substring);
10566 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010567 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010568 if (!buf2)
10569 return NULL;
10570 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010572 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010574 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10575 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010576 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010577 else
10578 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010579 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 break;
10581 case PyUnicode_2BYTE_KIND:
10582 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010583 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 break;
10585 case PyUnicode_4BYTE_KIND:
10586 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010587 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 break;
10589 default:
10590 out = NULL;
10591 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010592 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010593 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010594 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596}
10597
Alexander Belopolsky40018472011-02-26 01:02:56 +000010598static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010599rsplit(PyObject *self,
10600 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010601 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010602{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010603 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010604 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 Py_ssize_t len1, len2;
10606 PyObject* out;
10607
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010608 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010609 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 if (PyUnicode_READY(self) == -1)
10612 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010615 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010617 if (PyUnicode_IS_ASCII(self))
10618 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010619 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010620 PyUnicode_GET_LENGTH(self), maxcount
10621 );
10622 else
10623 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010624 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010625 PyUnicode_GET_LENGTH(self), maxcount
10626 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 case PyUnicode_2BYTE_KIND:
10628 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010629 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 PyUnicode_GET_LENGTH(self), maxcount
10631 );
10632 case PyUnicode_4BYTE_KIND:
10633 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010634 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 PyUnicode_GET_LENGTH(self), maxcount
10636 );
10637 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010638 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 }
10640
10641 if (PyUnicode_READY(substring) == -1)
10642 return NULL;
10643
10644 kind1 = PyUnicode_KIND(self);
10645 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 len1 = PyUnicode_GET_LENGTH(self);
10647 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010648 if (kind1 < kind2 || len1 < len2) {
10649 out = PyList_New(1);
10650 if (out == NULL)
10651 return NULL;
10652 Py_INCREF(self);
10653 PyList_SET_ITEM(out, 0, self);
10654 return out;
10655 }
10656 buf1 = PyUnicode_DATA(self);
10657 buf2 = PyUnicode_DATA(substring);
10658 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010659 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010660 if (!buf2)
10661 return NULL;
10662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010664 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010666 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10667 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010668 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010669 else
10670 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010671 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 break;
10673 case PyUnicode_2BYTE_KIND:
10674 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010675 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 break;
10677 case PyUnicode_4BYTE_KIND:
10678 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010679 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 break;
10681 default:
10682 out = NULL;
10683 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010684 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010685 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010686 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 return out;
10688}
10689
10690static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010691anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10692 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010694 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010696 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10697 return asciilib_find(buf1, len1, buf2, len2, offset);
10698 else
10699 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 case PyUnicode_2BYTE_KIND:
10701 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10702 case PyUnicode_4BYTE_KIND:
10703 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10704 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010705 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010706}
10707
10708static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010709anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10710 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010711{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010712 switch (kind) {
10713 case PyUnicode_1BYTE_KIND:
10714 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10715 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10716 else
10717 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10718 case PyUnicode_2BYTE_KIND:
10719 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10720 case PyUnicode_4BYTE_KIND:
10721 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10722 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010723 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010724}
10725
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010726static void
10727replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10728 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10729{
10730 int kind = PyUnicode_KIND(u);
10731 void *data = PyUnicode_DATA(u);
10732 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10733 if (kind == PyUnicode_1BYTE_KIND) {
10734 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10735 (Py_UCS1 *)data + len,
10736 u1, u2, maxcount);
10737 }
10738 else if (kind == PyUnicode_2BYTE_KIND) {
10739 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10740 (Py_UCS2 *)data + len,
10741 u1, u2, maxcount);
10742 }
10743 else {
10744 assert(kind == PyUnicode_4BYTE_KIND);
10745 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10746 (Py_UCS4 *)data + len,
10747 u1, u2, maxcount);
10748 }
10749}
10750
Alexander Belopolsky40018472011-02-26 01:02:56 +000010751static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752replace(PyObject *self, PyObject *str1,
10753 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010756 const char *sbuf = PyUnicode_DATA(self);
10757 const void *buf1 = PyUnicode_DATA(str1);
10758 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 int srelease = 0, release1 = 0, release2 = 0;
10760 int skind = PyUnicode_KIND(self);
10761 int kind1 = PyUnicode_KIND(str1);
10762 int kind2 = PyUnicode_KIND(str2);
10763 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10764 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10765 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010766 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010767 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010769 if (slen < len1)
10770 goto nothing;
10771
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010773 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010774 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010775 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776
Victor Stinner59de0ee2011-10-07 10:01:28 +020010777 if (str1 == str2)
10778 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779
Victor Stinner49a0a212011-10-12 23:46:10 +020010780 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010781 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10782 if (maxchar < maxchar_str1)
10783 /* substring too wide to be present */
10784 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010785 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10786 /* Replacing str1 with str2 may cause a maxchar reduction in the
10787 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010788 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010789 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010792 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010794 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010796 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010797 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010798 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010799
Victor Stinner69ed0f42013-04-09 21:48:24 +020010800 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010801 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010802 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010803 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010804 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010806 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010808
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010809 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10810 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010811 }
10812 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 int rkind = skind;
10814 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010815 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 if (kind1 < rkind) {
10818 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010819 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 if (!buf1) goto error;
10821 release1 = 1;
10822 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010823 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010824 if (i < 0)
10825 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 if (rkind > kind2) {
10827 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010828 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 if (!buf2) goto error;
10830 release2 = 1;
10831 }
10832 else if (rkind < kind2) {
10833 /* widen self and buf1 */
10834 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010835 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010836 assert(buf1 != PyUnicode_DATA(str1));
10837 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010838 buf1 = PyUnicode_DATA(str1);
10839 release1 = 0;
10840 }
10841 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 if (!sbuf) goto error;
10843 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010844 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845 if (!buf1) goto error;
10846 release1 = 1;
10847 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010848 u = PyUnicode_New(slen, maxchar);
10849 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010851 assert(PyUnicode_KIND(u) == rkind);
10852 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010853
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010854 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010855 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010856 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010858 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010860
10861 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010862 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010863 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010864 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010865 if (i == -1)
10866 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010867 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010869 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010871 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010873 }
10874 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010876 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 int rkind = skind;
10878 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010881 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010882 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 if (!buf1) goto error;
10884 release1 = 1;
10885 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010886 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010887 if (n == 0)
10888 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010889 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010890 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010891 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 if (!buf2) goto error;
10893 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010896 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010897 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010898 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010899 if (!sbuf) goto error;
10900 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010901 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010902 assert(buf1 != PyUnicode_DATA(str1));
10903 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010904 buf1 = PyUnicode_DATA(str1);
10905 release1 = 0;
10906 }
10907 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 if (!buf1) goto error;
10909 release1 = 1;
10910 }
10911 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10912 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010913 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 PyErr_SetString(PyExc_OverflowError,
10915 "replace string is too long");
10916 goto error;
10917 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010918 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010919 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020010920 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020010921 goto done;
10922 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010923 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010924 PyErr_SetString(PyExc_OverflowError,
10925 "replace string is too long");
10926 goto error;
10927 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010928 u = PyUnicode_New(new_size, maxchar);
10929 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010931 assert(PyUnicode_KIND(u) == rkind);
10932 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 ires = i = 0;
10934 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010935 while (n-- > 0) {
10936 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010937 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010938 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010939 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010940 if (j == -1)
10941 break;
10942 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010943 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010944 memcpy(res + rkind * ires,
10945 sbuf + rkind * i,
10946 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010948 }
10949 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010951 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010953 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010959 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010960 memcpy(res + rkind * ires,
10961 sbuf + rkind * i,
10962 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010963 }
10964 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010965 /* interleave */
10966 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010967 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010969 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010971 if (--n <= 0)
10972 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010973 memcpy(res + rkind * ires,
10974 sbuf + rkind * i,
10975 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 ires++;
10977 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010978 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010979 memcpy(res + rkind * ires,
10980 sbuf + rkind * i,
10981 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010982 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010983 }
10984
10985 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010986 unicode_adjust_maxchar(&u);
10987 if (u == NULL)
10988 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010990
10991 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010992 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10993 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10994 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010996 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010998 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011000 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011001 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011003
Benjamin Peterson29060642009-01-31 22:14:21 +000011004 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011005 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011006 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11007 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11008 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011010 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011012 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011014 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011015 return unicode_result_unchanged(self);
11016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011018 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11019 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11020 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11021 if (srelease)
11022 PyMem_FREE((void *)sbuf);
11023 if (release1)
11024 PyMem_FREE((void *)buf1);
11025 if (release2)
11026 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028}
11029
11030/* --- Unicode Object Methods --------------------------------------------- */
11031
INADA Naoki3ae20562017-01-16 20:41:20 +090011032/*[clinic input]
11033str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034
INADA Naoki3ae20562017-01-16 20:41:20 +090011035Return a version of the string where each word is titlecased.
11036
11037More specifically, words start with uppercased characters and all remaining
11038cased characters have lower case.
11039[clinic start generated code]*/
11040
11041static PyObject *
11042unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011043/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011045 if (PyUnicode_READY(self) == -1)
11046 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011047 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048}
11049
INADA Naoki3ae20562017-01-16 20:41:20 +090011050/*[clinic input]
11051str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052
INADA Naoki3ae20562017-01-16 20:41:20 +090011053Return a capitalized version of the string.
11054
11055More specifically, make the first character have upper case and the rest lower
11056case.
11057[clinic start generated code]*/
11058
11059static PyObject *
11060unicode_capitalize_impl(PyObject *self)
11061/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011063 if (PyUnicode_READY(self) == -1)
11064 return NULL;
11065 if (PyUnicode_GET_LENGTH(self) == 0)
11066 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011067 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068}
11069
INADA Naoki3ae20562017-01-16 20:41:20 +090011070/*[clinic input]
11071str.casefold as unicode_casefold
11072
11073Return a version of the string suitable for caseless comparisons.
11074[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011075
11076static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011077unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011078/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011079{
11080 if (PyUnicode_READY(self) == -1)
11081 return NULL;
11082 if (PyUnicode_IS_ASCII(self))
11083 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011084 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011085}
11086
11087
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011088/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011089
11090static int
11091convert_uc(PyObject *obj, void *addr)
11092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011094
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011095 if (!PyUnicode_Check(obj)) {
11096 PyErr_Format(PyExc_TypeError,
11097 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011098 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011099 return 0;
11100 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011101 if (PyUnicode_READY(obj) < 0)
11102 return 0;
11103 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011104 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011105 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011106 return 0;
11107 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011108 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011109 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011110}
11111
INADA Naoki3ae20562017-01-16 20:41:20 +090011112/*[clinic input]
11113str.center as unicode_center
11114
11115 width: Py_ssize_t
11116 fillchar: Py_UCS4 = ' '
11117 /
11118
11119Return a centered string of length width.
11120
11121Padding is done using the specified fill character (default is a space).
11122[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123
11124static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011125unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11126/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011128 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129
Benjamin Petersonbac79492012-01-14 13:34:47 -050011130 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131 return NULL;
11132
Victor Stinnerc4b49542011-12-11 22:44:26 +010011133 if (PyUnicode_GET_LENGTH(self) >= width)
11134 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135
Victor Stinnerc4b49542011-12-11 22:44:26 +010011136 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137 left = marg / 2 + (marg & width & 1);
11138
Victor Stinner9310abb2011-10-05 00:59:23 +020011139 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140}
11141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142/* This function assumes that str1 and str2 are readied by the caller. */
11143
Marc-André Lemburge5034372000-08-08 08:04:29 +000011144static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011145unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011146{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011147#define COMPARE(TYPE1, TYPE2) \
11148 do { \
11149 TYPE1* p1 = (TYPE1 *)data1; \
11150 TYPE2* p2 = (TYPE2 *)data2; \
11151 TYPE1* end = p1 + len; \
11152 Py_UCS4 c1, c2; \
11153 for (; p1 != end; p1++, p2++) { \
11154 c1 = *p1; \
11155 c2 = *p2; \
11156 if (c1 != c2) \
11157 return (c1 < c2) ? -1 : 1; \
11158 } \
11159 } \
11160 while (0)
11161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011163 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011164 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 kind1 = PyUnicode_KIND(str1);
11167 kind2 = PyUnicode_KIND(str2);
11168 data1 = PyUnicode_DATA(str1);
11169 data2 = PyUnicode_DATA(str2);
11170 len1 = PyUnicode_GET_LENGTH(str1);
11171 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011172 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011173
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011174 switch(kind1) {
11175 case PyUnicode_1BYTE_KIND:
11176 {
11177 switch(kind2) {
11178 case PyUnicode_1BYTE_KIND:
11179 {
11180 int cmp = memcmp(data1, data2, len);
11181 /* normalize result of memcmp() into the range [-1; 1] */
11182 if (cmp < 0)
11183 return -1;
11184 if (cmp > 0)
11185 return 1;
11186 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011187 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011188 case PyUnicode_2BYTE_KIND:
11189 COMPARE(Py_UCS1, Py_UCS2);
11190 break;
11191 case PyUnicode_4BYTE_KIND:
11192 COMPARE(Py_UCS1, Py_UCS4);
11193 break;
11194 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011195 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011196 }
11197 break;
11198 }
11199 case PyUnicode_2BYTE_KIND:
11200 {
11201 switch(kind2) {
11202 case PyUnicode_1BYTE_KIND:
11203 COMPARE(Py_UCS2, Py_UCS1);
11204 break;
11205 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011206 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011207 COMPARE(Py_UCS2, Py_UCS2);
11208 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011209 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011210 case PyUnicode_4BYTE_KIND:
11211 COMPARE(Py_UCS2, Py_UCS4);
11212 break;
11213 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011214 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011215 }
11216 break;
11217 }
11218 case PyUnicode_4BYTE_KIND:
11219 {
11220 switch(kind2) {
11221 case PyUnicode_1BYTE_KIND:
11222 COMPARE(Py_UCS4, Py_UCS1);
11223 break;
11224 case PyUnicode_2BYTE_KIND:
11225 COMPARE(Py_UCS4, Py_UCS2);
11226 break;
11227 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011228 {
11229#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11230 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11231 /* normalize result of wmemcmp() into the range [-1; 1] */
11232 if (cmp < 0)
11233 return -1;
11234 if (cmp > 0)
11235 return 1;
11236#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011237 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011238#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011239 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011240 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011241 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011242 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011243 }
11244 break;
11245 }
11246 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011247 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011248 }
11249
Victor Stinner770e19e2012-10-04 22:59:45 +020011250 if (len1 == len2)
11251 return 0;
11252 if (len1 < len2)
11253 return -1;
11254 else
11255 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011256
11257#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011258}
11259
Benjamin Peterson621b4302016-09-09 13:54:34 -070011260static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011261unicode_compare_eq(PyObject *str1, PyObject *str2)
11262{
11263 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011264 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011265 Py_ssize_t len;
11266 int cmp;
11267
Victor Stinnere5567ad2012-10-23 02:48:49 +020011268 len = PyUnicode_GET_LENGTH(str1);
11269 if (PyUnicode_GET_LENGTH(str2) != len)
11270 return 0;
11271 kind = PyUnicode_KIND(str1);
11272 if (PyUnicode_KIND(str2) != kind)
11273 return 0;
11274 data1 = PyUnicode_DATA(str1);
11275 data2 = PyUnicode_DATA(str2);
11276
11277 cmp = memcmp(data1, data2, len * kind);
11278 return (cmp == 0);
11279}
11280
11281
Alexander Belopolsky40018472011-02-26 01:02:56 +000011282int
11283PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11286 if (PyUnicode_READY(left) == -1 ||
11287 PyUnicode_READY(right) == -1)
11288 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011289
11290 /* a string is equal to itself */
11291 if (left == right)
11292 return 0;
11293
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011294 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011296 PyErr_Format(PyExc_TypeError,
11297 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011298 Py_TYPE(left)->tp_name,
11299 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300 return -1;
11301}
11302
Martin v. Löwis5b222132007-06-10 09:51:05 +000011303int
11304PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 Py_ssize_t i;
11307 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011309 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011310
Victor Stinner910337b2011-10-03 03:20:16 +020011311 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011312 if (!PyUnicode_IS_READY(uni)) {
11313 const wchar_t *ws = _PyUnicode_WSTR(uni);
11314 /* Compare Unicode string and source character set string */
11315 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11316 if (chr != ustr[i])
11317 return (chr < ustr[i]) ? -1 : 1;
11318 }
11319 /* This check keeps Python strings that end in '\0' from comparing equal
11320 to C strings identical up to that point. */
11321 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11322 return 1; /* uni is longer */
11323 if (ustr[i])
11324 return -1; /* str is longer */
11325 return 0;
11326 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011328 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011329 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011330 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011331 size_t len, len2 = strlen(str);
11332 int cmp;
11333
11334 len = Py_MIN(len1, len2);
11335 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011336 if (cmp != 0) {
11337 if (cmp < 0)
11338 return -1;
11339 else
11340 return 1;
11341 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011342 if (len1 > len2)
11343 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011344 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011345 return -1; /* str is longer */
11346 return 0;
11347 }
11348 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011349 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011350 /* Compare Unicode string and source character set string */
11351 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011352 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011353 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11354 /* This check keeps Python strings that end in '\0' from comparing equal
11355 to C strings identical up to that point. */
11356 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11357 return 1; /* uni is longer */
11358 if (str[i])
11359 return -1; /* str is longer */
11360 return 0;
11361 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011362}
11363
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011364static int
11365non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11366{
11367 size_t i, len;
11368 const wchar_t *p;
11369 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11370 if (strlen(str) != len)
11371 return 0;
11372 p = _PyUnicode_WSTR(unicode);
11373 assert(p);
11374 for (i = 0; i < len; i++) {
11375 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011376 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011377 return 0;
11378 }
11379 return 1;
11380}
11381
11382int
11383_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11384{
11385 size_t len;
11386 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011387 assert(str);
11388#ifndef NDEBUG
11389 for (const char *p = str; *p; p++) {
11390 assert((unsigned char)*p < 128);
11391 }
11392#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011393 if (PyUnicode_READY(unicode) == -1) {
11394 /* Memory error or bad data */
11395 PyErr_Clear();
11396 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11397 }
11398 if (!PyUnicode_IS_ASCII(unicode))
11399 return 0;
11400 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11401 return strlen(str) == len &&
11402 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11403}
11404
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011405int
11406_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11407{
11408 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011409
11410 assert(_PyUnicode_CHECK(left));
11411 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011412#ifndef NDEBUG
11413 for (const char *p = right->string; *p; p++) {
11414 assert((unsigned char)*p < 128);
11415 }
11416#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011417
11418 if (PyUnicode_READY(left) == -1) {
11419 /* memory error or bad data */
11420 PyErr_Clear();
11421 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11422 }
11423
11424 if (!PyUnicode_IS_ASCII(left))
11425 return 0;
11426
11427 right_uni = _PyUnicode_FromId(right); /* borrowed */
11428 if (right_uni == NULL) {
11429 /* memory error or bad data */
11430 PyErr_Clear();
11431 return _PyUnicode_EqualToASCIIString(left, right->string);
11432 }
11433
11434 if (left == right_uni)
11435 return 1;
11436
11437 if (PyUnicode_CHECK_INTERNED(left))
11438 return 0;
11439
Victor Stinner607b1022020-05-05 18:50:30 +020011440#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011441 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011442 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011443 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11444 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011445#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011446
11447 return unicode_compare_eq(left, right_uni);
11448}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011449
Alexander Belopolsky40018472011-02-26 01:02:56 +000011450PyObject *
11451PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011452{
11453 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011454
Victor Stinnere5567ad2012-10-23 02:48:49 +020011455 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11456 Py_RETURN_NOTIMPLEMENTED;
11457
11458 if (PyUnicode_READY(left) == -1 ||
11459 PyUnicode_READY(right) == -1)
11460 return NULL;
11461
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011462 if (left == right) {
11463 switch (op) {
11464 case Py_EQ:
11465 case Py_LE:
11466 case Py_GE:
11467 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011468 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011469 case Py_NE:
11470 case Py_LT:
11471 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011472 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011473 default:
11474 PyErr_BadArgument();
11475 return NULL;
11476 }
11477 }
11478 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011479 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011480 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011481 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011482 }
11483 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011484 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011485 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011486 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011487}
11488
Alexander Belopolsky40018472011-02-26 01:02:56 +000011489int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011490_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11491{
11492 return unicode_eq(aa, bb);
11493}
11494
11495int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011496PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011497{
Victor Stinner77282cb2013-04-14 19:22:47 +020011498 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011499 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011501 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011502
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011503 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011504 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011505 "'in <string>' requires string as left operand, not %.100s",
11506 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011507 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011508 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011509 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011510 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011511 if (ensure_unicode(str) < 0)
11512 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011515 kind2 = PyUnicode_KIND(substr);
11516 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011517 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011519 len2 = PyUnicode_GET_LENGTH(substr);
11520 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011521 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011522 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011523 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011524 if (len2 == 1) {
11525 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11526 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011527 return result;
11528 }
11529 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011530 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011531 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011532 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011533 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534
Victor Stinner77282cb2013-04-14 19:22:47 +020011535 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 case PyUnicode_1BYTE_KIND:
11537 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11538 break;
11539 case PyUnicode_2BYTE_KIND:
11540 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11541 break;
11542 case PyUnicode_4BYTE_KIND:
11543 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11544 break;
11545 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011546 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011548
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011549 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011550 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011551 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552
Guido van Rossum403d68b2000-03-13 15:55:09 +000011553 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011554}
11555
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556/* Concat to string or Unicode object giving a new Unicode object. */
11557
Alexander Belopolsky40018472011-02-26 01:02:56 +000011558PyObject *
11559PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011561 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011562 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011563 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011565 if (ensure_unicode(left) < 0)
11566 return NULL;
11567
11568 if (!PyUnicode_Check(right)) {
11569 PyErr_Format(PyExc_TypeError,
11570 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011571 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011572 return NULL;
11573 }
11574 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576
11577 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011578 PyObject *empty = unicode_get_empty(); // Borrowed reference
11579 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011580 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011581 }
11582 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011583 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011586 left_len = PyUnicode_GET_LENGTH(left);
11587 right_len = PyUnicode_GET_LENGTH(right);
11588 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011589 PyErr_SetString(PyExc_OverflowError,
11590 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011591 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011592 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011593 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011594
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011595 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11596 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011597 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011600 result = PyUnicode_New(new_len, maxchar);
11601 if (result == NULL)
11602 return NULL;
11603 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11604 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11605 assert(_PyUnicode_CheckConsistency(result, 1));
11606 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607}
11608
Walter Dörwald1ab83302007-05-18 17:15:44 +000011609void
Victor Stinner23e56682011-10-03 03:54:37 +020011610PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011611{
Victor Stinner23e56682011-10-03 03:54:37 +020011612 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011613 Py_UCS4 maxchar, maxchar2;
11614 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011615
11616 if (p_left == NULL) {
11617 if (!PyErr_Occurred())
11618 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011619 return;
11620 }
Victor Stinner23e56682011-10-03 03:54:37 +020011621 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011622 if (right == NULL || left == NULL
11623 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011624 if (!PyErr_Occurred())
11625 PyErr_BadInternalCall();
11626 goto error;
11627 }
11628
Benjamin Petersonbac79492012-01-14 13:34:47 -050011629 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011630 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011631 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011632 goto error;
11633
Victor Stinner488fa492011-12-12 00:01:39 +010011634 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011635 PyObject *empty = unicode_get_empty(); // Borrowed reference
11636 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011637 Py_DECREF(left);
11638 Py_INCREF(right);
11639 *p_left = right;
11640 return;
11641 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011642 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011643 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011644 }
Victor Stinner488fa492011-12-12 00:01:39 +010011645
11646 left_len = PyUnicode_GET_LENGTH(left);
11647 right_len = PyUnicode_GET_LENGTH(right);
11648 if (left_len > PY_SSIZE_T_MAX - right_len) {
11649 PyErr_SetString(PyExc_OverflowError,
11650 "strings are too large to concat");
11651 goto error;
11652 }
11653 new_len = left_len + right_len;
11654
11655 if (unicode_modifiable(left)
11656 && PyUnicode_CheckExact(right)
11657 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011658 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11659 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011660 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011661 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011662 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11663 {
11664 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011665 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011666 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011667
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011668 /* copy 'right' into the newly allocated area of 'left' */
11669 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011670 }
Victor Stinner488fa492011-12-12 00:01:39 +010011671 else {
11672 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11673 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011674 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011675
Victor Stinner488fa492011-12-12 00:01:39 +010011676 /* Concat the two Unicode strings */
11677 res = PyUnicode_New(new_len, maxchar);
11678 if (res == NULL)
11679 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011680 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11681 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011682 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011683 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011684 }
11685 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011686 return;
11687
11688error:
Victor Stinner488fa492011-12-12 00:01:39 +010011689 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011690}
11691
11692void
11693PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11694{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011695 PyUnicode_Append(pleft, right);
11696 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011697}
11698
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011699/*
11700Wraps stringlib_parse_args_finds() and additionally ensures that the
11701first argument is a unicode object.
11702*/
11703
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011704static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011705parse_args_finds_unicode(const char * function_name, PyObject *args,
11706 PyObject **substring,
11707 Py_ssize_t *start, Py_ssize_t *end)
11708{
11709 if(stringlib_parse_args_finds(function_name, args, substring,
11710 start, end)) {
11711 if (ensure_unicode(*substring) < 0)
11712 return 0;
11713 return 1;
11714 }
11715 return 0;
11716}
11717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011718PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011721Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011722string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011723interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724
11725static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011726unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011728 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011729 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011730 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011732 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011733 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011736 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011737 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 kind1 = PyUnicode_KIND(self);
11740 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011741 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011742 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 len1 = PyUnicode_GET_LENGTH(self);
11745 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011747 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011748 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011749
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011750 buf1 = PyUnicode_DATA(self);
11751 buf2 = PyUnicode_DATA(substring);
11752 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011753 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011754 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011755 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011756 }
11757 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 case PyUnicode_1BYTE_KIND:
11759 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011760 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 buf2, len2, PY_SSIZE_T_MAX
11762 );
11763 break;
11764 case PyUnicode_2BYTE_KIND:
11765 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011766 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 buf2, len2, PY_SSIZE_T_MAX
11768 );
11769 break;
11770 case PyUnicode_4BYTE_KIND:
11771 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011772 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 buf2, len2, PY_SSIZE_T_MAX
11774 );
11775 break;
11776 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011777 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 }
11779
11780 result = PyLong_FromSsize_t(iresult);
11781
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011782 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011783 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011784 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786 return result;
11787}
11788
INADA Naoki3ae20562017-01-16 20:41:20 +090011789/*[clinic input]
11790str.encode as unicode_encode
11791
11792 encoding: str(c_default="NULL") = 'utf-8'
11793 The encoding in which to encode the string.
11794 errors: str(c_default="NULL") = 'strict'
11795 The error handling scheme to use for encoding errors.
11796 The default is 'strict' meaning that encoding errors raise a
11797 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11798 'xmlcharrefreplace' as well as any other name registered with
11799 codecs.register_error that can handle UnicodeEncodeErrors.
11800
11801Encode the string using the codec registered for encoding.
11802[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
11804static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011805unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011806/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011808 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011809}
11810
INADA Naoki3ae20562017-01-16 20:41:20 +090011811/*[clinic input]
11812str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813
INADA Naoki3ae20562017-01-16 20:41:20 +090011814 tabsize: int = 8
11815
11816Return a copy where all tab characters are expanded using spaces.
11817
11818If tabsize is not given, a tab size of 8 characters is assumed.
11819[clinic start generated code]*/
11820
11821static PyObject *
11822unicode_expandtabs_impl(PyObject *self, int tabsize)
11823/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011825 Py_ssize_t i, j, line_pos, src_len, incr;
11826 Py_UCS4 ch;
11827 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011828 const void *src_data;
11829 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011830 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011831 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832
Antoine Pitrou22425222011-10-04 19:10:51 +020011833 if (PyUnicode_READY(self) == -1)
11834 return NULL;
11835
Thomas Wouters7e474022000-07-16 12:04:32 +000011836 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011837 src_len = PyUnicode_GET_LENGTH(self);
11838 i = j = line_pos = 0;
11839 kind = PyUnicode_KIND(self);
11840 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011841 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011842 for (; i < src_len; i++) {
11843 ch = PyUnicode_READ(kind, src_data, i);
11844 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011845 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011846 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011847 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011849 goto overflow;
11850 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011851 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011852 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011855 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011856 goto overflow;
11857 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011859 if (ch == '\n' || ch == '\r')
11860 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011862 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011863 if (!found)
11864 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011865
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011867 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868 if (!u)
11869 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011870 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871
Antoine Pitroue71d5742011-10-04 15:55:09 +020011872 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873
Antoine Pitroue71d5742011-10-04 15:55:09 +020011874 for (; i < src_len; i++) {
11875 ch = PyUnicode_READ(kind, src_data, i);
11876 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011878 incr = tabsize - (line_pos % tabsize);
11879 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011880 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011881 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011882 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011883 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011884 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011885 line_pos++;
11886 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011887 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011888 if (ch == '\n' || ch == '\r')
11889 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011891 }
11892 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011893 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011894
Antoine Pitroue71d5742011-10-04 15:55:09 +020011895 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011896 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898}
11899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011900PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011901 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902\n\
11903Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011904such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905arguments start and end are interpreted as in slice notation.\n\
11906\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011907Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908
11909static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011912 /* initialize variables to prevent gcc warning */
11913 PyObject *substring = NULL;
11914 Py_ssize_t start = 0;
11915 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011916 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011918 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011921 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011924 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 if (result == -2)
11927 return NULL;
11928
Christian Heimes217cfd12007-12-02 14:31:20 +000011929 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930}
11931
11932static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011933unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011935 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011936 enum PyUnicode_Kind kind;
11937 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011938
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011939 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011940 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011942 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011943 if (PyUnicode_READY(self) == -1) {
11944 return NULL;
11945 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011946 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11947 PyErr_SetString(PyExc_IndexError, "string index out of range");
11948 return NULL;
11949 }
11950 kind = PyUnicode_KIND(self);
11951 data = PyUnicode_DATA(self);
11952 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011953 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954}
11955
Guido van Rossumc2504932007-09-18 19:42:40 +000011956/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011957 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011958static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011959unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011961 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011962
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011963#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011964 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011965#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 if (_PyUnicode_HASH(self) != -1)
11967 return _PyUnicode_HASH(self);
11968 if (PyUnicode_READY(self) == -1)
11969 return -1;
animalizea1d14252019-01-02 20:16:06 +080011970
Christian Heimes985ecdc2013-11-20 11:46:18 +010011971 x = _Py_HashBytes(PyUnicode_DATA(self),
11972 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011974 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975}
11976
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011977PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011978 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979\n\
oldkaa0735f2018-02-02 16:52:55 +080011980Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011981such that sub is contained within S[start:end]. Optional\n\
11982arguments start and end are interpreted as in slice notation.\n\
11983\n\
11984Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985
11986static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011989 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011990 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011991 PyObject *substring = NULL;
11992 Py_ssize_t start = 0;
11993 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011995 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011998 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012001 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 if (result == -2)
12004 return NULL;
12005
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006 if (result < 0) {
12007 PyErr_SetString(PyExc_ValueError, "substring not found");
12008 return NULL;
12009 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012010
Christian Heimes217cfd12007-12-02 14:31:20 +000012011 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012}
12013
INADA Naoki3ae20562017-01-16 20:41:20 +090012014/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012015str.isascii as unicode_isascii
12016
12017Return True if all characters in the string are ASCII, False otherwise.
12018
12019ASCII characters have code points in the range U+0000-U+007F.
12020Empty string is ASCII too.
12021[clinic start generated code]*/
12022
12023static PyObject *
12024unicode_isascii_impl(PyObject *self)
12025/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12026{
12027 if (PyUnicode_READY(self) == -1) {
12028 return NULL;
12029 }
12030 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12031}
12032
12033/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012034str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035
INADA Naoki3ae20562017-01-16 20:41:20 +090012036Return True if the string is a lowercase string, False otherwise.
12037
12038A string is lowercase if all cased characters in the string are lowercase and
12039there is at least one cased character in the string.
12040[clinic start generated code]*/
12041
12042static PyObject *
12043unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012044/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 Py_ssize_t i, length;
12047 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012048 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049 int cased;
12050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 if (PyUnicode_READY(self) == -1)
12052 return NULL;
12053 length = PyUnicode_GET_LENGTH(self);
12054 kind = PyUnicode_KIND(self);
12055 data = PyUnicode_DATA(self);
12056
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 if (length == 1)
12059 return PyBool_FromLong(
12060 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012062 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012064 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012065
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 for (i = 0; i < length; i++) {
12068 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012069
Benjamin Peterson29060642009-01-31 22:14:21 +000012070 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012071 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012072 else if (!cased && Py_UNICODE_ISLOWER(ch))
12073 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012075 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076}
12077
INADA Naoki3ae20562017-01-16 20:41:20 +090012078/*[clinic input]
12079str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
INADA Naoki3ae20562017-01-16 20:41:20 +090012081Return True if the string is an uppercase string, False otherwise.
12082
12083A string is uppercase if all cased characters in the string are uppercase and
12084there is at least one cased character in the string.
12085[clinic start generated code]*/
12086
12087static PyObject *
12088unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012089/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 Py_ssize_t i, length;
12092 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012093 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094 int cased;
12095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 if (PyUnicode_READY(self) == -1)
12097 return NULL;
12098 length = PyUnicode_GET_LENGTH(self);
12099 kind = PyUnicode_KIND(self);
12100 data = PyUnicode_DATA(self);
12101
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 if (length == 1)
12104 return PyBool_FromLong(
12105 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012107 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012109 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012110
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 for (i = 0; i < length; i++) {
12113 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012114
Benjamin Peterson29060642009-01-31 22:14:21 +000012115 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012116 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012117 else if (!cased && Py_UNICODE_ISUPPER(ch))
12118 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012120 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121}
12122
INADA Naoki3ae20562017-01-16 20:41:20 +090012123/*[clinic input]
12124str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125
INADA Naoki3ae20562017-01-16 20:41:20 +090012126Return True if the string is a title-cased string, False otherwise.
12127
12128In a title-cased string, upper- and title-case characters may only
12129follow uncased characters and lowercase characters only cased ones.
12130[clinic start generated code]*/
12131
12132static PyObject *
12133unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012134/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 Py_ssize_t i, length;
12137 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012138 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139 int cased, previous_is_cased;
12140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 if (PyUnicode_READY(self) == -1)
12142 return NULL;
12143 length = PyUnicode_GET_LENGTH(self);
12144 kind = PyUnicode_KIND(self);
12145 data = PyUnicode_DATA(self);
12146
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 if (length == 1) {
12149 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12150 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12151 (Py_UNICODE_ISUPPER(ch) != 0));
12152 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012154 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012156 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012157
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158 cased = 0;
12159 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 for (i = 0; i < length; i++) {
12161 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012162
Benjamin Peterson29060642009-01-31 22:14:21 +000012163 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12164 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012165 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012166 previous_is_cased = 1;
12167 cased = 1;
12168 }
12169 else if (Py_UNICODE_ISLOWER(ch)) {
12170 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012171 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012172 previous_is_cased = 1;
12173 cased = 1;
12174 }
12175 else
12176 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012178 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179}
12180
INADA Naoki3ae20562017-01-16 20:41:20 +090012181/*[clinic input]
12182str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183
INADA Naoki3ae20562017-01-16 20:41:20 +090012184Return True if the string is a whitespace string, False otherwise.
12185
12186A string is whitespace if all characters in the string are whitespace and there
12187is at least one character in the string.
12188[clinic start generated code]*/
12189
12190static PyObject *
12191unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012192/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 Py_ssize_t i, length;
12195 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012196 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197
12198 if (PyUnicode_READY(self) == -1)
12199 return NULL;
12200 length = PyUnicode_GET_LENGTH(self);
12201 kind = PyUnicode_KIND(self);
12202 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 if (length == 1)
12206 return PyBool_FromLong(
12207 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012209 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012211 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 for (i = 0; i < length; i++) {
12214 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012215 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012216 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012218 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219}
12220
INADA Naoki3ae20562017-01-16 20:41:20 +090012221/*[clinic input]
12222str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012223
INADA Naoki3ae20562017-01-16 20:41:20 +090012224Return True if the string is an alphabetic string, False otherwise.
12225
12226A string is alphabetic if all characters in the string are alphabetic and there
12227is at least one character in the string.
12228[clinic start generated code]*/
12229
12230static PyObject *
12231unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012232/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012233{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 Py_ssize_t i, length;
12235 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012236 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237
12238 if (PyUnicode_READY(self) == -1)
12239 return NULL;
12240 length = PyUnicode_GET_LENGTH(self);
12241 kind = PyUnicode_KIND(self);
12242 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012243
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012244 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 if (length == 1)
12246 return PyBool_FromLong(
12247 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012248
12249 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012251 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 for (i = 0; i < length; i++) {
12254 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012255 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012256 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012257 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012258}
12259
INADA Naoki3ae20562017-01-16 20:41:20 +090012260/*[clinic input]
12261str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012262
INADA Naoki3ae20562017-01-16 20:41:20 +090012263Return True if the string is an alpha-numeric string, False otherwise.
12264
12265A string is alpha-numeric if all characters in the string are alpha-numeric and
12266there is at least one character in the string.
12267[clinic start generated code]*/
12268
12269static PyObject *
12270unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012271/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012272{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012274 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012275 Py_ssize_t len, i;
12276
12277 if (PyUnicode_READY(self) == -1)
12278 return NULL;
12279
12280 kind = PyUnicode_KIND(self);
12281 data = PyUnicode_DATA(self);
12282 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012283
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012284 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 if (len == 1) {
12286 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12287 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12288 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012289
12290 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012292 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 for (i = 0; i < len; i++) {
12295 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012296 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012297 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012298 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012299 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012300}
12301
INADA Naoki3ae20562017-01-16 20:41:20 +090012302/*[clinic input]
12303str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304
INADA Naoki3ae20562017-01-16 20:41:20 +090012305Return True if the string is a decimal string, False otherwise.
12306
12307A string is a decimal string if all characters in the string are decimal and
12308there is at least one character in the string.
12309[clinic start generated code]*/
12310
12311static PyObject *
12312unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012313/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012314{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 Py_ssize_t i, length;
12316 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012317 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318
12319 if (PyUnicode_READY(self) == -1)
12320 return NULL;
12321 length = PyUnicode_GET_LENGTH(self);
12322 kind = PyUnicode_KIND(self);
12323 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 if (length == 1)
12327 return PyBool_FromLong(
12328 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012330 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012332 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 for (i = 0; i < length; i++) {
12335 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012336 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012337 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012338 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339}
12340
INADA Naoki3ae20562017-01-16 20:41:20 +090012341/*[clinic input]
12342str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343
INADA Naoki3ae20562017-01-16 20:41:20 +090012344Return True if the string is a digit string, False otherwise.
12345
12346A string is a digit string if all characters in the string are digits and there
12347is at least one character in the string.
12348[clinic start generated code]*/
12349
12350static PyObject *
12351unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012352/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 Py_ssize_t i, length;
12355 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012356 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357
12358 if (PyUnicode_READY(self) == -1)
12359 return NULL;
12360 length = PyUnicode_GET_LENGTH(self);
12361 kind = PyUnicode_KIND(self);
12362 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 if (length == 1) {
12366 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12367 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12368 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012370 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012372 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374 for (i = 0; i < length; i++) {
12375 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012376 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012378 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379}
12380
INADA Naoki3ae20562017-01-16 20:41:20 +090012381/*[clinic input]
12382str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383
INADA Naoki3ae20562017-01-16 20:41:20 +090012384Return True if the string is a numeric string, False otherwise.
12385
12386A string is numeric if all characters in the string are numeric and there is at
12387least one character in the string.
12388[clinic start generated code]*/
12389
12390static PyObject *
12391unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012392/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394 Py_ssize_t i, length;
12395 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012396 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397
12398 if (PyUnicode_READY(self) == -1)
12399 return NULL;
12400 length = PyUnicode_GET_LENGTH(self);
12401 kind = PyUnicode_KIND(self);
12402 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012403
Guido van Rossumd57fd912000-03-10 22:53:23 +000012404 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 if (length == 1)
12406 return PyBool_FromLong(
12407 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012408
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012409 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012411 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 for (i = 0; i < length; i++) {
12414 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012415 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012417 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012418}
12419
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012420Py_ssize_t
12421_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012422{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012424 if (PyUnicode_READY(self) == -1)
12425 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012426
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012427 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012428 if (len == 0) {
12429 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012430 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 }
12432
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012433 int kind = PyUnicode_KIND(self);
12434 const void *data = PyUnicode_DATA(self);
12435 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012436 /* PEP 3131 says that the first character must be in
12437 XID_Start and subsequent characters in XID_Continue,
12438 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012439 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012440 letters, digits, underscore). However, given the current
12441 definition of XID_Start and XID_Continue, it is sufficient
12442 to check just for these, except that _ must be allowed
12443 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012444 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012445 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012446 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012447
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012448 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012449 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012450 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012451 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012452 }
12453 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012454 return i;
12455}
12456
12457int
12458PyUnicode_IsIdentifier(PyObject *self)
12459{
12460 if (PyUnicode_IS_READY(self)) {
12461 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12462 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12463 /* an empty string is not a valid identifier */
12464 return len && i == len;
12465 }
12466 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012467_Py_COMP_DIAG_PUSH
12468_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012469 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012470 if (len == 0) {
12471 /* an empty string is not a valid identifier */
12472 return 0;
12473 }
12474
12475 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012476 Py_UCS4 ch = wstr[i++];
12477#if SIZEOF_WCHAR_T == 2
12478 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12479 && i < len
12480 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12481 {
12482 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12483 i++;
12484 }
12485#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012486 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12487 return 0;
12488 }
12489
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012490 while (i < len) {
12491 ch = wstr[i++];
12492#if SIZEOF_WCHAR_T == 2
12493 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12494 && i < len
12495 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12496 {
12497 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12498 i++;
12499 }
12500#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012501 if (!_PyUnicode_IsXidContinue(ch)) {
12502 return 0;
12503 }
12504 }
12505 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012506_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012507 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012508}
12509
INADA Naoki3ae20562017-01-16 20:41:20 +090012510/*[clinic input]
12511str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012512
INADA Naoki3ae20562017-01-16 20:41:20 +090012513Return True if the string is a valid Python identifier, False otherwise.
12514
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012515Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012516such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012517[clinic start generated code]*/
12518
12519static PyObject *
12520unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012521/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012522{
12523 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12524}
12525
INADA Naoki3ae20562017-01-16 20:41:20 +090012526/*[clinic input]
12527str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012528
INADA Naoki3ae20562017-01-16 20:41:20 +090012529Return True if the string is printable, False otherwise.
12530
12531A string is printable if all of its characters are considered printable in
12532repr() or if it is empty.
12533[clinic start generated code]*/
12534
12535static PyObject *
12536unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012537/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012538{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 Py_ssize_t i, length;
12540 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012541 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542
12543 if (PyUnicode_READY(self) == -1)
12544 return NULL;
12545 length = PyUnicode_GET_LENGTH(self);
12546 kind = PyUnicode_KIND(self);
12547 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012548
12549 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 if (length == 1)
12551 return PyBool_FromLong(
12552 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 for (i = 0; i < length; i++) {
12555 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012556 Py_RETURN_FALSE;
12557 }
12558 }
12559 Py_RETURN_TRUE;
12560}
12561
INADA Naoki3ae20562017-01-16 20:41:20 +090012562/*[clinic input]
12563str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564
INADA Naoki3ae20562017-01-16 20:41:20 +090012565 iterable: object
12566 /
12567
12568Concatenate any number of strings.
12569
Martin Panter91a88662017-01-24 00:30:06 +000012570The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012571The result is returned as a new string.
12572
12573Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12574[clinic start generated code]*/
12575
12576static PyObject *
12577unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012578/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579{
INADA Naoki3ae20562017-01-16 20:41:20 +090012580 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581}
12582
Martin v. Löwis18e16552006-02-15 17:27:45 +000012583static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012584unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 if (PyUnicode_READY(self) == -1)
12587 return -1;
12588 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589}
12590
INADA Naoki3ae20562017-01-16 20:41:20 +090012591/*[clinic input]
12592str.ljust as unicode_ljust
12593
12594 width: Py_ssize_t
12595 fillchar: Py_UCS4 = ' '
12596 /
12597
12598Return a left-justified string of length width.
12599
12600Padding is done using the specified fill character (default is a space).
12601[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602
12603static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012604unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12605/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012606{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012607 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012608 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609
Victor Stinnerc4b49542011-12-11 22:44:26 +010012610 if (PyUnicode_GET_LENGTH(self) >= width)
12611 return unicode_result_unchanged(self);
12612
12613 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614}
12615
INADA Naoki3ae20562017-01-16 20:41:20 +090012616/*[clinic input]
12617str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618
INADA Naoki3ae20562017-01-16 20:41:20 +090012619Return a copy of the string converted to lowercase.
12620[clinic start generated code]*/
12621
12622static PyObject *
12623unicode_lower_impl(PyObject *self)
12624/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012626 if (PyUnicode_READY(self) == -1)
12627 return NULL;
12628 if (PyUnicode_IS_ASCII(self))
12629 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012630 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631}
12632
/* Strip modes understood by the strip helpers. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and the
   pointer slots are const: the table is only ever read. */
static const char *const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip mode `i`. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012641
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012642/* externally visible for str.strip(unicode) */
12643PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012644_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012645{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012646 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 int kind;
12648 Py_ssize_t i, j, len;
12649 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012650 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12653 return NULL;
12654
12655 kind = PyUnicode_KIND(self);
12656 data = PyUnicode_DATA(self);
12657 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012658 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12660 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012661 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012662
Benjamin Peterson14339b62009-01-31 16:36:08 +000012663 i = 0;
12664 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012665 while (i < len) {
12666 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12667 if (!BLOOM(sepmask, ch))
12668 break;
12669 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12670 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012671 i++;
12672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012673 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012674
Benjamin Peterson14339b62009-01-31 16:36:08 +000012675 j = len;
12676 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012677 j--;
12678 while (j >= i) {
12679 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12680 if (!BLOOM(sepmask, ch))
12681 break;
12682 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12683 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012684 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012685 }
12686
Benjamin Peterson29060642009-01-31 22:14:21 +000012687 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012688 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012689
Victor Stinner7931d9a2011-11-04 00:22:48 +010012690 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691}
12692
12693PyObject*
12694PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12695{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012696 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012698 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699
Victor Stinnerde636f32011-10-01 03:55:54 +020012700 if (PyUnicode_READY(self) == -1)
12701 return NULL;
12702
Victor Stinner684d5fd2012-05-03 02:32:34 +020012703 length = PyUnicode_GET_LENGTH(self);
12704 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012705
Victor Stinner684d5fd2012-05-03 02:32:34 +020012706 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012707 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708
Victor Stinnerde636f32011-10-01 03:55:54 +020012709 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012710 PyErr_SetString(PyExc_IndexError, "string index out of range");
12711 return NULL;
12712 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012713 if (start >= length || end < start)
12714 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012715
Victor Stinner684d5fd2012-05-03 02:32:34 +020012716 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012717 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012718 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012719 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012720 }
12721 else {
12722 kind = PyUnicode_KIND(self);
12723 data = PyUnicode_1BYTE_DATA(self);
12724 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012725 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012726 length);
12727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729
12730static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012731do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 Py_ssize_t len, i, j;
12734
12735 if (PyUnicode_READY(self) == -1)
12736 return NULL;
12737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012739
Victor Stinnercc7af722013-04-09 22:39:24 +020012740 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012741 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012742
12743 i = 0;
12744 if (striptype != RIGHTSTRIP) {
12745 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012746 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012747 if (!_Py_ascii_whitespace[ch])
12748 break;
12749 i++;
12750 }
12751 }
12752
12753 j = len;
12754 if (striptype != LEFTSTRIP) {
12755 j--;
12756 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012757 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012758 if (!_Py_ascii_whitespace[ch])
12759 break;
12760 j--;
12761 }
12762 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012763 }
12764 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012765 else {
12766 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012767 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012768
Victor Stinnercc7af722013-04-09 22:39:24 +020012769 i = 0;
12770 if (striptype != RIGHTSTRIP) {
12771 while (i < len) {
12772 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12773 if (!Py_UNICODE_ISSPACE(ch))
12774 break;
12775 i++;
12776 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012777 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012778
12779 j = len;
12780 if (striptype != LEFTSTRIP) {
12781 j--;
12782 while (j >= i) {
12783 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12784 if (!Py_UNICODE_ISSPACE(ch))
12785 break;
12786 j--;
12787 }
12788 j++;
12789 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012790 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012791
Victor Stinner7931d9a2011-11-04 00:22:48 +010012792 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793}
12794
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012795
12796static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012797do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012798{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012799 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012800 if (PyUnicode_Check(sep))
12801 return _PyUnicode_XStrip(self, striptype, sep);
12802 else {
12803 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012804 "%s arg must be None or str",
12805 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012806 return NULL;
12807 }
12808 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012809
Benjamin Peterson14339b62009-01-31 16:36:08 +000012810 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012811}
12812
12813
INADA Naoki3ae20562017-01-16 20:41:20 +090012814/*[clinic input]
12815str.strip as unicode_strip
12816
12817 chars: object = None
12818 /
12819
Zachary Ware09895c22019-10-09 16:09:00 -050012820Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012821
12822If chars is given and not None, remove characters in chars instead.
12823[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012824
12825static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012826unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012827/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012828{
INADA Naoki3ae20562017-01-16 20:41:20 +090012829 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012830}
12831
12832
INADA Naoki3ae20562017-01-16 20:41:20 +090012833/*[clinic input]
12834str.lstrip as unicode_lstrip
12835
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012836 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012837 /
12838
12839Return a copy of the string with leading whitespace removed.
12840
12841If chars is given and not None, remove characters in chars instead.
12842[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012843
12844static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012845unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012846/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012847{
INADA Naoki3ae20562017-01-16 20:41:20 +090012848 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012849}
12850
12851
INADA Naoki3ae20562017-01-16 20:41:20 +090012852/*[clinic input]
12853str.rstrip as unicode_rstrip
12854
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012855 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012856 /
12857
12858Return a copy of the string with trailing whitespace removed.
12859
12860If chars is given and not None, remove characters in chars instead.
12861[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012862
12863static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012864unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012865/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012866{
INADA Naoki3ae20562017-01-16 20:41:20 +090012867 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012868}
12869
12870
Guido van Rossumd57fd912000-03-10 22:53:23 +000012871static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012872unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012874 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876
Serhiy Storchaka05997252013-01-26 12:14:02 +020012877 if (len < 1)
12878 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879
Victor Stinnerc4b49542011-12-11 22:44:26 +010012880 /* no repeat, return original string */
12881 if (len == 1)
12882 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012883
Benjamin Petersonbac79492012-01-14 13:34:47 -050012884 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012885 return NULL;
12886
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012887 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012888 PyErr_SetString(PyExc_OverflowError,
12889 "repeated string is too long");
12890 return NULL;
12891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012893
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012894 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895 if (!u)
12896 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012897 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012900 int kind = PyUnicode_KIND(str);
12901 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012902 if (kind == PyUnicode_1BYTE_KIND) {
12903 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012904 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012905 }
12906 else if (kind == PyUnicode_2BYTE_KIND) {
12907 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012908 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012909 ucs2[n] = fill_char;
12910 } else {
12911 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12912 assert(kind == PyUnicode_4BYTE_KIND);
12913 for (n = 0; n < len; ++n)
12914 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012915 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916 }
12917 else {
12918 /* number of characters copied this far */
12919 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012920 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012921 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012922 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012924 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012926 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012927 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012928 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012929 }
12930
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012931 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012932 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933}
12934
Alexander Belopolsky40018472011-02-26 01:02:56 +000012935PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012936PyUnicode_Replace(PyObject *str,
12937 PyObject *substr,
12938 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012939 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012940{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012941 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12942 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012943 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012944 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012945}
12946
INADA Naoki3ae20562017-01-16 20:41:20 +090012947/*[clinic input]
12948str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949
INADA Naoki3ae20562017-01-16 20:41:20 +090012950 old: unicode
12951 new: unicode
12952 count: Py_ssize_t = -1
12953 Maximum number of occurrences to replace.
12954 -1 (the default value) means replace all occurrences.
12955 /
12956
12957Return a copy with all occurrences of substring old replaced by new.
12958
12959If the optional argument count is given, only the first count occurrences are
12960replaced.
12961[clinic start generated code]*/
12962
12963static PyObject *
12964unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12965 Py_ssize_t count)
12966/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012968 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012969 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012970 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971}
12972
sweeneydea81849b2020-04-22 17:05:48 -040012973/*[clinic input]
12974str.removeprefix as unicode_removeprefix
12975
12976 prefix: unicode
12977 /
12978
12979Return a str with the given prefix string removed if present.
12980
12981If the string starts with the prefix string, return string[len(prefix):].
12982Otherwise, return a copy of the original string.
12983[clinic start generated code]*/
12984
12985static PyObject *
12986unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12987/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12988{
12989 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12990 if (match == -1) {
12991 return NULL;
12992 }
12993 if (match) {
12994 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12995 PyUnicode_GET_LENGTH(self));
12996 }
12997 return unicode_result_unchanged(self);
12998}
12999
13000/*[clinic input]
13001str.removesuffix as unicode_removesuffix
13002
13003 suffix: unicode
13004 /
13005
13006Return a str with the given suffix string removed if present.
13007
13008If the string ends with the suffix string and that suffix is not empty,
13009return string[:-len(suffix)]. Otherwise, return a copy of the original
13010string.
13011[clinic start generated code]*/
13012
13013static PyObject *
13014unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13015/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13016{
13017 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13018 if (match == -1) {
13019 return NULL;
13020 }
13021 if (match) {
13022 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13023 - PyUnicode_GET_LENGTH(suffix));
13024 }
13025 return unicode_result_unchanged(self);
13026}
13027
Alexander Belopolsky40018472011-02-26 01:02:56 +000013028static PyObject *
13029unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013030{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013031 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013032 Py_ssize_t isize;
13033 Py_ssize_t osize, squote, dquote, i, o;
13034 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013035 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013036 const void *idata;
13037 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013040 return NULL;
13041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013042 isize = PyUnicode_GET_LENGTH(unicode);
13043 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013045 /* Compute length of output, quote characters, and
13046 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013047 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013048 max = 127;
13049 squote = dquote = 0;
13050 ikind = PyUnicode_KIND(unicode);
13051 for (i = 0; i < isize; i++) {
13052 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013053 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013055 case '\'': squote++; break;
13056 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013057 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013058 incr = 2;
13059 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013060 default:
13061 /* Fast-path ASCII */
13062 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013063 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013064 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013065 ;
13066 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013067 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013068 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013069 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013070 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013071 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013072 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013073 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013075 if (osize > PY_SSIZE_T_MAX - incr) {
13076 PyErr_SetString(PyExc_OverflowError,
13077 "string is too long to generate repr");
13078 return NULL;
13079 }
13080 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 }
13082
13083 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013084 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013086 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013087 if (dquote)
13088 /* Both squote and dquote present. Use squote,
13089 and escape them */
13090 osize += squote;
13091 else
13092 quote = '"';
13093 }
Victor Stinner55c08782013-04-14 18:45:39 +020013094 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013095
13096 repr = PyUnicode_New(osize, max);
13097 if (repr == NULL)
13098 return NULL;
13099 okind = PyUnicode_KIND(repr);
13100 odata = PyUnicode_DATA(repr);
13101
13102 PyUnicode_WRITE(okind, odata, 0, quote);
13103 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013104 if (unchanged) {
13105 _PyUnicode_FastCopyCharacters(repr, 1,
13106 unicode, 0,
13107 isize);
13108 }
13109 else {
13110 for (i = 0, o = 1; i < isize; i++) {
13111 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112
Victor Stinner55c08782013-04-14 18:45:39 +020013113 /* Escape quotes and backslashes */
13114 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013115 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013116 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013117 continue;
13118 }
13119
13120 /* Map special whitespace to '\t', \n', '\r' */
13121 if (ch == '\t') {
13122 PyUnicode_WRITE(okind, odata, o++, '\\');
13123 PyUnicode_WRITE(okind, odata, o++, 't');
13124 }
13125 else if (ch == '\n') {
13126 PyUnicode_WRITE(okind, odata, o++, '\\');
13127 PyUnicode_WRITE(okind, odata, o++, 'n');
13128 }
13129 else if (ch == '\r') {
13130 PyUnicode_WRITE(okind, odata, o++, '\\');
13131 PyUnicode_WRITE(okind, odata, o++, 'r');
13132 }
13133
13134 /* Map non-printable US ASCII to '\xhh' */
13135 else if (ch < ' ' || ch == 0x7F) {
13136 PyUnicode_WRITE(okind, odata, o++, '\\');
13137 PyUnicode_WRITE(okind, odata, o++, 'x');
13138 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13139 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13140 }
13141
13142 /* Copy ASCII characters as-is */
13143 else if (ch < 0x7F) {
13144 PyUnicode_WRITE(okind, odata, o++, ch);
13145 }
13146
13147 /* Non-ASCII characters */
13148 else {
13149 /* Map Unicode whitespace and control characters
13150 (categories Z* and C* except ASCII space)
13151 */
13152 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13153 PyUnicode_WRITE(okind, odata, o++, '\\');
13154 /* Map 8-bit characters to '\xhh' */
13155 if (ch <= 0xff) {
13156 PyUnicode_WRITE(okind, odata, o++, 'x');
13157 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13158 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13159 }
13160 /* Map 16-bit characters to '\uxxxx' */
13161 else if (ch <= 0xffff) {
13162 PyUnicode_WRITE(okind, odata, o++, 'u');
13163 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13164 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13165 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13166 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13167 }
13168 /* Map 21-bit characters to '\U00xxxxxx' */
13169 else {
13170 PyUnicode_WRITE(okind, odata, o++, 'U');
13171 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13172 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13173 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13174 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13175 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13176 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13177 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13178 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13179 }
13180 }
13181 /* Copy characters as-is */
13182 else {
13183 PyUnicode_WRITE(okind, odata, o++, ch);
13184 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013185 }
13186 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013188 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013189 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013190 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191}
13192
/* Docstring text for S.rfind(), declared under the name rfind__doc__
   with PyDoc_STRVAR. */
PyDoc_STRVAR(rfind__doc__,
             "S.rfind(sub[, start[, end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013201
13202static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013205 /* initialize variables to prevent gcc warning */
13206 PyObject *substring = NULL;
13207 Py_ssize_t start = 0;
13208 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013209 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013210
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013211 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013212 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013213
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013214 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013216
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013217 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013219 if (result == -2)
13220 return NULL;
13221
Christian Heimes217cfd12007-12-02 14:31:20 +000013222 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223}
13224
/* Docstring text for S.rindex(), declared under the name rindex__doc__
   with PyDoc_STRVAR. */
PyDoc_STRVAR(rindex__doc__,
             "S.rindex(sub[, start[, end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013233
13234static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013235unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013236{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013237 /* initialize variables to prevent gcc warning */
13238 PyObject *substring = NULL;
13239 Py_ssize_t start = 0;
13240 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013241 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013242
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013243 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013244 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013246 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013247 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013248
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013249 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013251 if (result == -2)
13252 return NULL;
13253
Guido van Rossumd57fd912000-03-10 22:53:23 +000013254 if (result < 0) {
13255 PyErr_SetString(PyExc_ValueError, "substring not found");
13256 return NULL;
13257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013258
Christian Heimes217cfd12007-12-02 14:31:20 +000013259 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260}
13261
INADA Naoki3ae20562017-01-16 20:41:20 +090013262/*[clinic input]
13263str.rjust as unicode_rjust
13264
13265 width: Py_ssize_t
13266 fillchar: Py_UCS4 = ' '
13267 /
13268
13269Return a right-justified string of length width.
13270
13271Padding is done using the specified fill character (default is a space).
13272[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013273
13274static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013275unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13276/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013278 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013279 return NULL;
13280
Victor Stinnerc4b49542011-12-11 22:44:26 +010013281 if (PyUnicode_GET_LENGTH(self) >= width)
13282 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013283
Victor Stinnerc4b49542011-12-11 22:44:26 +010013284 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013285}
13286
Alexander Belopolsky40018472011-02-26 01:02:56 +000013287PyObject *
13288PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013290 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013291 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013293 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013294}
13295
INADA Naoki3ae20562017-01-16 20:41:20 +090013296/*[clinic input]
13297str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298
INADA Naoki3ae20562017-01-16 20:41:20 +090013299 sep: object = None
13300 The delimiter according which to split the string.
13301 None (the default value) means split according to any whitespace,
13302 and discard empty strings from the result.
13303 maxsplit: Py_ssize_t = -1
13304 Maximum number of splits to do.
13305 -1 (the default value) means no limit.
13306
13307Return a list of the words in the string, using sep as the delimiter string.
13308[clinic start generated code]*/
13309
13310static PyObject *
13311unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13312/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313{
INADA Naoki3ae20562017-01-16 20:41:20 +090013314 if (sep == Py_None)
13315 return split(self, NULL, maxsplit);
13316 if (PyUnicode_Check(sep))
13317 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013318
Victor Stinner998b8062018-09-12 00:23:25 +020013319 PyErr_Format(PyExc_TypeError,
13320 "must be str or None, not %.100s",
13321 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013323}
13324
Thomas Wouters477c8d52006-05-27 19:21:47 +000013325PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013326PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013327{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013328 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013329 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013330 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013332
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013333 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013334 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013335
Victor Stinner14f8f022011-10-05 20:58:25 +020013336 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013337 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013338 len1 = PyUnicode_GET_LENGTH(str_obj);
13339 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013340 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013341 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013342 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013343 }
13344 buf1 = PyUnicode_DATA(str_obj);
13345 buf2 = PyUnicode_DATA(sep_obj);
13346 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013347 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013348 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013349 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013351
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013352 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013353 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013354 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13355 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13356 else
13357 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013358 break;
13359 case PyUnicode_2BYTE_KIND:
13360 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13361 break;
13362 case PyUnicode_4BYTE_KIND:
13363 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13364 break;
13365 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013366 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013367 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013368
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013369 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013370 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013371 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013372
13373 return out;
13374}
13375
13376
13377PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013378PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013379{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013380 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013381 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013382 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013383 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013384
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013385 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013386 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013387
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013388 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013389 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013390 len1 = PyUnicode_GET_LENGTH(str_obj);
13391 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013392 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013393 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013394 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013395 }
13396 buf1 = PyUnicode_DATA(str_obj);
13397 buf2 = PyUnicode_DATA(sep_obj);
13398 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013399 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013400 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013401 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013402 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013403
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013404 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013405 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013406 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13407 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13408 else
13409 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013410 break;
13411 case PyUnicode_2BYTE_KIND:
13412 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13413 break;
13414 case PyUnicode_4BYTE_KIND:
13415 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13416 break;
13417 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013418 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013419 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013420
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013421 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013422 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013423 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013424
13425 return out;
13426}
13427
INADA Naoki3ae20562017-01-16 20:41:20 +090013428/*[clinic input]
13429str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013430
INADA Naoki3ae20562017-01-16 20:41:20 +090013431 sep: object
13432 /
13433
13434Partition the string into three parts using the given separator.
13435
13436This will search for the separator in the string. If the separator is found,
13437returns a 3-tuple containing the part before the separator, the separator
13438itself, and the part after it.
13439
13440If the separator is not found, returns a 3-tuple containing the original string
13441and two empty strings.
13442[clinic start generated code]*/
13443
13444static PyObject *
13445unicode_partition(PyObject *self, PyObject *sep)
13446/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013447{
INADA Naoki3ae20562017-01-16 20:41:20 +090013448 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013449}
13450
INADA Naoki3ae20562017-01-16 20:41:20 +090013451/*[clinic input]
13452str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013453
INADA Naoki3ae20562017-01-16 20:41:20 +090013454Partition the string into three parts using the given separator.
13455
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013456This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013457the separator is found, returns a 3-tuple containing the part before the
13458separator, the separator itself, and the part after it.
13459
13460If the separator is not found, returns a 3-tuple containing two empty strings
13461and the original string.
13462[clinic start generated code]*/
13463
13464static PyObject *
13465unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013466/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013467{
INADA Naoki3ae20562017-01-16 20:41:20 +090013468 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013469}
13470
Alexander Belopolsky40018472011-02-26 01:02:56 +000013471PyObject *
13472PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013473{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013474 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013475 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013476
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013477 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013478}
13479
INADA Naoki3ae20562017-01-16 20:41:20 +090013480/*[clinic input]
13481str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013482
INADA Naoki3ae20562017-01-16 20:41:20 +090013483Return a list of the words in the string, using sep as the delimiter string.
13484
13485Splits are done starting at the end of the string and working to the front.
13486[clinic start generated code]*/
13487
13488static PyObject *
13489unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13490/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013491{
INADA Naoki3ae20562017-01-16 20:41:20 +090013492 if (sep == Py_None)
13493 return rsplit(self, NULL, maxsplit);
13494 if (PyUnicode_Check(sep))
13495 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013496
Victor Stinner998b8062018-09-12 00:23:25 +020013497 PyErr_Format(PyExc_TypeError,
13498 "must be str or None, not %.100s",
13499 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013500 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013501}
13502
INADA Naoki3ae20562017-01-16 20:41:20 +090013503/*[clinic input]
13504str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013505
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013506 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013507
13508Return a list of the lines in the string, breaking at line boundaries.
13509
13510Line breaks are not included in the resulting list unless keepends is given and
13511true.
13512[clinic start generated code]*/
13513
13514static PyObject *
13515unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013516/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013517{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013518 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013519}
13520
13521static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013522PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013523{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013524 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013525}
13526
INADA Naoki3ae20562017-01-16 20:41:20 +090013527/*[clinic input]
13528str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013529
INADA Naoki3ae20562017-01-16 20:41:20 +090013530Convert uppercase characters to lowercase and lowercase characters to uppercase.
13531[clinic start generated code]*/
13532
13533static PyObject *
13534unicode_swapcase_impl(PyObject *self)
13535/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013536{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013537 if (PyUnicode_READY(self) == -1)
13538 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013539 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013540}
13541
Larry Hastings61272b72014-01-07 12:41:53 -080013542/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013543
Larry Hastings31826802013-10-19 00:09:25 -070013544@staticmethod
13545str.maketrans as unicode_maketrans
13546
13547 x: object
13548
13549 y: unicode=NULL
13550
13551 z: unicode=NULL
13552
13553 /
13554
13555Return a translation table usable for str.translate().
13556
13557If there is only one argument, it must be a dictionary mapping Unicode
13558ordinals (integers) or characters to Unicode ordinals, strings or None.
13559Character keys will be then converted to ordinals.
13560If there are two arguments, they must be strings of equal length, and
13561in the resulting dictionary, each character in x will be mapped to the
13562character at the same position in y. If there is a third argument, it
13563must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013564[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013565
Larry Hastings31826802013-10-19 00:09:25 -070013566static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013567unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013568/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013569{
Georg Brandlceee0772007-11-27 23:48:05 +000013570 PyObject *new = NULL, *key, *value;
13571 Py_ssize_t i = 0;
13572 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013573
Georg Brandlceee0772007-11-27 23:48:05 +000013574 new = PyDict_New();
13575 if (!new)
13576 return NULL;
13577 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013578 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013579 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013580
Georg Brandlceee0772007-11-27 23:48:05 +000013581 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013582 if (!PyUnicode_Check(x)) {
13583 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13584 "be a string if there is a second argument");
13585 goto err;
13586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013587 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013588 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13589 "arguments must have equal length");
13590 goto err;
13591 }
13592 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013593 x_kind = PyUnicode_KIND(x);
13594 y_kind = PyUnicode_KIND(y);
13595 x_data = PyUnicode_DATA(x);
13596 y_data = PyUnicode_DATA(y);
13597 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13598 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013599 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013600 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013601 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013602 if (!value) {
13603 Py_DECREF(key);
13604 goto err;
13605 }
Georg Brandlceee0772007-11-27 23:48:05 +000013606 res = PyDict_SetItem(new, key, value);
13607 Py_DECREF(key);
13608 Py_DECREF(value);
13609 if (res < 0)
13610 goto err;
13611 }
13612 /* create entries for deleting chars in z */
13613 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013614 z_kind = PyUnicode_KIND(z);
13615 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013616 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013618 if (!key)
13619 goto err;
13620 res = PyDict_SetItem(new, key, Py_None);
13621 Py_DECREF(key);
13622 if (res < 0)
13623 goto err;
13624 }
13625 }
13626 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013627 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013628 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013629
Georg Brandlceee0772007-11-27 23:48:05 +000013630 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013631 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013632 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13633 "to maketrans it must be a dict");
13634 goto err;
13635 }
13636 /* copy entries into the new dict, converting string keys to int keys */
13637 while (PyDict_Next(x, &i, &key, &value)) {
13638 if (PyUnicode_Check(key)) {
13639 /* convert string keys to integer keys */
13640 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013641 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013642 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13643 "table must be of length 1");
13644 goto err;
13645 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013646 kind = PyUnicode_KIND(key);
13647 data = PyUnicode_DATA(key);
13648 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013649 if (!newkey)
13650 goto err;
13651 res = PyDict_SetItem(new, newkey, value);
13652 Py_DECREF(newkey);
13653 if (res < 0)
13654 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013655 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013656 /* just keep integer keys */
13657 if (PyDict_SetItem(new, key, value) < 0)
13658 goto err;
13659 } else {
13660 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13661 "be strings or integers");
13662 goto err;
13663 }
13664 }
13665 }
13666 return new;
13667 err:
13668 Py_DECREF(new);
13669 return NULL;
13670}
13671
INADA Naoki3ae20562017-01-16 20:41:20 +090013672/*[clinic input]
13673str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013674
INADA Naoki3ae20562017-01-16 20:41:20 +090013675 table: object
13676 Translation table, which must be a mapping of Unicode ordinals to
13677 Unicode ordinals, strings, or None.
13678 /
13679
13680Replace each character in the string using the given translation table.
13681
13682The table must implement lookup/indexing via __getitem__, for instance a
13683dictionary or list. If this operation raises LookupError, the character is
13684left untouched. Characters mapped to None are deleted.
13685[clinic start generated code]*/
13686
13687static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013688unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013689/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013690{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013691 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013692}
13693
INADA Naoki3ae20562017-01-16 20:41:20 +090013694/*[clinic input]
13695str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013696
INADA Naoki3ae20562017-01-16 20:41:20 +090013697Return a copy of the string converted to uppercase.
13698[clinic start generated code]*/
13699
13700static PyObject *
13701unicode_upper_impl(PyObject *self)
13702/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013703{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013704 if (PyUnicode_READY(self) == -1)
13705 return NULL;
13706 if (PyUnicode_IS_ASCII(self))
13707 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013708 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013709}
13710
INADA Naoki3ae20562017-01-16 20:41:20 +090013711/*[clinic input]
13712str.zfill as unicode_zfill
13713
13714 width: Py_ssize_t
13715 /
13716
13717Pad a numeric string with zeros on the left, to fill a field of the given width.
13718
13719The string is never truncated.
13720[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013721
13722static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013723unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013724/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013725{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013726 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013727 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013728 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013729 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013730 Py_UCS4 chr;
13731
Benjamin Petersonbac79492012-01-14 13:34:47 -050013732 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013734
Victor Stinnerc4b49542011-12-11 22:44:26 +010013735 if (PyUnicode_GET_LENGTH(self) >= width)
13736 return unicode_result_unchanged(self);
13737
13738 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013739
13740 u = pad(self, fill, 0, '0');
13741
Walter Dörwald068325e2002-04-15 13:36:47 +000013742 if (u == NULL)
13743 return NULL;
13744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013745 kind = PyUnicode_KIND(u);
13746 data = PyUnicode_DATA(u);
13747 chr = PyUnicode_READ(kind, data, fill);
13748
13749 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013750 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013751 PyUnicode_WRITE(kind, data, 0, chr);
13752 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013753 }
13754
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013755 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013756 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013757}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013758
#if 0
/* Compiled out by the enclosing "#if 0".  When enabled, this simply
   forwards to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013767PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013768 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013769\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013770Return True if S starts with the specified prefix, False otherwise.\n\
13771With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013772With optional end, stop comparing S at that position.\n\
13773prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013774
13775static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013776unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013777 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013778{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013779 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013780 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013781 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013782 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013783 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013784
Jesus Ceaac451502011-04-20 17:09:23 +020013785 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013786 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013787 if (PyTuple_Check(subobj)) {
13788 Py_ssize_t i;
13789 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013790 substring = PyTuple_GET_ITEM(subobj, i);
13791 if (!PyUnicode_Check(substring)) {
13792 PyErr_Format(PyExc_TypeError,
13793 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013794 "not %.100s",
13795 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013796 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013797 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013798 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013799 if (result == -1)
13800 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013801 if (result) {
13802 Py_RETURN_TRUE;
13803 }
13804 }
13805 /* nothing matched */
13806 Py_RETURN_FALSE;
13807 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013808 if (!PyUnicode_Check(subobj)) {
13809 PyErr_Format(PyExc_TypeError,
13810 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013811 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013812 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013813 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013814 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013815 if (result == -1)
13816 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013817 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013818}
13819
13820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013821PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013822 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013823\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013824Return True if S ends with the specified suffix, False otherwise.\n\
13825With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013826With optional end, stop comparing S at that position.\n\
13827suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013828
13829static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013830unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013831 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013832{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013833 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013834 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013835 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013836 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013837 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013838
Jesus Ceaac451502011-04-20 17:09:23 +020013839 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013840 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013841 if (PyTuple_Check(subobj)) {
13842 Py_ssize_t i;
13843 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013844 substring = PyTuple_GET_ITEM(subobj, i);
13845 if (!PyUnicode_Check(substring)) {
13846 PyErr_Format(PyExc_TypeError,
13847 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013848 "not %.100s",
13849 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013851 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013852 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013853 if (result == -1)
13854 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013855 if (result) {
13856 Py_RETURN_TRUE;
13857 }
13858 }
13859 Py_RETURN_FALSE;
13860 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013861 if (!PyUnicode_Check(subobj)) {
13862 PyErr_Format(PyExc_TypeError,
13863 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013864 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013865 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013866 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013867 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013868 if (result == -1)
13869 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013870 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013871}
13872
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013873static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013874_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013875{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013876 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13877 writer->data = PyUnicode_DATA(writer->buffer);
13878
13879 if (!writer->readonly) {
13880 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013881 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013882 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013883 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013884 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13885 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13886 writer->kind = PyUnicode_WCHAR_KIND;
13887 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13888
Victor Stinner8f674cc2013-04-17 23:02:17 +020013889 /* Copy-on-write mode: set buffer size to 0 so
13890 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13891 * next write. */
13892 writer->size = 0;
13893 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013894}
13895
Victor Stinnerd3f08822012-05-29 12:57:52 +020013896void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013897_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013898{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013899 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013900
13901 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013902 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013903
13904 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13905 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13906 writer->kind = PyUnicode_WCHAR_KIND;
13907 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013908}
13909
Inada Naoki770847a2019-06-24 12:30:24 +090013910// Initialize _PyUnicodeWriter with initial buffer
13911static inline void
13912_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13913{
13914 memset(writer, 0, sizeof(*writer));
13915 writer->buffer = buffer;
13916 _PyUnicodeWriter_Update(writer);
13917 writer->min_length = writer->size;
13918}
13919
Victor Stinnerd3f08822012-05-29 12:57:52 +020013920int
13921_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13922 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013923{
13924 Py_ssize_t newlen;
13925 PyObject *newbuffer;
13926
Victor Stinner2740e462016-09-06 16:58:36 -070013927 assert(maxchar <= MAX_UNICODE);
13928
Victor Stinnerca9381e2015-09-22 00:58:32 +020013929 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013930 assert((maxchar > writer->maxchar && length >= 0)
13931 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013932
Victor Stinner202fdca2012-05-07 12:47:02 +020013933 if (length > PY_SSIZE_T_MAX - writer->pos) {
13934 PyErr_NoMemory();
13935 return -1;
13936 }
13937 newlen = writer->pos + length;
13938
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013939 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013940
Victor Stinnerd3f08822012-05-29 12:57:52 +020013941 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013942 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013943 if (writer->overallocate
13944 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13945 /* overallocate to limit the number of realloc() */
13946 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013947 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013948 if (newlen < writer->min_length)
13949 newlen = writer->min_length;
13950
Victor Stinnerd3f08822012-05-29 12:57:52 +020013951 writer->buffer = PyUnicode_New(newlen, maxchar);
13952 if (writer->buffer == NULL)
13953 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013954 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013955 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013956 if (writer->overallocate
13957 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13958 /* overallocate to limit the number of realloc() */
13959 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013960 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013961 if (newlen < writer->min_length)
13962 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013963
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013964 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013965 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013966 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013967 newbuffer = PyUnicode_New(newlen, maxchar);
13968 if (newbuffer == NULL)
13969 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013970 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13971 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013972 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013973 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013974 }
13975 else {
13976 newbuffer = resize_compact(writer->buffer, newlen);
13977 if (newbuffer == NULL)
13978 return -1;
13979 }
13980 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013981 }
13982 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013983 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013984 newbuffer = PyUnicode_New(writer->size, maxchar);
13985 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013986 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013987 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13988 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013989 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013990 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013991 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013992 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013993
13994#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013995}
13996
Victor Stinnerca9381e2015-09-22 00:58:32 +020013997int
13998_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13999 enum PyUnicode_Kind kind)
14000{
14001 Py_UCS4 maxchar;
14002
14003 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14004 assert(writer->kind < kind);
14005
14006 switch (kind)
14007 {
14008 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14009 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14010 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
14011 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014012 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014013 }
14014
14015 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14016}
14017
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014018static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014019_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014020{
Victor Stinner2740e462016-09-06 16:58:36 -070014021 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014022 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14023 return -1;
14024 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14025 writer->pos++;
14026 return 0;
14027}
14028
14029int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014030_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14031{
14032 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14033}
14034
14035int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014036_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14037{
14038 Py_UCS4 maxchar;
14039 Py_ssize_t len;
14040
14041 if (PyUnicode_READY(str) == -1)
14042 return -1;
14043 len = PyUnicode_GET_LENGTH(str);
14044 if (len == 0)
14045 return 0;
14046 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14047 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014048 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014049 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014050 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014051 Py_INCREF(str);
14052 writer->buffer = str;
14053 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014054 writer->pos += len;
14055 return 0;
14056 }
14057 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14058 return -1;
14059 }
14060 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14061 str, 0, len);
14062 writer->pos += len;
14063 return 0;
14064}
14065
Victor Stinnere215d962012-10-06 23:03:36 +020014066int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014067_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14068 Py_ssize_t start, Py_ssize_t end)
14069{
14070 Py_UCS4 maxchar;
14071 Py_ssize_t len;
14072
14073 if (PyUnicode_READY(str) == -1)
14074 return -1;
14075
14076 assert(0 <= start);
14077 assert(end <= PyUnicode_GET_LENGTH(str));
14078 assert(start <= end);
14079
14080 if (end == 0)
14081 return 0;
14082
14083 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14084 return _PyUnicodeWriter_WriteStr(writer, str);
14085
14086 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14087 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14088 else
14089 maxchar = writer->maxchar;
14090 len = end - start;
14091
14092 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14093 return -1;
14094
14095 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14096 str, start, len);
14097 writer->pos += len;
14098 return 0;
14099}
14100
14101int
Victor Stinner4a587072013-11-19 12:54:53 +010014102_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14103 const char *ascii, Py_ssize_t len)
14104{
14105 if (len == -1)
14106 len = strlen(ascii);
14107
Andy Lestere6be9b52020-02-11 20:28:35 -060014108 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014109
14110 if (writer->buffer == NULL && !writer->overallocate) {
14111 PyObject *str;
14112
14113 str = _PyUnicode_FromASCII(ascii, len);
14114 if (str == NULL)
14115 return -1;
14116
14117 writer->readonly = 1;
14118 writer->buffer = str;
14119 _PyUnicodeWriter_Update(writer);
14120 writer->pos += len;
14121 return 0;
14122 }
14123
14124 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14125 return -1;
14126
14127 switch (writer->kind)
14128 {
14129 case PyUnicode_1BYTE_KIND:
14130 {
14131 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14132 Py_UCS1 *data = writer->data;
14133
Christian Heimesf051e432016-09-13 20:22:02 +020014134 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014135 break;
14136 }
14137 case PyUnicode_2BYTE_KIND:
14138 {
14139 _PyUnicode_CONVERT_BYTES(
14140 Py_UCS1, Py_UCS2,
14141 ascii, ascii + len,
14142 (Py_UCS2 *)writer->data + writer->pos);
14143 break;
14144 }
14145 case PyUnicode_4BYTE_KIND:
14146 {
14147 _PyUnicode_CONVERT_BYTES(
14148 Py_UCS1, Py_UCS4,
14149 ascii, ascii + len,
14150 (Py_UCS4 *)writer->data + writer->pos);
14151 break;
14152 }
14153 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014154 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014155 }
14156
14157 writer->pos += len;
14158 return 0;
14159}
14160
14161int
14162_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14163 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014164{
14165 Py_UCS4 maxchar;
14166
Andy Lestere6be9b52020-02-11 20:28:35 -060014167 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014168 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14169 return -1;
14170 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14171 writer->pos += len;
14172 return 0;
14173}
14174
Victor Stinnerd3f08822012-05-29 12:57:52 +020014175PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014176_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014177{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014178 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014179
Victor Stinnerd3f08822012-05-29 12:57:52 +020014180 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014181 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014182 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014183 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014184
14185 str = writer->buffer;
14186 writer->buffer = NULL;
14187
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014188 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014189 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14190 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014191 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014192
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014193 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14194 PyObject *str2;
14195 str2 = resize_compact(str, writer->pos);
14196 if (str2 == NULL) {
14197 Py_DECREF(str);
14198 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014199 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014200 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014201 }
14202
Victor Stinner15a0bd32013-07-08 22:29:55 +020014203 assert(_PyUnicode_CheckConsistency(str, 1));
14204 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014205}
14206
Victor Stinnerd3f08822012-05-29 12:57:52 +020014207void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014208_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014209{
14210 Py_CLEAR(writer->buffer);
14211}
14212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014213#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014214
14215PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014216 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014217\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014218Return a formatted version of S, using substitutions from args and kwargs.\n\
14219The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014220
Eric Smith27bbca62010-11-04 17:06:58 +000014221PyDoc_STRVAR(format_map__doc__,
14222 "S.format_map(mapping) -> str\n\
14223\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014224Return a formatted version of S, using substitutions from mapping.\n\
14225The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014226
INADA Naoki3ae20562017-01-16 20:41:20 +090014227/*[clinic input]
14228str.__format__ as unicode___format__
14229
14230 format_spec: unicode
14231 /
14232
14233Return a formatted version of the string as described by format_spec.
14234[clinic start generated code]*/
14235
Eric Smith4a7d76d2008-05-30 18:10:19 +000014236static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014237unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014238/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014239{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014240 _PyUnicodeWriter writer;
14241 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014242
Victor Stinnerd3f08822012-05-29 12:57:52 +020014243 if (PyUnicode_READY(self) == -1)
14244 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014245 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014246 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14247 self, format_spec, 0,
14248 PyUnicode_GET_LENGTH(format_spec));
14249 if (ret == -1) {
14250 _PyUnicodeWriter_Dealloc(&writer);
14251 return NULL;
14252 }
14253 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014254}
14255
INADA Naoki3ae20562017-01-16 20:41:20 +090014256/*[clinic input]
14257str.__sizeof__ as unicode_sizeof
14258
14259Return the size of the string in memory, in bytes.
14260[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014261
14262static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014263unicode_sizeof_impl(PyObject *self)
14264/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014265{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014266 Py_ssize_t size;
14267
14268 /* If it's a compact object, account for base structure +
14269 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014270 if (PyUnicode_IS_COMPACT_ASCII(self))
14271 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14272 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014273 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014274 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014275 else {
14276 /* If it is a two-block object, account for base object, and
14277 for character block if present. */
14278 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014279 if (_PyUnicode_DATA_ANY(self))
14280 size += (PyUnicode_GET_LENGTH(self) + 1) *
14281 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014282 }
14283 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014284 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014285 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14286 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14287 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14288 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014289
14290 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014291}
14292
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014293static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014294unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014295{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014296 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014297 if (!copy)
14298 return NULL;
14299 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014300}
14301
Guido van Rossumd57fd912000-03-10 22:53:23 +000014302static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014303 UNICODE_ENCODE_METHODDEF
14304 UNICODE_REPLACE_METHODDEF
14305 UNICODE_SPLIT_METHODDEF
14306 UNICODE_RSPLIT_METHODDEF
14307 UNICODE_JOIN_METHODDEF
14308 UNICODE_CAPITALIZE_METHODDEF
14309 UNICODE_CASEFOLD_METHODDEF
14310 UNICODE_TITLE_METHODDEF
14311 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014312 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014313 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014314 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014315 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014316 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014317 UNICODE_LJUST_METHODDEF
14318 UNICODE_LOWER_METHODDEF
14319 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014320 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14321 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014322 UNICODE_RJUST_METHODDEF
14323 UNICODE_RSTRIP_METHODDEF
14324 UNICODE_RPARTITION_METHODDEF
14325 UNICODE_SPLITLINES_METHODDEF
14326 UNICODE_STRIP_METHODDEF
14327 UNICODE_SWAPCASE_METHODDEF
14328 UNICODE_TRANSLATE_METHODDEF
14329 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014330 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14331 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014332 UNICODE_REMOVEPREFIX_METHODDEF
14333 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014334 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014335 UNICODE_ISLOWER_METHODDEF
14336 UNICODE_ISUPPER_METHODDEF
14337 UNICODE_ISTITLE_METHODDEF
14338 UNICODE_ISSPACE_METHODDEF
14339 UNICODE_ISDECIMAL_METHODDEF
14340 UNICODE_ISDIGIT_METHODDEF
14341 UNICODE_ISNUMERIC_METHODDEF
14342 UNICODE_ISALPHA_METHODDEF
14343 UNICODE_ISALNUM_METHODDEF
14344 UNICODE_ISIDENTIFIER_METHODDEF
14345 UNICODE_ISPRINTABLE_METHODDEF
14346 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014347 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014348 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014349 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014350 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014351 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014352#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014353 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014354 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014355#endif
14356
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014357 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014358 {NULL, NULL}
14359};
14360
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014361static PyObject *
14362unicode_mod(PyObject *v, PyObject *w)
14363{
Brian Curtindfc80e32011-08-10 20:28:54 -050014364 if (!PyUnicode_Check(v))
14365 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014366 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014367}
14368
14369static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014370 0, /*nb_add*/
14371 0, /*nb_subtract*/
14372 0, /*nb_multiply*/
14373 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014374};
14375
Guido van Rossumd57fd912000-03-10 22:53:23 +000014376static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014377 (lenfunc) unicode_length, /* sq_length */
14378 PyUnicode_Concat, /* sq_concat */
14379 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14380 (ssizeargfunc) unicode_getitem, /* sq_item */
14381 0, /* sq_slice */
14382 0, /* sq_ass_item */
14383 0, /* sq_ass_slice */
14384 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014385};
14386
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014387static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014388unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014389{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014390 if (PyUnicode_READY(self) == -1)
14391 return NULL;
14392
Victor Stinnera15e2602020-04-08 02:01:56 +020014393 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014394 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014395 if (i == -1 && PyErr_Occurred())
14396 return NULL;
14397 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014398 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014399 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014400 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014401 Py_ssize_t start, stop, step, slicelength, i;
14402 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014403 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014404 const void *src_data;
14405 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014406 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014407 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014408
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014409 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014410 return NULL;
14411 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014412 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14413 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014414
14415 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014416 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014417 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014418 slicelength == PyUnicode_GET_LENGTH(self)) {
14419 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014420 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014421 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014422 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014423 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014424 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014425 src_kind = PyUnicode_KIND(self);
14426 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014427 if (!PyUnicode_IS_ASCII(self)) {
14428 kind_limit = kind_maxchar_limit(src_kind);
14429 max_char = 0;
14430 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14431 ch = PyUnicode_READ(src_kind, src_data, cur);
14432 if (ch > max_char) {
14433 max_char = ch;
14434 if (max_char >= kind_limit)
14435 break;
14436 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014437 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014438 }
Victor Stinner55c99112011-10-13 01:17:06 +020014439 else
14440 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014441 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014442 if (result == NULL)
14443 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014444 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014445 dest_data = PyUnicode_DATA(result);
14446
14447 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014448 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14449 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014450 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014451 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014452 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014453 } else {
14454 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14455 return NULL;
14456 }
14457}
14458
14459static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014460 (lenfunc)unicode_length, /* mp_length */
14461 (binaryfunc)unicode_subscript, /* mp_subscript */
14462 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014463};
14464
Guido van Rossumd57fd912000-03-10 22:53:23 +000014465
Guido van Rossumd57fd912000-03-10 22:53:23 +000014466/* Helpers for PyUnicode_Format() */
14467
Victor Stinnera47082312012-10-04 02:19:54 +020014468struct unicode_formatter_t {
14469 PyObject *args;
14470 int args_owned;
14471 Py_ssize_t arglen, argidx;
14472 PyObject *dict;
14473
14474 enum PyUnicode_Kind fmtkind;
14475 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014476 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014477 PyObject *fmtstr;
14478
14479 _PyUnicodeWriter writer;
14480};
14481
14482struct unicode_format_arg_t {
14483 Py_UCS4 ch;
14484 int flags;
14485 Py_ssize_t width;
14486 int prec;
14487 int sign;
14488};
14489
Guido van Rossumd57fd912000-03-10 22:53:23 +000014490static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014491unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014492{
Victor Stinnera47082312012-10-04 02:19:54 +020014493 Py_ssize_t argidx = ctx->argidx;
14494
14495 if (argidx < ctx->arglen) {
14496 ctx->argidx++;
14497 if (ctx->arglen < 0)
14498 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014499 else
Victor Stinnera47082312012-10-04 02:19:54 +020014500 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014501 }
14502 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014503 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014504 return NULL;
14505}
14506
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014507/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014508
Victor Stinnera47082312012-10-04 02:19:54 +020014509/* Format a float into the writer if the writer is not NULL, or into *p_output
14510 otherwise.
14511
14512 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014513static int
Victor Stinnera47082312012-10-04 02:19:54 +020014514formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14515 PyObject **p_output,
14516 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014517{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014518 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014519 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014520 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014521 int prec;
14522 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014523
Guido van Rossumd57fd912000-03-10 22:53:23 +000014524 x = PyFloat_AsDouble(v);
14525 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014526 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014527
Victor Stinnera47082312012-10-04 02:19:54 +020014528 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014529 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014530 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014531
Victor Stinnera47082312012-10-04 02:19:54 +020014532 if (arg->flags & F_ALT)
14533 dtoa_flags = Py_DTSF_ALT;
14534 else
14535 dtoa_flags = 0;
14536 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014537 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014538 return -1;
14539 len = strlen(p);
14540 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014541 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014542 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014543 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014544 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014545 }
14546 else
14547 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014548 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014549 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014550}
14551
Victor Stinnerd0880d52012-04-27 23:40:13 +020014552/* formatlong() emulates the format codes d, u, o, x and X, and
14553 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14554 * Python's regular ints.
14555 * Return value: a new PyUnicodeObject*, or NULL if error.
14556 * The output string is of the form
14557 * "-"? ("0x" | "0X")? digit+
14558 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14559 * set in flags. The case of hex digits will be correct,
14560 * There will be at least prec digits, zero-filled on the left if
14561 * necessary to get that many.
14562 * val object to be converted
14563 * flags bitmask of format flags; only F_ALT is looked at
14564 * prec minimum number of digits; 0-fill on left if needed
14565 * type a character in [duoxX]; u acts the same as d
14566 *
14567 * CAUTION: o, x and X conversions on regular ints can never
14568 * produce a '-' sign, but can for Python's unbounded ints.
14569 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014570PyObject *
14571_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014572{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014573 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014574 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014575 Py_ssize_t i;
14576 int sign; /* 1 if '-', else 0 */
14577 int len; /* number of characters */
14578 Py_ssize_t llen;
14579 int numdigits; /* len == numnondigits + numdigits */
14580 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014581
Victor Stinnerd0880d52012-04-27 23:40:13 +020014582 /* Avoid exceeding SSIZE_T_MAX */
14583 if (prec > INT_MAX-3) {
14584 PyErr_SetString(PyExc_OverflowError,
14585 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014586 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014587 }
14588
14589 assert(PyLong_Check(val));
14590
14591 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014592 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014593 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014594 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014595 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014596 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014597 /* int and int subclasses should print numerically when a numeric */
14598 /* format code is used (see issue18780) */
14599 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014600 break;
14601 case 'o':
14602 numnondigits = 2;
14603 result = PyNumber_ToBase(val, 8);
14604 break;
14605 case 'x':
14606 case 'X':
14607 numnondigits = 2;
14608 result = PyNumber_ToBase(val, 16);
14609 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014610 }
14611 if (!result)
14612 return NULL;
14613
14614 assert(unicode_modifiable(result));
14615 assert(PyUnicode_IS_READY(result));
14616 assert(PyUnicode_IS_ASCII(result));
14617
14618 /* To modify the string in-place, there can only be one reference. */
14619 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014620 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014621 PyErr_BadInternalCall();
14622 return NULL;
14623 }
14624 buf = PyUnicode_DATA(result);
14625 llen = PyUnicode_GET_LENGTH(result);
14626 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014627 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014628 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014629 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014630 return NULL;
14631 }
14632 len = (int)llen;
14633 sign = buf[0] == '-';
14634 numnondigits += sign;
14635 numdigits = len - numnondigits;
14636 assert(numdigits > 0);
14637
14638 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014639 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014640 (type == 'o' || type == 'x' || type == 'X'))) {
14641 assert(buf[sign] == '0');
14642 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14643 buf[sign+1] == 'o');
14644 numnondigits -= 2;
14645 buf += 2;
14646 len -= 2;
14647 if (sign)
14648 buf[0] = '-';
14649 assert(len == numnondigits + numdigits);
14650 assert(numdigits > 0);
14651 }
14652
14653 /* Fill with leading zeroes to meet minimum width. */
14654 if (prec > numdigits) {
14655 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14656 numnondigits + prec);
14657 char *b1;
14658 if (!r1) {
14659 Py_DECREF(result);
14660 return NULL;
14661 }
14662 b1 = PyBytes_AS_STRING(r1);
14663 for (i = 0; i < numnondigits; ++i)
14664 *b1++ = *buf++;
14665 for (i = 0; i < prec - numdigits; i++)
14666 *b1++ = '0';
14667 for (i = 0; i < numdigits; i++)
14668 *b1++ = *buf++;
14669 *b1 = '\0';
14670 Py_DECREF(result);
14671 result = r1;
14672 buf = PyBytes_AS_STRING(result);
14673 len = numnondigits + prec;
14674 }
14675
14676 /* Fix up case for hex conversions. */
14677 if (type == 'X') {
14678 /* Need to convert all lower case letters to upper case.
14679 and need to convert 0x to 0X (and -0x to -0X). */
14680 for (i = 0; i < len; i++)
14681 if (buf[i] >= 'a' && buf[i] <= 'x')
14682 buf[i] -= 'a'-'A';
14683 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014684 if (!PyUnicode_Check(result)
14685 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014686 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014687 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014688 Py_DECREF(result);
14689 result = unicode;
14690 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014691 else if (len != PyUnicode_GET_LENGTH(result)) {
14692 if (PyUnicode_Resize(&result, len) < 0)
14693 Py_CLEAR(result);
14694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014695 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014696}
14697
Ethan Furmandf3ed242014-01-05 06:50:30 -080014698/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014699 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014700 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014701 * -1 and raise an exception on error */
14702static int
Victor Stinnera47082312012-10-04 02:19:54 +020014703mainformatlong(PyObject *v,
14704 struct unicode_format_arg_t *arg,
14705 PyObject **p_output,
14706 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014707{
14708 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014709 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014710
14711 if (!PyNumber_Check(v))
14712 goto wrongtype;
14713
Ethan Furman9ab74802014-03-21 06:38:46 -070014714 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014715 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014716 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014717 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014718 }
14719 else {
14720 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014721 }
14722 if (iobj == NULL ) {
14723 if (PyErr_ExceptionMatches(PyExc_TypeError))
14724 goto wrongtype;
14725 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014726 }
14727 assert(PyLong_Check(iobj));
14728 }
14729 else {
14730 iobj = v;
14731 Py_INCREF(iobj);
14732 }
14733
14734 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014735 && arg->width == -1 && arg->prec == -1
14736 && !(arg->flags & (F_SIGN | F_BLANK))
14737 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014738 {
14739 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014740 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014741 int base;
14742
Victor Stinnera47082312012-10-04 02:19:54 +020014743 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014744 {
14745 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014746 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014747 case 'd':
14748 case 'i':
14749 case 'u':
14750 base = 10;
14751 break;
14752 case 'o':
14753 base = 8;
14754 break;
14755 case 'x':
14756 case 'X':
14757 base = 16;
14758 break;
14759 }
14760
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014761 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14762 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014763 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014764 }
14765 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014766 return 1;
14767 }
14768
Ethan Furmanb95b5612015-01-23 20:05:18 -080014769 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014770 Py_DECREF(iobj);
14771 if (res == NULL)
14772 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014773 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014774 return 0;
14775
14776wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014777 switch(type)
14778 {
14779 case 'o':
14780 case 'x':
14781 case 'X':
14782 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014783 "%%%c format: an integer is required, "
14784 "not %.200s",
14785 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014786 break;
14787 default:
14788 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014789 "%%%c format: a number is required, "
14790 "not %.200s",
14791 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014792 break;
14793 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014794 return -1;
14795}
14796
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014797static Py_UCS4
14798formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014799{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014800 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014801 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014802 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014803 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014804 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014805 goto onError;
14806 }
14807 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014808 int overflow;
14809 long x = PyLong_AsLongAndOverflow(v, &overflow);
14810 if (x == -1 && PyErr_Occurred()) {
14811 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014812 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014813 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014814 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014815 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014816
Victor Stinner8faf8212011-12-08 22:14:11 +010014817 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014818 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014819 PyErr_SetString(PyExc_OverflowError,
14820 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014821 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014822 }
14823
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014824 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014825 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014826
Benjamin Peterson29060642009-01-31 22:14:21 +000014827 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014828 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014829 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014830 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014831}
14832
Victor Stinnera47082312012-10-04 02:19:54 +020014833/* Parse options of an argument: flags, width, precision.
14834 Handle also "%(name)" syntax.
14835
14836 Return 0 if the argument has been formatted into arg->str.
14837 Return 1 if the argument has been written into ctx->writer,
14838 Raise an exception and return -1 on error. */
14839static int
14840unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14841 struct unicode_format_arg_t *arg)
14842{
14843#define FORMAT_READ(ctx) \
14844 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14845
14846 PyObject *v;
14847
Victor Stinnera47082312012-10-04 02:19:54 +020014848 if (arg->ch == '(') {
14849 /* Get argument value from a dictionary. Example: "%(name)s". */
14850 Py_ssize_t keystart;
14851 Py_ssize_t keylen;
14852 PyObject *key;
14853 int pcount = 1;
14854
14855 if (ctx->dict == NULL) {
14856 PyErr_SetString(PyExc_TypeError,
14857 "format requires a mapping");
14858 return -1;
14859 }
14860 ++ctx->fmtpos;
14861 --ctx->fmtcnt;
14862 keystart = ctx->fmtpos;
14863 /* Skip over balanced parentheses */
14864 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14865 arg->ch = FORMAT_READ(ctx);
14866 if (arg->ch == ')')
14867 --pcount;
14868 else if (arg->ch == '(')
14869 ++pcount;
14870 ctx->fmtpos++;
14871 }
14872 keylen = ctx->fmtpos - keystart - 1;
14873 if (ctx->fmtcnt < 0 || pcount > 0) {
14874 PyErr_SetString(PyExc_ValueError,
14875 "incomplete format key");
14876 return -1;
14877 }
14878 key = PyUnicode_Substring(ctx->fmtstr,
14879 keystart, keystart + keylen);
14880 if (key == NULL)
14881 return -1;
14882 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014883 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014884 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014885 }
14886 ctx->args = PyObject_GetItem(ctx->dict, key);
14887 Py_DECREF(key);
14888 if (ctx->args == NULL)
14889 return -1;
14890 ctx->args_owned = 1;
14891 ctx->arglen = -1;
14892 ctx->argidx = -2;
14893 }
14894
14895 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014896 while (--ctx->fmtcnt >= 0) {
14897 arg->ch = FORMAT_READ(ctx);
14898 ctx->fmtpos++;
14899 switch (arg->ch) {
14900 case '-': arg->flags |= F_LJUST; continue;
14901 case '+': arg->flags |= F_SIGN; continue;
14902 case ' ': arg->flags |= F_BLANK; continue;
14903 case '#': arg->flags |= F_ALT; continue;
14904 case '0': arg->flags |= F_ZERO; continue;
14905 }
14906 break;
14907 }
14908
14909 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014910 if (arg->ch == '*') {
14911 v = unicode_format_getnextarg(ctx);
14912 if (v == NULL)
14913 return -1;
14914 if (!PyLong_Check(v)) {
14915 PyErr_SetString(PyExc_TypeError,
14916 "* wants int");
14917 return -1;
14918 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014919 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014920 if (arg->width == -1 && PyErr_Occurred())
14921 return -1;
14922 if (arg->width < 0) {
14923 arg->flags |= F_LJUST;
14924 arg->width = -arg->width;
14925 }
14926 if (--ctx->fmtcnt >= 0) {
14927 arg->ch = FORMAT_READ(ctx);
14928 ctx->fmtpos++;
14929 }
14930 }
14931 else if (arg->ch >= '0' && arg->ch <= '9') {
14932 arg->width = arg->ch - '0';
14933 while (--ctx->fmtcnt >= 0) {
14934 arg->ch = FORMAT_READ(ctx);
14935 ctx->fmtpos++;
14936 if (arg->ch < '0' || arg->ch > '9')
14937 break;
14938 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14939 mixing signed and unsigned comparison. Since arg->ch is between
14940 '0' and '9', casting to int is safe. */
14941 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14942 PyErr_SetString(PyExc_ValueError,
14943 "width too big");
14944 return -1;
14945 }
14946 arg->width = arg->width*10 + (arg->ch - '0');
14947 }
14948 }
14949
14950 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014951 if (arg->ch == '.') {
14952 arg->prec = 0;
14953 if (--ctx->fmtcnt >= 0) {
14954 arg->ch = FORMAT_READ(ctx);
14955 ctx->fmtpos++;
14956 }
14957 if (arg->ch == '*') {
14958 v = unicode_format_getnextarg(ctx);
14959 if (v == NULL)
14960 return -1;
14961 if (!PyLong_Check(v)) {
14962 PyErr_SetString(PyExc_TypeError,
14963 "* wants int");
14964 return -1;
14965 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014966 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014967 if (arg->prec == -1 && PyErr_Occurred())
14968 return -1;
14969 if (arg->prec < 0)
14970 arg->prec = 0;
14971 if (--ctx->fmtcnt >= 0) {
14972 arg->ch = FORMAT_READ(ctx);
14973 ctx->fmtpos++;
14974 }
14975 }
14976 else if (arg->ch >= '0' && arg->ch <= '9') {
14977 arg->prec = arg->ch - '0';
14978 while (--ctx->fmtcnt >= 0) {
14979 arg->ch = FORMAT_READ(ctx);
14980 ctx->fmtpos++;
14981 if (arg->ch < '0' || arg->ch > '9')
14982 break;
14983 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14984 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014985 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014986 return -1;
14987 }
14988 arg->prec = arg->prec*10 + (arg->ch - '0');
14989 }
14990 }
14991 }
14992
14993 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14994 if (ctx->fmtcnt >= 0) {
14995 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14996 if (--ctx->fmtcnt >= 0) {
14997 arg->ch = FORMAT_READ(ctx);
14998 ctx->fmtpos++;
14999 }
15000 }
15001 }
15002 if (ctx->fmtcnt < 0) {
15003 PyErr_SetString(PyExc_ValueError,
15004 "incomplete format");
15005 return -1;
15006 }
15007 return 0;
15008
15009#undef FORMAT_READ
15010}
15011
15012/* Format one argument. Supported conversion specifiers:
15013
15014 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015015 - "i", "d", "u": int or float
15016 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015017 - "e", "E", "f", "F", "g", "G": float
15018 - "c": int or str (1 character)
15019
Victor Stinner8dbd4212012-12-04 09:30:24 +010015020 When possible, the output is written directly into the Unicode writer
15021 (ctx->writer). A string is created when padding is required.
15022
Victor Stinnera47082312012-10-04 02:19:54 +020015023 Return 0 if the argument has been formatted into *p_str,
15024 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015025 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015026static int
15027unicode_format_arg_format(struct unicode_formatter_t *ctx,
15028 struct unicode_format_arg_t *arg,
15029 PyObject **p_str)
15030{
15031 PyObject *v;
15032 _PyUnicodeWriter *writer = &ctx->writer;
15033
15034 if (ctx->fmtcnt == 0)
15035 ctx->writer.overallocate = 0;
15036
Victor Stinnera47082312012-10-04 02:19:54 +020015037 v = unicode_format_getnextarg(ctx);
15038 if (v == NULL)
15039 return -1;
15040
Victor Stinnera47082312012-10-04 02:19:54 +020015041
15042 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015043 case 's':
15044 case 'r':
15045 case 'a':
15046 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15047 /* Fast path */
15048 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15049 return -1;
15050 return 1;
15051 }
15052
15053 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15054 *p_str = v;
15055 Py_INCREF(*p_str);
15056 }
15057 else {
15058 if (arg->ch == 's')
15059 *p_str = PyObject_Str(v);
15060 else if (arg->ch == 'r')
15061 *p_str = PyObject_Repr(v);
15062 else
15063 *p_str = PyObject_ASCII(v);
15064 }
15065 break;
15066
15067 case 'i':
15068 case 'd':
15069 case 'u':
15070 case 'o':
15071 case 'x':
15072 case 'X':
15073 {
15074 int ret = mainformatlong(v, arg, p_str, writer);
15075 if (ret != 0)
15076 return ret;
15077 arg->sign = 1;
15078 break;
15079 }
15080
15081 case 'e':
15082 case 'E':
15083 case 'f':
15084 case 'F':
15085 case 'g':
15086 case 'G':
15087 if (arg->width == -1 && arg->prec == -1
15088 && !(arg->flags & (F_SIGN | F_BLANK)))
15089 {
15090 /* Fast path */
15091 if (formatfloat(v, arg, NULL, writer) == -1)
15092 return -1;
15093 return 1;
15094 }
15095
15096 arg->sign = 1;
15097 if (formatfloat(v, arg, p_str, NULL) == -1)
15098 return -1;
15099 break;
15100
15101 case 'c':
15102 {
15103 Py_UCS4 ch = formatchar(v);
15104 if (ch == (Py_UCS4) -1)
15105 return -1;
15106 if (arg->width == -1 && arg->prec == -1) {
15107 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015108 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015109 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015110 return 1;
15111 }
15112 *p_str = PyUnicode_FromOrdinal(ch);
15113 break;
15114 }
15115
15116 default:
15117 PyErr_Format(PyExc_ValueError,
15118 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015119 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015120 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15121 (int)arg->ch,
15122 ctx->fmtpos - 1);
15123 return -1;
15124 }
15125 if (*p_str == NULL)
15126 return -1;
15127 assert (PyUnicode_Check(*p_str));
15128 return 0;
15129}
15130
15131static int
15132unicode_format_arg_output(struct unicode_formatter_t *ctx,
15133 struct unicode_format_arg_t *arg,
15134 PyObject *str)
15135{
15136 Py_ssize_t len;
15137 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015138 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015139 Py_ssize_t pindex;
15140 Py_UCS4 signchar;
15141 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015142 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015143 Py_ssize_t sublen;
15144 _PyUnicodeWriter *writer = &ctx->writer;
15145 Py_UCS4 fill;
15146
15147 fill = ' ';
15148 if (arg->sign && arg->flags & F_ZERO)
15149 fill = '0';
15150
15151 if (PyUnicode_READY(str) == -1)
15152 return -1;
15153
15154 len = PyUnicode_GET_LENGTH(str);
15155 if ((arg->width == -1 || arg->width <= len)
15156 && (arg->prec == -1 || arg->prec >= len)
15157 && !(arg->flags & (F_SIGN | F_BLANK)))
15158 {
15159 /* Fast path */
15160 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15161 return -1;
15162 return 0;
15163 }
15164
15165 /* Truncate the string for "s", "r" and "a" formats
15166 if the precision is set */
15167 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15168 if (arg->prec >= 0 && len > arg->prec)
15169 len = arg->prec;
15170 }
15171
15172 /* Adjust sign and width */
15173 kind = PyUnicode_KIND(str);
15174 pbuf = PyUnicode_DATA(str);
15175 pindex = 0;
15176 signchar = '\0';
15177 if (arg->sign) {
15178 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15179 if (ch == '-' || ch == '+') {
15180 signchar = ch;
15181 len--;
15182 pindex++;
15183 }
15184 else if (arg->flags & F_SIGN)
15185 signchar = '+';
15186 else if (arg->flags & F_BLANK)
15187 signchar = ' ';
15188 else
15189 arg->sign = 0;
15190 }
15191 if (arg->width < len)
15192 arg->width = len;
15193
15194 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015195 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015196 if (!(arg->flags & F_LJUST)) {
15197 if (arg->sign) {
15198 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015199 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015200 }
15201 else {
15202 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015203 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015204 }
15205 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015206 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15207 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015208 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015209 }
15210
Victor Stinnera47082312012-10-04 02:19:54 +020015211 buflen = arg->width;
15212 if (arg->sign && len == arg->width)
15213 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015214 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015215 return -1;
15216
15217 /* Write the sign if needed */
15218 if (arg->sign) {
15219 if (fill != ' ') {
15220 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15221 writer->pos += 1;
15222 }
15223 if (arg->width > len)
15224 arg->width--;
15225 }
15226
15227 /* Write the numeric prefix for "x", "X" and "o" formats
15228 if the alternate form is used.
15229 For example, write "0x" for the "%#x" format. */
15230 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15231 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15232 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15233 if (fill != ' ') {
15234 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15235 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15236 writer->pos += 2;
15237 pindex += 2;
15238 }
15239 arg->width -= 2;
15240 if (arg->width < 0)
15241 arg->width = 0;
15242 len -= 2;
15243 }
15244
15245 /* Pad left with the fill character if needed */
15246 if (arg->width > len && !(arg->flags & F_LJUST)) {
15247 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015248 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015249 writer->pos += sublen;
15250 arg->width = len;
15251 }
15252
15253 /* If padding with spaces: write sign if needed and/or numeric prefix if
15254 the alternate form is used */
15255 if (fill == ' ') {
15256 if (arg->sign) {
15257 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15258 writer->pos += 1;
15259 }
15260 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15261 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15262 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15263 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15264 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15265 writer->pos += 2;
15266 pindex += 2;
15267 }
15268 }
15269
15270 /* Write characters */
15271 if (len) {
15272 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15273 str, pindex, len);
15274 writer->pos += len;
15275 }
15276
15277 /* Pad right with the fill character if needed */
15278 if (arg->width > len) {
15279 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015280 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015281 writer->pos += sublen;
15282 }
15283 return 0;
15284}
15285
15286/* Helper of PyUnicode_Format(): format one arg.
15287 Return 0 on success, raise an exception and return -1 on error. */
15288static int
15289unicode_format_arg(struct unicode_formatter_t *ctx)
15290{
15291 struct unicode_format_arg_t arg;
15292 PyObject *str;
15293 int ret;
15294
Victor Stinner8dbd4212012-12-04 09:30:24 +010015295 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015296 if (arg.ch == '%') {
15297 ctx->fmtpos++;
15298 ctx->fmtcnt--;
15299 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15300 return -1;
15301 return 0;
15302 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015303 arg.flags = 0;
15304 arg.width = -1;
15305 arg.prec = -1;
15306 arg.sign = 0;
15307 str = NULL;
15308
Victor Stinnera47082312012-10-04 02:19:54 +020015309 ret = unicode_format_arg_parse(ctx, &arg);
15310 if (ret == -1)
15311 return -1;
15312
15313 ret = unicode_format_arg_format(ctx, &arg, &str);
15314 if (ret == -1)
15315 return -1;
15316
15317 if (ret != 1) {
15318 ret = unicode_format_arg_output(ctx, &arg, str);
15319 Py_DECREF(str);
15320 if (ret == -1)
15321 return -1;
15322 }
15323
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015324 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015325 PyErr_SetString(PyExc_TypeError,
15326 "not all arguments converted during string formatting");
15327 return -1;
15328 }
15329 return 0;
15330}
15331
Alexander Belopolsky40018472011-02-26 01:02:56 +000015332PyObject *
15333PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015334{
Victor Stinnera47082312012-10-04 02:19:54 +020015335 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015336
Guido van Rossumd57fd912000-03-10 22:53:23 +000015337 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015338 PyErr_BadInternalCall();
15339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015340 }
Victor Stinnera47082312012-10-04 02:19:54 +020015341
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015342 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015343 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015344
15345 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015346 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15347 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15348 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15349 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015350
Victor Stinner8f674cc2013-04-17 23:02:17 +020015351 _PyUnicodeWriter_Init(&ctx.writer);
15352 ctx.writer.min_length = ctx.fmtcnt + 100;
15353 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015354
Guido van Rossumd57fd912000-03-10 22:53:23 +000015355 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015356 ctx.arglen = PyTuple_Size(args);
15357 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015358 }
15359 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015360 ctx.arglen = -1;
15361 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015362 }
Victor Stinnera47082312012-10-04 02:19:54 +020015363 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015364 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015365 ctx.dict = args;
15366 else
15367 ctx.dict = NULL;
15368 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015369
Victor Stinnera47082312012-10-04 02:19:54 +020015370 while (--ctx.fmtcnt >= 0) {
15371 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015372 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015373
15374 nonfmtpos = ctx.fmtpos++;
15375 while (ctx.fmtcnt >= 0 &&
15376 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15377 ctx.fmtpos++;
15378 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015379 }
Victor Stinnera47082312012-10-04 02:19:54 +020015380 if (ctx.fmtcnt < 0) {
15381 ctx.fmtpos--;
15382 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015383 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015384
Victor Stinnercfc4c132013-04-03 01:48:39 +020015385 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15386 nonfmtpos, ctx.fmtpos) < 0)
15387 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015388 }
15389 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015390 ctx.fmtpos++;
15391 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015392 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015393 }
15394 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015395
Victor Stinnera47082312012-10-04 02:19:54 +020015396 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015397 PyErr_SetString(PyExc_TypeError,
15398 "not all arguments converted during string formatting");
15399 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015400 }
15401
Victor Stinnera47082312012-10-04 02:19:54 +020015402 if (ctx.args_owned) {
15403 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015404 }
Victor Stinnera47082312012-10-04 02:19:54 +020015405 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015406
Benjamin Peterson29060642009-01-31 22:14:21 +000015407 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015408 _PyUnicodeWriter_Dealloc(&ctx.writer);
15409 if (ctx.args_owned) {
15410 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015411 }
15412 return NULL;
15413}
15414
Jeremy Hylton938ace62002-07-17 16:30:39 +000015415static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015416unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15417
Tim Peters6d6c1a32001-08-02 04:15:00 +000015418static PyObject *
15419unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15420{
Benjamin Peterson29060642009-01-31 22:14:21 +000015421 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015422 static char *kwlist[] = {"object", "encoding", "errors", 0};
15423 char *encoding = NULL;
15424 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015425
Benjamin Peterson14339b62009-01-31 16:36:08 +000015426 if (type != &PyUnicode_Type)
15427 return unicode_subtype_new(type, args, kwds);
15428 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015429 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015430 return NULL;
15431 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015432 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015433 if (encoding == NULL && errors == NULL)
15434 return PyObject_Str(x);
15435 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015436 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015437}
15438
Guido van Rossume023fe02001-08-30 03:12:59 +000015439static PyObject *
15440unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15441{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015442 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015443 Py_ssize_t length, char_size;
15444 int share_wstr, share_utf8;
15445 unsigned int kind;
15446 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015447
Benjamin Peterson14339b62009-01-31 16:36:08 +000015448 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015449
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015450 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015451 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015452 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015453 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015454 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015455 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015456 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015457 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015458
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015459 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015460 if (self == NULL) {
15461 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015462 return NULL;
15463 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015464 kind = PyUnicode_KIND(unicode);
15465 length = PyUnicode_GET_LENGTH(unicode);
15466
15467 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015468#ifdef Py_DEBUG
15469 _PyUnicode_HASH(self) = -1;
15470#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015471 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015472#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015473 _PyUnicode_STATE(self).interned = 0;
15474 _PyUnicode_STATE(self).kind = kind;
15475 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015476 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015477 _PyUnicode_STATE(self).ready = 1;
15478 _PyUnicode_WSTR(self) = NULL;
15479 _PyUnicode_UTF8_LENGTH(self) = 0;
15480 _PyUnicode_UTF8(self) = NULL;
15481 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015482 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015483
15484 share_utf8 = 0;
15485 share_wstr = 0;
15486 if (kind == PyUnicode_1BYTE_KIND) {
15487 char_size = 1;
15488 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15489 share_utf8 = 1;
15490 }
15491 else if (kind == PyUnicode_2BYTE_KIND) {
15492 char_size = 2;
15493 if (sizeof(wchar_t) == 2)
15494 share_wstr = 1;
15495 }
15496 else {
15497 assert(kind == PyUnicode_4BYTE_KIND);
15498 char_size = 4;
15499 if (sizeof(wchar_t) == 4)
15500 share_wstr = 1;
15501 }
15502
15503 /* Ensure we won't overflow the length. */
15504 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15505 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015506 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015507 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015508 data = PyObject_MALLOC((length + 1) * char_size);
15509 if (data == NULL) {
15510 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015511 goto onError;
15512 }
15513
Victor Stinnerc3c74152011-10-02 20:39:55 +020015514 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015515 if (share_utf8) {
15516 _PyUnicode_UTF8_LENGTH(self) = length;
15517 _PyUnicode_UTF8(self) = data;
15518 }
15519 if (share_wstr) {
15520 _PyUnicode_WSTR_LENGTH(self) = length;
15521 _PyUnicode_WSTR(self) = (wchar_t *)data;
15522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015523
Christian Heimesf051e432016-09-13 20:22:02 +020015524 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015525 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015526 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015527#ifdef Py_DEBUG
15528 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15529#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015530 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015531 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015532
15533onError:
15534 Py_DECREF(unicode);
15535 Py_DECREF(self);
15536 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015537}
15538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015539PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015540"str(object='') -> str\n\
15541str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015542\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015543Create a new string object from the given object. If encoding or\n\
15544errors is specified, then the object must expose a data buffer\n\
15545that will be decoded using the given encoding and error handler.\n\
15546Otherwise, returns the result of object.__str__() (if defined)\n\
15547or repr(object).\n\
15548encoding defaults to sys.getdefaultencoding().\n\
15549errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015550
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015551static PyObject *unicode_iter(PyObject *seq);
15552
Guido van Rossumd57fd912000-03-10 22:53:23 +000015553PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015554 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015555 "str", /* tp_name */
15556 sizeof(PyUnicodeObject), /* tp_basicsize */
15557 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015558 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015559 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015560 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015561 0, /* tp_getattr */
15562 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015563 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015564 unicode_repr, /* tp_repr */
15565 &unicode_as_number, /* tp_as_number */
15566 &unicode_as_sequence, /* tp_as_sequence */
15567 &unicode_as_mapping, /* tp_as_mapping */
15568 (hashfunc) unicode_hash, /* tp_hash*/
15569 0, /* tp_call*/
15570 (reprfunc) unicode_str, /* tp_str */
15571 PyObject_GenericGetAttr, /* tp_getattro */
15572 0, /* tp_setattro */
15573 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015574 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015575 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15576 unicode_doc, /* tp_doc */
15577 0, /* tp_traverse */
15578 0, /* tp_clear */
15579 PyUnicode_RichCompare, /* tp_richcompare */
15580 0, /* tp_weaklistoffset */
15581 unicode_iter, /* tp_iter */
15582 0, /* tp_iternext */
15583 unicode_methods, /* tp_methods */
15584 0, /* tp_members */
15585 0, /* tp_getset */
15586 &PyBaseObject_Type, /* tp_base */
15587 0, /* tp_dict */
15588 0, /* tp_descr_get */
15589 0, /* tp_descr_set */
15590 0, /* tp_dictoffset */
15591 0, /* tp_init */
15592 0, /* tp_alloc */
15593 unicode_new, /* tp_new */
15594 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015595};
15596
15597/* Initialize the Unicode implementation */
15598
Victor Stinner331a6a52019-05-27 16:39:22 +020015599PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015600_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015601{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015602 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015603 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015604 0x000A, /* LINE FEED */
15605 0x000D, /* CARRIAGE RETURN */
15606 0x001C, /* FILE SEPARATOR */
15607 0x001D, /* GROUP SEPARATOR */
15608 0x001E, /* RECORD SEPARATOR */
15609 0x0085, /* NEXT LINE */
15610 0x2028, /* LINE SEPARATOR */
15611 0x2029, /* PARAGRAPH SEPARATOR */
15612 };
15613
Victor Stinner91698d82020-06-25 14:07:40 +020015614 struct _Py_unicode_state *state = &tstate->interp->unicode;
15615 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015616 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015617 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015618
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015619 if (_Py_IsMainInterpreter(tstate)) {
15620 /* initialize the linebreak bloom filter */
15621 bloom_linebreak = make_bloom_mask(
15622 PyUnicode_2BYTE_KIND, linebreak,
15623 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015624
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015625 if (PyType_Ready(&PyUnicode_Type) < 0) {
15626 return _PyStatus_ERR("Can't initialize unicode type");
15627 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015628
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015629 if (PyType_Ready(&EncodingMapType) < 0) {
15630 return _PyStatus_ERR("Can't initialize encoding map type");
15631 }
15632 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15633 return _PyStatus_ERR("Can't initialize field name iterator type");
15634 }
15635 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15636 return _PyStatus_ERR("Can't initialize formatter iter type");
15637 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015638 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015639 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015640}
15641
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015642
Walter Dörwald16807132007-05-25 13:52:07 +000015643void
15644PyUnicode_InternInPlace(PyObject **p)
15645{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015646 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015647#ifdef Py_DEBUG
15648 assert(s != NULL);
15649 assert(_PyUnicode_CHECK(s));
15650#else
Victor Stinner607b1022020-05-05 18:50:30 +020015651 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015652 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015653 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015654#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015655
Benjamin Peterson14339b62009-01-31 16:36:08 +000015656 /* If it's a subclass, we don't really know what putting
15657 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015658 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015659 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015660 }
15661
15662 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015663 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015664 }
15665
15666#ifdef INTERNED_STRINGS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015667 if (interned == NULL) {
15668 interned = PyDict_New();
15669 if (interned == NULL) {
15670 PyErr_Clear(); /* Don't leave an exception */
15671 return;
15672 }
15673 }
Victor Stinner607b1022020-05-05 18:50:30 +020015674
15675 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015676 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015677 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015678 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015679
Berker Peksagced8d4c2016-07-25 04:40:39 +030015680 if (t == NULL) {
15681 PyErr_Clear();
15682 return;
15683 }
Victor Stinner607b1022020-05-05 18:50:30 +020015684
Berker Peksagced8d4c2016-07-25 04:40:39 +030015685 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015686 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015687 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015688 return;
15689 }
Victor Stinner607b1022020-05-05 18:50:30 +020015690
Benjamin Peterson14339b62009-01-31 16:36:08 +000015691 /* The two references in interned are not counted by refcnt.
15692 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015693 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015694 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015695#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015696}
15697
15698void
15699PyUnicode_InternImmortal(PyObject **p)
15700{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015701 PyUnicode_InternInPlace(p);
15702 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015703 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015704 Py_INCREF(*p);
15705 }
Walter Dörwald16807132007-05-25 13:52:07 +000015706}
15707
15708PyObject *
15709PyUnicode_InternFromString(const char *cp)
15710{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015711 PyObject *s = PyUnicode_FromString(cp);
15712 if (s == NULL)
15713 return NULL;
15714 PyUnicode_InternInPlace(&s);
15715 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015716}
15717
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015718
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Release the interned dict for the benefit of memory debuggers.

   Since this function is intended to help a leak detector, interned strings
   are not forcibly deallocated.  Instead every key of the interned dict
   gets back the references that interning left uncounted and is marked
   SSTATE_NOT_INTERNED; the dict is then cleared and released.

   Does nothing if there is no interned dict, and clears the exception and
   returns if the list of keys cannot be obtained. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }

    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %zd interned strings\n", n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif

    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }

        /* Number of references to add back to this string's refcount. */
        Py_ssize_t uncounted = 0;
        const int how = PyUnicode_CHECK_INTERNED(s);
        if (how == SSTATE_INTERNED_IMMORTAL) {
            uncounted = 1;
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
        }
        else if (how == SSTATE_INTERNED_MORTAL) {
            uncounted = 2;
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
        }
        else {
            /* SSTATE_NOT_INTERNED or an unknown state: a key of the
               interned dict must be interned. */
            Py_UNREACHABLE();
        }

        Py_SET_REFCNT(s, Py_REFCNT(s) + uncounted);
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }

#ifdef INTERNED_STATS
    fprintf(stderr,
            "total size of all interned strings: %zd/%zd mortal/immortal\n",
            mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015778
15779
15780/********************* Unicode Iterator **************************/
15781
15782typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015783 PyObject_HEAD
15784 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015785 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015786} unicodeiterobject;
15787
15788static void
15789unicodeiter_dealloc(unicodeiterobject *it)
15790{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015791 _PyObject_GC_UNTRACK(it);
15792 Py_XDECREF(it->it_seq);
15793 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015794}
15795
15796static int
15797unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15798{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015799 Py_VISIT(it->it_seq);
15800 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015801}
15802
15803static PyObject *
15804unicodeiter_next(unicodeiterobject *it)
15805{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015806 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015807
Benjamin Peterson14339b62009-01-31 16:36:08 +000015808 assert(it != NULL);
15809 seq = it->it_seq;
15810 if (seq == NULL)
15811 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015812 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015814 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15815 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015816 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015817 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15818 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015819 if (item != NULL)
15820 ++it->it_index;
15821 return item;
15822 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015823
Benjamin Peterson14339b62009-01-31 16:36:08 +000015824 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015825 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015826 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015827}
15828
15829static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015830unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015831{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015832 Py_ssize_t len = 0;
15833 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015834 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015835 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015836}
15837
15838PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15839
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015840static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015841unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015842{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015843 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015844 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015845 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015846 it->it_seq, it->it_index);
15847 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015848 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015849 if (u == NULL)
15850 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015851 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015852 }
15853}
15854
15855PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15856
15857static PyObject *
15858unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15859{
15860 Py_ssize_t index = PyLong_AsSsize_t(state);
15861 if (index == -1 && PyErr_Occurred())
15862 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015863 if (it->it_seq != NULL) {
15864 if (index < 0)
15865 index = 0;
15866 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15867 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15868 it->it_index = index;
15869 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015870 Py_RETURN_NONE;
15871}
15872
15873PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15874
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015875static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015876 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015877 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015878 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15879 reduce_doc},
15880 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15881 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015882 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015883};
15884
15885PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015886 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15887 "str_iterator", /* tp_name */
15888 sizeof(unicodeiterobject), /* tp_basicsize */
15889 0, /* tp_itemsize */
15890 /* methods */
15891 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015892 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015893 0, /* tp_getattr */
15894 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015895 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015896 0, /* tp_repr */
15897 0, /* tp_as_number */
15898 0, /* tp_as_sequence */
15899 0, /* tp_as_mapping */
15900 0, /* tp_hash */
15901 0, /* tp_call */
15902 0, /* tp_str */
15903 PyObject_GenericGetAttr, /* tp_getattro */
15904 0, /* tp_setattro */
15905 0, /* tp_as_buffer */
15906 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15907 0, /* tp_doc */
15908 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15909 0, /* tp_clear */
15910 0, /* tp_richcompare */
15911 0, /* tp_weaklistoffset */
15912 PyObject_SelfIter, /* tp_iter */
15913 (iternextfunc)unicodeiter_next, /* tp_iternext */
15914 unicodeiter_methods, /* tp_methods */
15915 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015916};
15917
15918static PyObject *
15919unicode_iter(PyObject *seq)
15920{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015921 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015922
Benjamin Peterson14339b62009-01-31 16:36:08 +000015923 if (!PyUnicode_Check(seq)) {
15924 PyErr_BadInternalCall();
15925 return NULL;
15926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015927 if (PyUnicode_READY(seq) == -1)
15928 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015929 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15930 if (it == NULL)
15931 return NULL;
15932 it->it_index = 0;
15933 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015934 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015935 _PyObject_GC_TRACK(it);
15936 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015937}
15938
Victor Stinner709d23d2019-05-02 14:56:30 -040015939static int
15940encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015941{
Victor Stinner709d23d2019-05-02 14:56:30 -040015942 int res;
15943 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15944 if (res == -2) {
15945 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15946 return -1;
15947 }
15948 if (res < 0) {
15949 PyErr_NoMemory();
15950 return -1;
15951 }
15952 return 0;
15953}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015954
Victor Stinner709d23d2019-05-02 14:56:30 -040015955
15956static int
15957config_get_codec_name(wchar_t **config_encoding)
15958{
15959 char *encoding;
15960 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15961 return -1;
15962 }
15963
15964 PyObject *name_obj = NULL;
15965 PyObject *codec = _PyCodec_Lookup(encoding);
15966 PyMem_RawFree(encoding);
15967
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015968 if (!codec)
15969 goto error;
15970
15971 name_obj = PyObject_GetAttrString(codec, "name");
15972 Py_CLEAR(codec);
15973 if (!name_obj) {
15974 goto error;
15975 }
15976
Victor Stinner709d23d2019-05-02 14:56:30 -040015977 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15978 Py_DECREF(name_obj);
15979 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015980 goto error;
15981 }
15982
Victor Stinner709d23d2019-05-02 14:56:30 -040015983 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15984 if (raw_wname == NULL) {
15985 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015986 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015987 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015988 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015989
15990 PyMem_RawFree(*config_encoding);
15991 *config_encoding = raw_wname;
15992
15993 PyMem_Free(wname);
15994 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015995
15996error:
15997 Py_XDECREF(codec);
15998 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015999 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016000}
16001
16002
Victor Stinner331a6a52019-05-27 16:39:22 +020016003static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016004init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016005{
Victor Stinner709d23d2019-05-02 14:56:30 -040016006 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016007 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016008 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016009 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016010 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016011 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016012 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016013}
16014
16015
Victor Stinner709d23d2019-05-02 14:56:30 -040016016static int
16017init_fs_codec(PyInterpreterState *interp)
16018{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016019 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016020
16021 _Py_error_handler error_handler;
16022 error_handler = get_error_handler_wide(config->filesystem_errors);
16023 if (error_handler == _Py_ERROR_UNKNOWN) {
16024 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16025 return -1;
16026 }
16027
16028 char *encoding, *errors;
16029 if (encode_wstr_utf8(config->filesystem_encoding,
16030 &encoding,
16031 "filesystem_encoding") < 0) {
16032 return -1;
16033 }
16034
16035 if (encode_wstr_utf8(config->filesystem_errors,
16036 &errors,
16037 "filesystem_errors") < 0) {
16038 PyMem_RawFree(encoding);
16039 return -1;
16040 }
16041
Victor Stinner3d17c042020-05-14 01:48:38 +020016042 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16043 PyMem_RawFree(fs_codec->encoding);
16044 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016045 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016046 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16047 PyMem_RawFree(fs_codec->errors);
16048 fs_codec->errors = errors;
16049 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016050
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016051#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016052 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016053#endif
16054
Victor Stinner709d23d2019-05-02 14:56:30 -040016055 /* At this point, PyUnicode_EncodeFSDefault() and
16056 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16057 the C implementation of the filesystem encoding. */
16058
16059 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16060 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016061 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16062 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016063 PyErr_NoMemory();
16064 return -1;
16065 }
16066 return 0;
16067}
16068
16069
Victor Stinner331a6a52019-05-27 16:39:22 +020016070static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016071init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016072{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016073 PyInterpreterState *interp = tstate->interp;
16074
Victor Stinner709d23d2019-05-02 14:56:30 -040016075 /* Update the filesystem encoding to the normalized Python codec name.
16076 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16077 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016078 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016079 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016080 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016081 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016082 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016083 }
16084
Victor Stinner709d23d2019-05-02 14:56:30 -040016085 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016086 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016087 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016088 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016089}
16090
16091
Victor Stinner331a6a52019-05-27 16:39:22 +020016092PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016093_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016094{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016095 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016096 if (_PyStatus_EXCEPTION(status)) {
16097 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016098 }
16099
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016100 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016101}
16102
16103
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016104static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016105_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016106{
Victor Stinner3d17c042020-05-14 01:48:38 +020016107 PyMem_RawFree(fs_codec->encoding);
16108 fs_codec->encoding = NULL;
16109 fs_codec->utf8 = 0;
16110 PyMem_RawFree(fs_codec->errors);
16111 fs_codec->errors = NULL;
16112 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016113}
16114
16115
#ifdef MS_WINDOWS
/* Set the filesystem encoding of the current interpreter to mbcs/replace
   (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set if the
   new configuration strings cannot be allocated (the configuration is left
   unchanged in that case). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding != NULL && new_errors != NULL) {
        /* Both copies succeeded: swap them into the configuration. */
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16141
16142
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016143void
Victor Stinner3d483342019-11-22 12:27:50 +010016144_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016145{
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016146 struct _Py_unicode_state *state = &tstate->interp->unicode;
16147
16148 int is_main_interp = _Py_IsMainInterpreter(tstate);
16149 if (is_main_interp) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016150#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016151 /* Insure++ is a memory analysis tool that aids in discovering
16152 * memory leaks and other memory problems. On Python exit, the
16153 * interned string dictionaries are flagged as being in use at exit
16154 * (which it is). Under normal circumstances, this is fine because
16155 * the memory will be automatically reclaimed by the system. Under
16156 * memory debugging, it's a huge source of useless noise, so we
16157 * trade off slower shutdown for less distraction in the memory
16158 * reports. -baw
16159 */
16160 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016161#endif /* __INSURE__ */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016162 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016163
Victor Stinner91698d82020-06-25 14:07:40 +020016164 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016165
Victor Stinner2f9ada92020-06-24 02:22:21 +020016166 for (Py_ssize_t i = 0; i < 256; i++) {
16167 Py_CLEAR(state->latin1[i]);
16168 }
16169
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016170 if (is_main_interp) {
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016171 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016172 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016173
Victor Stinner3d17c042020-05-14 01:48:38 +020016174 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016175}
16176
16177
Georg Brandl66c221e2010-10-14 07:04:07 +000016178/* A _string module, to export formatter_parser and formatter_field_name_split
16179 to the string.Formatter class implemented in Python. */
16180
16181static PyMethodDef _string_methods[] = {
16182 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16183 METH_O, PyDoc_STR("split the argument as a field name")},
16184 {"formatter_parser", (PyCFunction) formatter_parser,
16185 METH_O, PyDoc_STR("parse the argument as a format string")},
16186 {NULL, NULL}
16187};
16188
16189static struct PyModuleDef _string_module = {
16190 PyModuleDef_HEAD_INIT,
16191 "_string",
16192 PyDoc_STR("string helper module"),
16193 0,
16194 _string_methods,
16195 NULL,
16196 NULL,
16197 NULL,
16198 NULL
16199};
16200
16201PyMODINIT_FUNC
16202PyInit__string(void)
16203{
16204 return PyModule_Create(&_string_module);
16205}
16206
16207
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016208#ifdef __cplusplus
16209}
16210#endif