blob: ca68c57534b22995e9c8c0380628c714a9753d53 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner91698d82020-06-25 14:07:40 +020044#include "pycore_bytes_methods.h" // _Py_bytes_lower()
45#include "pycore_initconfig.h" // _PyStatus_OK()
Victor Stinnere5014be2020-04-14 17:52:15 +020046#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinner91698d82020-06-25 14:07:40 +020047#include "pycore_object.h" // _PyObject_GC_TRACK()
48#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
49#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
Victor Stinnere5014be2020-04-14 17:52:15 +020050#include "pycore_pystate.h" // _PyInterpreterState_GET()
Victor Stinner91698d82020-06-25 14:07:40 +020051#include "ucnhash.h" // _PyUnicode_Name_CAPI
52#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000054#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000055#include <windows.h>
56#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000057
/* Uncomment to display statistics on interned strings at exit
   in _PyUnicode_ClearInterned(). */
/* #define INTERNED_STATS 1 */
61
62
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080081
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Check applied by the accessor macros in this file: the structural
   consistency check (without content scan) when Py_DEBUG is defined,
   a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200103
/* Raw access to the cached UTF-8 pointer of a (non compact-ASCII) string. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 pointer of a ready string: for compact ASCII strings this is the
   memory located right after the PyASCIIObject header, otherwise the
   'utf8' member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the 'utf8_length' member. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: 'length' for compact ASCII strings,
   'utf8_length' otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the 'wstr' member. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900122
/* Don't use deprecated macro of unicodeobject.h.
   Local replacement: 'length' for compact ASCII strings, 'wstr_length'
   otherwise.  The argument is parenthesized inside the casts so that any
   expression can be passed safely. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Direct member accessors.  The macros without an assert() perform no
   checking at all; _PyUnicode_KIND and _PyUnicode_GET_LENGTH first assert
   _PyUnicode_CHECK(op). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200145
/* File-local override of PyUnicode_READY(): evaluates to 0 when the string
   is already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200152
/* True if the 'utf8' pointer is the same address as the character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the 'wstr' pointer is the same address as the character data.
   The macro parameter 'op' is used throughout; it must not depend on a
   variable named 'unicode' existing at the expansion site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
160
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
174
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The main loop copies four elements per iteration up to the largest
   multiple of 4 not exceeding the element count; the trailing loop copies
   the remaining 0-3 elements one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin);\
        const from_type *_end = (const from_type *)(end);\
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200198
/* Divisor selecting how much extra space is reserved when a buffer is
   over-allocated; chosen per platform. */
#ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
206
Victor Stinner607b1022020-05-05 18:50:30 +0200207/* bpo-40521: Interned strings are shared by all interpreters. */
208#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
209# define INTERNED_STRINGS
210#endif
211
Walter Dörwald16807132007-05-25 13:52:07 +0000212/* This dictionary holds all interned unicode strings. Note that references
213 to strings in this dictionary are *not* counted in the string's ob_refcnt.
214 When the interned string reaches a refcnt of 0 the string deallocation
215 function will delete the reference from this dictionary.
216
217 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000218 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000219*/
Victor Stinner607b1022020-05-05 18:50:30 +0200220#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200221static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200222#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000223
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200224static struct _Py_unicode_state*
225get_unicode_state(void)
226{
227 PyInterpreterState *interp = _PyInterpreterState_GET();
228 return &interp->unicode;
229}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200230
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200232// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200233static inline PyObject* unicode_get_empty(void)
234{
235 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200236 // unicode_get_empty() must not be called before _PyUnicode_Init()
237 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200238 assert(state->empty_string != NULL);
239 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200240}
241
Victor Stinner91698d82020-06-25 14:07:40 +0200242
243// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200244static inline PyObject* unicode_new_empty(void)
245{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200246 PyObject *empty = unicode_get_empty();
247 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200248 return empty;
249}
250
/* Return, from the enclosing function, a strong reference to the empty
   string singleton. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        return unicode_new_empty();                     \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000255
Victor Stinner59423e32018-11-26 13:40:01 +0100256static inline void
257unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
258 Py_ssize_t start, Py_ssize_t length)
259{
260 assert(0 <= start);
261 assert(kind != PyUnicode_WCHAR_KIND);
262 switch (kind) {
263 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100264 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100265 Py_UCS1 ch = (unsigned char)value;
266 Py_UCS1 *to = (Py_UCS1 *)data + start;
267 memset(to, ch, length);
268 break;
269 }
270 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100271 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100272 Py_UCS2 ch = (Py_UCS2)value;
273 Py_UCS2 *to = (Py_UCS2 *)data + start;
274 const Py_UCS2 *end = to + length;
275 for (; to < end; ++to) *to = ch;
276 break;
277 }
278 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100279 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100280 Py_UCS4 ch = value;
281 Py_UCS4 * to = (Py_UCS4 *)data + start;
282 const Py_UCS4 *end = to + length;
283 for (; to < end; ++to) *to = ch;
284 break;
285 }
286 default: Py_UNREACHABLE();
287 }
288}
289
290
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200291/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700292static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200293_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900294static inline void
295_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400296static PyObject *
297unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
298 const char *errors);
299static PyObject *
300unicode_decode_utf8(const char *s, Py_ssize_t size,
301 _Py_error_handler error_handler, const char *errors,
302 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200303
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200304/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200305static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200306
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code: 1 for whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
337
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200338/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200339static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200340static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100341static int unicode_modifiable(PyObject *unicode);
342
Victor Stinnerfe226c02011-10-03 03:52:20 +0200343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100345_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200346static PyObject *
347_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
348static PyObject *
349_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
350
351static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000353 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100354 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000355 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
356
Alexander Belopolsky40018472011-02-26 01:02:56 +0000357static void
358raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300359 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100360 PyObject *unicode,
361 Py_ssize_t startpos, Py_ssize_t endpos,
362 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000363
/* Same for linebreaks.
   128-entry table indexed by ASCII code: 1 for a line break, 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
391
INADA Naoki3ae20562017-01-16 20:41:20 +0900392static int convert_uc(PyObject *obj, void *addr);
393
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300394#include "clinic/unicodeobject.c.h"
395
Victor Stinner3d4226a2018-08-29 22:21:32 +0200396_Py_error_handler
397_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200398{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200400 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200401 }
402 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200403 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200404 }
405 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200406 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200407 }
408 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200409 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200410 }
411 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200412 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200413 }
414 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200415 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200416 }
417 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200418 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200419 }
Victor Stinner50149202015-09-22 00:26:54 +0200420 return _Py_ERROR_OTHER;
421}
422
Victor Stinner709d23d2019-05-02 14:56:30 -0400423
424static _Py_error_handler
425get_error_handler_wide(const wchar_t *errors)
426{
427 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428 return _Py_ERROR_STRICT;
429 }
430 if (wcscmp(errors, L"surrogateescape") == 0) {
431 return _Py_ERROR_SURROGATEESCAPE;
432 }
433 if (wcscmp(errors, L"replace") == 0) {
434 return _Py_ERROR_REPLACE;
435 }
436 if (wcscmp(errors, L"ignore") == 0) {
437 return _Py_ERROR_IGNORE;
438 }
439 if (wcscmp(errors, L"backslashreplace") == 0) {
440 return _Py_ERROR_BACKSLASHREPLACE;
441 }
442 if (wcscmp(errors, L"surrogatepass") == 0) {
443 return _Py_ERROR_SURROGATEPASS;
444 }
445 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446 return _Py_ERROR_XMLCHARREFREPLACE;
447 }
448 return _Py_ERROR_OTHER;
449}
450
451
Victor Stinner22eb6892019-06-26 00:51:05 +0200452static inline int
453unicode_check_encoding_errors(const char *encoding, const char *errors)
454{
455 if (encoding == NULL && errors == NULL) {
456 return 0;
457 }
458
Victor Stinner81a7be32020-04-14 15:14:01 +0200459 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200460#ifndef Py_DEBUG
461 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200462 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200463 return 0;
464 }
465#else
466 /* Always check in debug mode */
467#endif
468
469 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200471 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200472 return 0;
473 }
474
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200475 /* Disable checks during Python finalization. For example, it allows to
476 call _PyObject_Dump() during finalization for debugging purpose. */
477 if (interp->finalizing) {
478 return 0;
479 }
480
Victor Stinner22eb6892019-06-26 00:51:05 +0200481 if (encoding != NULL) {
482 PyObject *handler = _PyCodec_Lookup(encoding);
483 if (handler == NULL) {
484 return -1;
485 }
486 Py_DECREF(handler);
487 }
488
489 if (errors != NULL) {
490 PyObject *handler = PyCodec_LookupError(errors);
491 if (handler == NULL) {
492 return -1;
493 }
494 Py_DECREF(handler);
495 }
496 return 0;
497}
498
499
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200500int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100501_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200502{
Victor Stinner68762572019-10-07 18:42:01 +0200503#define CHECK(expr) \
504 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
505
Victor Stinner910337b2011-10-03 03:20:16 +0200506 PyASCIIObject *ascii;
507 unsigned int kind;
508
Victor Stinner68762572019-10-07 18:42:01 +0200509 assert(op != NULL);
510 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200511
512 ascii = (PyASCIIObject *)op;
513 kind = ascii->state.kind;
514
Victor Stinnera3b334d2011-10-03 13:53:37 +0200515 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200516 CHECK(kind == PyUnicode_1BYTE_KIND);
517 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200518 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200519 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200520 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200521 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200522
Victor Stinnera41463c2011-10-04 01:05:08 +0200523 if (ascii->state.compact == 1) {
524 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200526 || kind == PyUnicode_2BYTE_KIND
527 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200528 CHECK(ascii->state.ascii == 0);
529 CHECK(ascii->state.ready == 1);
530 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100531 }
532 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
534
535 data = unicode->data.any;
536 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200537 CHECK(ascii->length == 0);
538 CHECK(ascii->hash == -1);
539 CHECK(ascii->state.compact == 0);
540 CHECK(ascii->state.ascii == 0);
541 CHECK(ascii->state.ready == 0);
542 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
543 CHECK(ascii->wstr != NULL);
544 CHECK(data == NULL);
545 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200546 }
547 else {
Victor Stinner68762572019-10-07 18:42:01 +0200548 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200549 || kind == PyUnicode_2BYTE_KIND
550 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200551 CHECK(ascii->state.compact == 0);
552 CHECK(ascii->state.ready == 1);
553 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200554 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200555 CHECK(compact->utf8 == data);
556 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 }
558 else
Victor Stinner68762572019-10-07 18:42:01 +0200559 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 }
561 }
562 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200563 if (
564#if SIZEOF_WCHAR_T == 2
565 kind == PyUnicode_2BYTE_KIND
566#else
567 kind == PyUnicode_4BYTE_KIND
568#endif
569 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200570 {
Victor Stinner68762572019-10-07 18:42:01 +0200571 CHECK(ascii->wstr == data);
572 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200573 } else
Victor Stinner68762572019-10-07 18:42:01 +0200574 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200575 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200576
577 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200578 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200579 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200580 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200581 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200582
583 /* check that the best kind is used: O(n) operation */
584 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200585 Py_ssize_t i;
586 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300587 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200588 Py_UCS4 ch;
589
590 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 for (i=0; i < ascii->length; i++)
592 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200593 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200594 if (ch > maxchar)
595 maxchar = ch;
596 }
597 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100598 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200599 CHECK(maxchar >= 128);
600 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100601 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200602 else
Victor Stinner68762572019-10-07 18:42:01 +0200603 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200604 }
Victor Stinner77faf692011-11-20 18:56:05 +0100605 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200606 CHECK(maxchar >= 0x100);
607 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100608 }
609 else {
Victor Stinner68762572019-10-07 18:42:01 +0200610 CHECK(maxchar >= 0x10000);
611 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100612 }
Victor Stinner68762572019-10-07 18:42:01 +0200613 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200614 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400615 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200616
617#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400618}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200619
Victor Stinner910337b2011-10-03 03:20:16 +0200620
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100621static PyObject*
622unicode_result_wchar(PyObject *unicode)
623{
624#ifndef Py_DEBUG
625 Py_ssize_t len;
626
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100627 len = _PyUnicode_WSTR_LENGTH(unicode);
628 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100629 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200630 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 }
632
633 if (len == 1) {
634 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100635 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200637 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 }
639 }
640
641 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200642 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100643 return NULL;
644 }
645#else
Victor Stinneraa771272012-10-04 02:32:58 +0200646 assert(Py_REFCNT(unicode) == 1);
647
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100648 /* don't make the result ready in debug mode to ensure that the caller
649 makes the string ready before using it */
650 assert(_PyUnicode_CheckConsistency(unicode, 1));
651#endif
652 return unicode;
653}
654
655static PyObject*
656unicode_result_ready(PyObject *unicode)
657{
658 Py_ssize_t length;
659
660 length = PyUnicode_GET_LENGTH(unicode);
661 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200662 PyObject *empty = unicode_get_empty();
663 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100664 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200665 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100666 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200667 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100668 }
669
670 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200671 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200672 if (kind == PyUnicode_1BYTE_KIND) {
673 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
674 Py_UCS1 ch = data[0];
675 struct _Py_unicode_state *state = get_unicode_state();
676 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100677 if (latin1_char != NULL) {
678 if (unicode != latin1_char) {
679 Py_INCREF(latin1_char);
680 Py_DECREF(unicode);
681 }
682 return latin1_char;
683 }
684 else {
685 assert(_PyUnicode_CheckConsistency(unicode, 1));
686 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200687 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100688 return unicode;
689 }
690 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200691 else {
692 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
693 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100694 }
695
696 assert(_PyUnicode_CheckConsistency(unicode, 1));
697 return unicode;
698}
699
700static PyObject*
701unicode_result(PyObject *unicode)
702{
703 assert(_PyUnicode_CHECK(unicode));
704 if (PyUnicode_IS_READY(unicode))
705 return unicode_result_ready(unicode);
706 else
707 return unicode_result_wchar(unicode);
708}
709
Victor Stinnerc4b49542011-12-11 22:44:26 +0100710static PyObject*
711unicode_result_unchanged(PyObject *unicode)
712{
713 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500714 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100715 return NULL;
716 Py_INCREF(unicode);
717 return unicode;
718 }
719 else
720 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100721 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100722}
723
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200724/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
725 ASCII, Latin1, UTF-8, etc. */
726static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200727backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200728 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
729{
Victor Stinnerad771582015-10-09 12:38:53 +0200730 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200731 Py_UCS4 ch;
732 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300733 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200734
735 assert(PyUnicode_IS_READY(unicode));
736 kind = PyUnicode_KIND(unicode);
737 data = PyUnicode_DATA(unicode);
738
739 size = 0;
740 /* determine replacement size */
741 for (i = collstart; i < collend; ++i) {
742 Py_ssize_t incr;
743
744 ch = PyUnicode_READ(kind, data, i);
745 if (ch < 0x100)
746 incr = 2+2;
747 else if (ch < 0x10000)
748 incr = 2+4;
749 else {
750 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200751 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200752 }
753 if (size > PY_SSIZE_T_MAX - incr) {
754 PyErr_SetString(PyExc_OverflowError,
755 "encoded result is too long for a Python string");
756 return NULL;
757 }
758 size += incr;
759 }
760
Victor Stinnerad771582015-10-09 12:38:53 +0200761 str = _PyBytesWriter_Prepare(writer, str, size);
762 if (str == NULL)
763 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200764
765 /* generate replacement */
766 for (i = collstart; i < collend; ++i) {
767 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200768 *str++ = '\\';
769 if (ch >= 0x00010000) {
770 *str++ = 'U';
771 *str++ = Py_hexdigits[(ch>>28)&0xf];
772 *str++ = Py_hexdigits[(ch>>24)&0xf];
773 *str++ = Py_hexdigits[(ch>>20)&0xf];
774 *str++ = Py_hexdigits[(ch>>16)&0xf];
775 *str++ = Py_hexdigits[(ch>>12)&0xf];
776 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777 }
Victor Stinner797485e2015-10-09 03:17:30 +0200778 else if (ch >= 0x100) {
779 *str++ = 'u';
780 *str++ = Py_hexdigits[(ch>>12)&0xf];
781 *str++ = Py_hexdigits[(ch>>8)&0xf];
782 }
783 else
784 *str++ = 'x';
785 *str++ = Py_hexdigits[(ch>>4)&0xf];
786 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200787 }
788 return str;
789}
790
791/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
792 ASCII, Latin1, UTF-8, etc. */
793static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200794xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200795 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
796{
Victor Stinnerad771582015-10-09 12:38:53 +0200797 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200798 Py_UCS4 ch;
799 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300800 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200801
802 assert(PyUnicode_IS_READY(unicode));
803 kind = PyUnicode_KIND(unicode);
804 data = PyUnicode_DATA(unicode);
805
806 size = 0;
807 /* determine replacement size */
808 for (i = collstart; i < collend; ++i) {
809 Py_ssize_t incr;
810
811 ch = PyUnicode_READ(kind, data, i);
812 if (ch < 10)
813 incr = 2+1+1;
814 else if (ch < 100)
815 incr = 2+2+1;
816 else if (ch < 1000)
817 incr = 2+3+1;
818 else if (ch < 10000)
819 incr = 2+4+1;
820 else if (ch < 100000)
821 incr = 2+5+1;
822 else if (ch < 1000000)
823 incr = 2+6+1;
824 else {
825 assert(ch <= MAX_UNICODE);
826 incr = 2+7+1;
827 }
828 if (size > PY_SSIZE_T_MAX - incr) {
829 PyErr_SetString(PyExc_OverflowError,
830 "encoded result is too long for a Python string");
831 return NULL;
832 }
833 size += incr;
834 }
835
Victor Stinnerad771582015-10-09 12:38:53 +0200836 str = _PyBytesWriter_Prepare(writer, str, size);
837 if (str == NULL)
838 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200839
840 /* generate replacement */
841 for (i = collstart; i < collend; ++i) {
842 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
843 }
844 return str;
845}
846
Thomas Wouters477c8d52006-05-27 19:21:47 +0000847/* --- Bloom Filters ----------------------------------------------------- */
848
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
852
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200853/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000854
Antoine Pitrouf068f942010-01-13 14:19:12 +0000855#if LONG_BIT >= 128
856#define BLOOM_WIDTH 128
857#elif LONG_BIT >= 64
858#define BLOOM_WIDTH 64
859#elif LONG_BIT >= 32
860#define BLOOM_WIDTH 32
861#else
862#error "LONG_BIT is smaller than 32"
863#endif
864
Thomas Wouters477c8d52006-05-27 19:21:47 +0000865#define BLOOM_MASK unsigned long
866
Serhiy Storchaka05997252013-01-26 12:14:02 +0200867static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000868
Antoine Pitrouf068f942010-01-13 14:19:12 +0000869#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000870
Benjamin Peterson29060642009-01-31 22:14:21 +0000871#define BLOOM_LINEBREAK(ch) \
872 ((ch) < 128U ? ascii_linebreak[(ch)] : \
873 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000874
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700875static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300876make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877{
Victor Stinnera85af502013-04-09 21:53:54 +0200878#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
879 do { \
880 TYPE *data = (TYPE *)PTR; \
881 TYPE *end = data + LEN; \
882 Py_UCS4 ch; \
883 for (; data != end; data++) { \
884 ch = *data; \
885 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
886 } \
887 break; \
888 } while (0)
889
Thomas Wouters477c8d52006-05-27 19:21:47 +0000890 /* calculate simple bloom-style bitmask for a given unicode string */
891
Antoine Pitrouf068f942010-01-13 14:19:12 +0000892 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000893
894 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200895 switch (kind) {
896 case PyUnicode_1BYTE_KIND:
897 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
898 break;
899 case PyUnicode_2BYTE_KIND:
900 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
901 break;
902 case PyUnicode_4BYTE_KIND:
903 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
904 break;
905 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700906 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200907 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000908 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200909
910#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000911}
912
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300913static int
914ensure_unicode(PyObject *obj)
915{
916 if (!PyUnicode_Check(obj)) {
917 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200918 "must be str, not %.100s",
919 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300920 return -1;
921 }
922 return PyUnicode_READY(obj);
923}
924
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200925/* Compilation of templated routines */
926
Victor Stinner90ed8a62020-06-24 00:34:07 +0200927#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200928
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200929#include "stringlib/asciilib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
935#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/undef.h"
937
938#include "stringlib/ucs1lib.h"
939#include "stringlib/fastsearch.h"
940#include "stringlib/partition.h"
941#include "stringlib/split.h"
942#include "stringlib/count.h"
943#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300944#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200945#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/undef.h"
947
948#include "stringlib/ucs2lib.h"
949#include "stringlib/fastsearch.h"
950#include "stringlib/partition.h"
951#include "stringlib/split.h"
952#include "stringlib/count.h"
953#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300954#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200955#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200956#include "stringlib/undef.h"
957
958#include "stringlib/ucs4lib.h"
959#include "stringlib/fastsearch.h"
960#include "stringlib/partition.h"
961#include "stringlib/split.h"
962#include "stringlib/count.h"
963#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300964#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200965#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200966#include "stringlib/undef.h"
967
Inada Naoki2c4928d2020-06-17 20:09:44 +0900968_Py_COMP_DIAG_PUSH
969_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970#include "stringlib/unicodedefs.h"
971#include "stringlib/fastsearch.h"
972#include "stringlib/count.h"
973#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100974#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900975_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200976
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200977#undef STRINGLIB_GET_EMPTY
978
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979/* --- Unicode Object ----------------------------------------------------- */
980
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700981static inline Py_ssize_t
982findchar(const void *s, int kind,
983 Py_ssize_t size, Py_UCS4 ch,
984 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 switch (kind) {
987 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200988 if ((Py_UCS1) ch != ch)
989 return -1;
990 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600991 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600993 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200994 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200995 if ((Py_UCS2) ch != ch)
996 return -1;
997 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600998 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001000 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001001 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001004 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001006 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001007 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009}
1010
#ifdef Py_DEBUG
/* Overwrite the characters added by a resize (indexes old_length up to the
   current length) with 0xff bytes, so that code which forgets to
   initialize them is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Does nothing when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* Byte offsets: the kind value is used as the character width. */
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030static PyObject*
1031resize_compact(PyObject *unicode, Py_ssize_t length)
1032{
1033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035 Py_ssize_t new_size;
1036 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001037 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1040#endif
1041
Victor Stinner79891572012-05-03 13:43:07 +02001042 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001044 assert(PyUnicode_IS_COMPACT(unicode));
1045
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001046 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001047 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001048 struct_size = sizeof(PyASCIIObject);
1049 else
1050 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001051 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1054 PyErr_NoMemory();
1055 return NULL;
1056 }
1057 new_size = (struct_size + (length + 1) * char_size);
1058
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001059 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1060 PyObject_DEL(_PyUnicode_UTF8(unicode));
1061 _PyUnicode_UTF8(unicode) = NULL;
1062 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1063 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001064#ifdef Py_REF_DEBUG
1065 _Py_RefTotal--;
1066#endif
1067#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001068 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001069#endif
Victor Stinner84def372011-12-11 20:04:56 +01001070
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001071 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001072 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001073 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 PyErr_NoMemory();
1075 return NULL;
1076 }
Victor Stinner84def372011-12-11 20:04:56 +01001077 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001079
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001081 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001083 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 _PyUnicode_WSTR_LENGTH(unicode) = length;
1085 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001086 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1087 PyObject_DEL(_PyUnicode_WSTR(unicode));
1088 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001089 if (!PyUnicode_IS_ASCII(unicode))
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001091 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001092#ifdef Py_DEBUG
1093 unicode_fill_invalid(unicode, old_length);
1094#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1096 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098 return unicode;
1099}
1100
Alexander Belopolsky40018472011-02-26 01:02:56 +00001101static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103{
Victor Stinner95663112011-10-04 01:03:50 +02001104 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001108
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109 if (PyUnicode_IS_READY(unicode)) {
1110 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001113#ifdef Py_DEBUG
1114 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1115#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116
1117 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001118 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1120 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121
1122 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1123 PyErr_NoMemory();
1124 return -1;
1125 }
1126 new_size = (length + 1) * char_size;
1127
Victor Stinner7a9105a2011-12-12 00:13:42 +01001128 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1129 {
1130 PyObject_DEL(_PyUnicode_UTF8(unicode));
1131 _PyUnicode_UTF8(unicode) = NULL;
1132 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1133 }
1134
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 data = (PyObject *)PyObject_REALLOC(data, new_size);
1136 if (data == NULL) {
1137 PyErr_NoMemory();
1138 return -1;
1139 }
1140 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001141 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001142 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001143 _PyUnicode_WSTR_LENGTH(unicode) = length;
1144 }
1145 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001146 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_UTF8_LENGTH(unicode) = length;
1148 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 _PyUnicode_LENGTH(unicode) = length;
1150 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001151#ifdef Py_DEBUG
1152 unicode_fill_invalid(unicode, old_length);
1153#endif
Victor Stinner95663112011-10-04 01:03:50 +02001154 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001155 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001158 }
Victor Stinner95663112011-10-04 01:03:50 +02001159 assert(_PyUnicode_WSTR(unicode) != NULL);
1160
1161 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001162 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001166 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001167 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001169 if (!wstr) {
1170 PyErr_NoMemory();
1171 return -1;
1172 }
1173 _PyUnicode_WSTR(unicode) = wstr;
1174 _PyUnicode_WSTR(unicode)[length] = 0;
1175 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001176 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return 0;
1178}
1179
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180static PyObject*
1181resize_copy(PyObject *unicode, Py_ssize_t length)
1182{
1183 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001184 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001187 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188
1189 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1190 if (copy == NULL)
1191 return NULL;
1192
1193 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001194 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001196 }
1197 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001198 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001199
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001200 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 if (w == NULL)
1202 return NULL;
1203 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1204 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001205 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001206 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001207 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 }
1209}
1210
/* We allocate room for one extra Py_UNICODE element to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214
1215 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218*/
1219
Alexander Belopolsky40018472011-02-26 01:02:56 +00001220static PyUnicodeObject *
1221_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001223 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
Thomas Wouters477c8d52006-05-27 19:21:47 +00001226 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001227 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001228 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 }
1230
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001231 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001232 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001233 return (PyUnicodeObject *)PyErr_NoMemory();
1234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 if (length < 0) {
1236 PyErr_SetString(PyExc_SystemError,
1237 "Negative size passed to _PyUnicode_New");
1238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 }
1240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1242 if (unicode == NULL)
1243 return NULL;
1244 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001245
1246 _PyUnicode_WSTR_LENGTH(unicode) = length;
1247 _PyUnicode_HASH(unicode) = -1;
1248 _PyUnicode_STATE(unicode).interned = 0;
1249 _PyUnicode_STATE(unicode).kind = 0;
1250 _PyUnicode_STATE(unicode).compact = 0;
1251 _PyUnicode_STATE(unicode).ready = 0;
1252 _PyUnicode_STATE(unicode).ascii = 0;
1253 _PyUnicode_DATA_ANY(unicode) = NULL;
1254 _PyUnicode_LENGTH(unicode) = 0;
1255 _PyUnicode_UTF8(unicode) = NULL;
1256 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1259 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001260 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001261 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001262 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264
Jeremy Hyltond8082792003-09-16 19:41:39 +00001265 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001266 * the caller fails before initializing str -- unicode_resize()
1267 * reads str[0], and the Keep-Alive optimization can keep memory
1268 * allocated for str alive across a call to unicode_dealloc(unicode).
1269 * We don't want unicode_resize to read uninitialized memory in
1270 * that case.
1271 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 _PyUnicode_WSTR(unicode)[0] = 0;
1273 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001274
Victor Stinner7931d9a2011-11-04 00:22:48 +01001275 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 return unicode;
1277}
1278
Victor Stinnerf42dc442011-10-02 23:33:16 +02001279static const char*
1280unicode_kind_name(PyObject *unicode)
1281{
Victor Stinner42dfd712011-10-03 14:41:45 +02001282 /* don't check consistency: unicode_kind_name() is called from
1283 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 if (!PyUnicode_IS_COMPACT(unicode))
1285 {
1286 if (!PyUnicode_IS_READY(unicode))
1287 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001288 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 {
1290 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001291 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001292 return "legacy ascii";
1293 else
1294 return "legacy latin1";
1295 case PyUnicode_2BYTE_KIND:
1296 return "legacy UCS2";
1297 case PyUnicode_4BYTE_KIND:
1298 return "legacy UCS4";
1299 default:
1300 return "<legacy invalid kind>";
1301 }
1302 }
1303 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001304 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001305 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001306 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001307 return "ascii";
1308 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001309 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001310 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001311 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001312 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001313 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001314 default:
1315 return "<invalid compact kind>";
1316 }
1317}
1318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001321const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001322 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001323 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324}
1325
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001326const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001327 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 return _PyUnicode_COMPACT_DATA(unicode);
1329}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001330const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001331 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001332 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1334 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1335 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1336 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1337 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1338 return PyUnicode_DATA(unicode);
1339}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001340
1341void
1342_PyUnicode_Dump(PyObject *op)
1343{
1344 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001345 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1346 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001347 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001348
Victor Stinnera849a4b2011-10-03 12:12:11 +02001349 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001350 {
1351 if (ascii->state.ascii)
1352 data = (ascii + 1);
1353 else
1354 data = (compact + 1);
1355 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001356 else
1357 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001358 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001359
Victor Stinnera849a4b2011-10-03 12:12:11 +02001360 if (ascii->wstr == data)
1361 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001362 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001363
Victor Stinnera3b334d2011-10-03 13:53:37 +02001364 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001365 printf(" (%zu), ", compact->wstr_length);
1366 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001367 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001368 }
1369 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001370 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001371 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001372}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373#endif
1374
Victor Stinner91698d82020-06-25 14:07:40 +02001375static int
1376unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1377{
1378 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1379 // optimized to always use state->empty_string without having to check if
1380 // it is NULL or not.
1381 PyObject *empty = PyUnicode_New(1, 0);
1382 if (empty == NULL) {
1383 return -1;
1384 }
1385 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1386 _PyUnicode_LENGTH(empty) = 0;
1387 assert(_PyUnicode_CheckConsistency(empty, 1));
1388
1389 assert(state->empty_string == NULL);
1390 state->empty_string = empty;
1391 return 0;
1392}
1393
1394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395PyObject *
1396PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1397{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001398 /* Optimization for empty strings */
1399 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001400 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001401 }
1402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 PyObject *obj;
1404 PyCompactUnicodeObject *unicode;
1405 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001406 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001407 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 Py_ssize_t char_size;
1409 Py_ssize_t struct_size;
1410
Victor Stinner9e9d6892011-10-04 01:02:02 +02001411 is_ascii = 0;
1412 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 struct_size = sizeof(PyCompactUnicodeObject);
1414 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001415 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 char_size = 1;
1417 is_ascii = 1;
1418 struct_size = sizeof(PyASCIIObject);
1419 }
1420 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001421 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 char_size = 1;
1423 }
1424 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001425 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 char_size = 2;
1427 if (sizeof(wchar_t) == 2)
1428 is_sharing = 1;
1429 }
1430 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001431 if (maxchar > MAX_UNICODE) {
1432 PyErr_SetString(PyExc_SystemError,
1433 "invalid maximum character passed to PyUnicode_New");
1434 return NULL;
1435 }
Victor Stinner8f825062012-04-27 13:55:39 +02001436 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 char_size = 4;
1438 if (sizeof(wchar_t) == 4)
1439 is_sharing = 1;
1440 }
1441
1442 /* Ensure we won't overflow the size. */
1443 if (size < 0) {
1444 PyErr_SetString(PyExc_SystemError,
1445 "Negative size passed to PyUnicode_New");
1446 return NULL;
1447 }
1448 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1449 return PyErr_NoMemory();
1450
1451 /* Duplicated allocation code from _PyObject_New() instead of a call to
1452 * PyObject_New() so we are able to allocate space for the object and
1453 * it's data buffer.
1454 */
1455 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001456 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001458 }
1459 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460
1461 unicode = (PyCompactUnicodeObject *)obj;
1462 if (is_ascii)
1463 data = ((PyASCIIObject*)obj) + 1;
1464 else
1465 data = unicode + 1;
1466 _PyUnicode_LENGTH(unicode) = size;
1467 _PyUnicode_HASH(unicode) = -1;
1468 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001469 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 _PyUnicode_STATE(unicode).compact = 1;
1471 _PyUnicode_STATE(unicode).ready = 1;
1472 _PyUnicode_STATE(unicode).ascii = is_ascii;
1473 if (is_ascii) {
1474 ((char*)data)[size] = 0;
1475 _PyUnicode_WSTR(unicode) = NULL;
1476 }
Victor Stinner8f825062012-04-27 13:55:39 +02001477 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 ((char*)data)[size] = 0;
1479 _PyUnicode_WSTR(unicode) = NULL;
1480 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001482 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 else {
1485 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001486 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001487 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001489 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 ((Py_UCS4*)data)[size] = 0;
1491 if (is_sharing) {
1492 _PyUnicode_WSTR_LENGTH(unicode) = size;
1493 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1494 }
1495 else {
1496 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1497 _PyUnicode_WSTR(unicode) = NULL;
1498 }
1499 }
Victor Stinner8f825062012-04-27 13:55:39 +02001500#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001501 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001502#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001503 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 return obj;
1505}
1506
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing into the
   4-byte data buffer of `unicode`.  A high surrogate immediately followed by
   a low surrogate is combined into one code point; any other unit, lone
   surrogates included, is copied as is.  The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);

    Py_UCS4 *dest = PyUnicode_4BYTE_DATA(unicode);
    const wchar_t *src = begin;

    while (src < end) {
        assert(dest < (PyUnicode_4BYTE_DATA(unicode) +
                       _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Well-formed pair: two input units become one code point. */
            *dest = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            dest++;
            src += 2;
        }
        else {
            *dest = *src;
            dest++;
            src++;
        }
    }
    /* The output must have been filled exactly up to the string length. */
    assert(dest == (PyUnicode_4BYTE_DATA(unicode) +
                    _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1546
Victor Stinnercd9950f2011-10-02 00:34:53 +02001547static int
Victor Stinner488fa492011-12-12 00:01:39 +01001548unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001549{
Victor Stinner488fa492011-12-12 00:01:39 +01001550 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001551 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001552 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001553 return -1;
1554 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001555 return 0;
1556}
1557
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558static int
1559_copy_characters(PyObject *to, Py_ssize_t to_start,
1560 PyObject *from, Py_ssize_t from_start,
1561 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001563 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001564 const void *from_data;
1565 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566
Victor Stinneree4544c2012-05-09 22:24:08 +02001567 assert(0 <= how_many);
1568 assert(0 <= from_start);
1569 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001571 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001572 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573
Victor Stinnerd3f08822012-05-29 12:57:52 +02001574 assert(PyUnicode_Check(to));
1575 assert(PyUnicode_IS_READY(to));
1576 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1577
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001578 if (how_many == 0)
1579 return 0;
1580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001582 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001584 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585
Victor Stinnerf1852262012-06-16 16:38:26 +02001586#ifdef Py_DEBUG
1587 if (!check_maxchar
1588 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1589 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001590 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001591 Py_UCS4 ch;
1592 Py_ssize_t i;
1593 for (i=0; i < how_many; i++) {
1594 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1595 assert(ch <= to_maxchar);
1596 }
1597 }
1598#endif
1599
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001600 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001601 if (check_maxchar
1602 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1603 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001604 /* Writing Latin-1 characters into an ASCII string requires to
1605 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001606 Py_UCS4 max_char;
1607 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001608 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001609 if (max_char >= 128)
1610 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001611 }
Christian Heimesf051e432016-09-13 20:22:02 +02001612 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001613 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001614 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001615 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001616 else if (from_kind == PyUnicode_1BYTE_KIND
1617 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001618 {
1619 _PyUnicode_CONVERT_BYTES(
1620 Py_UCS1, Py_UCS2,
1621 PyUnicode_1BYTE_DATA(from) + from_start,
1622 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1623 PyUnicode_2BYTE_DATA(to) + to_start
1624 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001625 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001626 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001627 && to_kind == PyUnicode_4BYTE_KIND)
1628 {
1629 _PyUnicode_CONVERT_BYTES(
1630 Py_UCS1, Py_UCS4,
1631 PyUnicode_1BYTE_DATA(from) + from_start,
1632 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1633 PyUnicode_4BYTE_DATA(to) + to_start
1634 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001635 }
1636 else if (from_kind == PyUnicode_2BYTE_KIND
1637 && to_kind == PyUnicode_4BYTE_KIND)
1638 {
1639 _PyUnicode_CONVERT_BYTES(
1640 Py_UCS2, Py_UCS4,
1641 PyUnicode_2BYTE_DATA(from) + from_start,
1642 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1643 PyUnicode_4BYTE_DATA(to) + to_start
1644 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001645 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001646 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001647 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1648
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001649 if (!check_maxchar) {
1650 if (from_kind == PyUnicode_2BYTE_KIND
1651 && to_kind == PyUnicode_1BYTE_KIND)
1652 {
1653 _PyUnicode_CONVERT_BYTES(
1654 Py_UCS2, Py_UCS1,
1655 PyUnicode_2BYTE_DATA(from) + from_start,
1656 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1657 PyUnicode_1BYTE_DATA(to) + to_start
1658 );
1659 }
1660 else if (from_kind == PyUnicode_4BYTE_KIND
1661 && to_kind == PyUnicode_1BYTE_KIND)
1662 {
1663 _PyUnicode_CONVERT_BYTES(
1664 Py_UCS4, Py_UCS1,
1665 PyUnicode_4BYTE_DATA(from) + from_start,
1666 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1667 PyUnicode_1BYTE_DATA(to) + to_start
1668 );
1669 }
1670 else if (from_kind == PyUnicode_4BYTE_KIND
1671 && to_kind == PyUnicode_2BYTE_KIND)
1672 {
1673 _PyUnicode_CONVERT_BYTES(
1674 Py_UCS4, Py_UCS2,
1675 PyUnicode_4BYTE_DATA(from) + from_start,
1676 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1677 PyUnicode_2BYTE_DATA(to) + to_start
1678 );
1679 }
1680 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001681 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001682 }
1683 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001684 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001685 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001686 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001687 Py_ssize_t i;
1688
Victor Stinnera0702ab2011-09-29 14:14:38 +02001689 for (i=0; i < how_many; i++) {
1690 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001691 if (ch > to_maxchar)
1692 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001693 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1694 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001695 }
1696 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 return 0;
1698}
1699
Victor Stinnerd3f08822012-05-29 12:57:52 +02001700void
1701_PyUnicode_FastCopyCharacters(
1702 PyObject *to, Py_ssize_t to_start,
1703 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001704{
1705 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1706}
1707
1708Py_ssize_t
1709PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1710 PyObject *from, Py_ssize_t from_start,
1711 Py_ssize_t how_many)
1712{
1713 int err;
1714
1715 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1716 PyErr_BadInternalCall();
1717 return -1;
1718 }
1719
Benjamin Petersonbac79492012-01-14 13:34:47 -05001720 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001721 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001722 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001723 return -1;
1724
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001725 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001726 PyErr_SetString(PyExc_IndexError, "string index out of range");
1727 return -1;
1728 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001729 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001730 PyErr_SetString(PyExc_IndexError, "string index out of range");
1731 return -1;
1732 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001733 if (how_many < 0) {
1734 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1735 return -1;
1736 }
1737 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001738 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1739 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001740 "Cannot write %zi characters at %zi "
1741 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001742 how_many, to_start, PyUnicode_GET_LENGTH(to));
1743 return -1;
1744 }
1745
1746 if (how_many == 0)
1747 return 0;
1748
Victor Stinner488fa492011-12-12 00:01:39 +01001749 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001750 return -1;
1751
1752 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1753 if (err) {
1754 PyErr_Format(PyExc_SystemError,
1755 "Cannot copy %s characters "
1756 "into a string of %s characters",
1757 unicode_kind_name(from),
1758 unicode_kind_name(to));
1759 return -1;
1760 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001761 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762}
1763
Victor Stinner17222162011-09-28 22:15:37 +02001764/* Find the maximum code point and count the number of surrogate pairs so a
1765 correct string length can be computed before converting a string to UCS4.
1766 This function counts single surrogates as a character and not as a pair.
1767
1768 Return 0 on success, or -1 on error. */
1769static int
1770find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1771 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772{
1773 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001774 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775
Victor Stinnerc53be962011-10-02 21:33:54 +02001776 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 *num_surrogates = 0;
1778 *maxchar = 0;
1779
1780 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001782 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1783 && (iter+1) < end
1784 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1785 {
1786 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1787 ++(*num_surrogates);
1788 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 }
1790 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001792 {
1793 ch = *iter;
1794 iter++;
1795 }
1796 if (ch > *maxchar) {
1797 *maxchar = ch;
1798 if (*maxchar > MAX_UNICODE) {
1799 PyErr_Format(PyExc_ValueError,
1800 "character U+%x is not in range [U+0000; U+10ffff]",
1801 ch);
1802 return -1;
1803 }
1804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 }
1806 return 0;
1807}
1808
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001809int
1810_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811{
1812 wchar_t *end;
1813 Py_UCS4 maxchar = 0;
1814 Py_ssize_t num_surrogates;
1815#if SIZEOF_WCHAR_T == 2
1816 Py_ssize_t length_wo_surrogates;
1817#endif
1818
Georg Brandl7597add2011-10-05 16:36:47 +02001819 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 strings were created using _PyObject_New() and where no canonical
1821 representation (the str field) has been set yet aka strings
1822 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001823 assert(_PyUnicode_CHECK(unicode));
1824 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001826 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001827 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001828 /* Actually, it should neither be interned nor be anything else: */
1829 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001832 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001833 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835
1836 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1838 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 PyErr_NoMemory();
1840 return -1;
1841 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001842 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843 _PyUnicode_WSTR(unicode), end,
1844 PyUnicode_1BYTE_DATA(unicode));
1845 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1846 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1847 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1848 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001849 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001850 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001851 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001852 }
1853 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001854 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001855 _PyUnicode_UTF8(unicode) = NULL;
1856 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 }
1858 PyObject_FREE(_PyUnicode_WSTR(unicode));
1859 _PyUnicode_WSTR(unicode) = NULL;
1860 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1861 }
1862 /* In this case we might have to convert down from 4-byte native
1863 wchar_t to 2-byte unicode. */
1864 else if (maxchar < 65536) {
1865 assert(num_surrogates == 0 &&
1866 "FindMaxCharAndNumSurrogatePairs() messed up");
1867
Victor Stinner506f5922011-09-28 22:34:18 +02001868#if SIZEOF_WCHAR_T == 2
1869 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001870 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001871 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1872 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1873 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001874 _PyUnicode_UTF8(unicode) = NULL;
1875 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001876#else
1877 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001878 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001879 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001880 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001881 PyErr_NoMemory();
1882 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 }
Victor Stinner506f5922011-09-28 22:34:18 +02001884 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1885 _PyUnicode_WSTR(unicode), end,
1886 PyUnicode_2BYTE_DATA(unicode));
1887 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1888 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1889 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001890 _PyUnicode_UTF8(unicode) = NULL;
1891 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001892 PyObject_FREE(_PyUnicode_WSTR(unicode));
1893 _PyUnicode_WSTR(unicode) = NULL;
1894 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1895#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 }
1897 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1898 else {
1899#if SIZEOF_WCHAR_T == 2
1900 /* in case the native representation is 2-bytes, we need to allocate a
1901 new normalized 4-byte version. */
1902 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001903 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1904 PyErr_NoMemory();
1905 return -1;
1906 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001907 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1908 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 PyErr_NoMemory();
1910 return -1;
1911 }
1912 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1913 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001914 _PyUnicode_UTF8(unicode) = NULL;
1915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001916 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1917 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001918 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 PyObject_FREE(_PyUnicode_WSTR(unicode));
1920 _PyUnicode_WSTR(unicode) = NULL;
1921 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1922#else
1923 assert(num_surrogates == 0);
1924
Victor Stinnerc3c74152011-10-02 20:39:55 +02001925 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001927 _PyUnicode_UTF8(unicode) = NULL;
1928 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1930#endif
1931 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1932 }
1933 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001934 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 return 0;
1936}
1937
Alexander Belopolsky40018472011-02-26 01:02:56 +00001938static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001939unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940{
Walter Dörwald16807132007-05-25 13:52:07 +00001941 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 case SSTATE_NOT_INTERNED:
1943 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001944
Benjamin Peterson29060642009-01-31 22:14:21 +00001945 case SSTATE_INTERNED_MORTAL:
Victor Stinner607b1022020-05-05 18:50:30 +02001946#ifdef INTERNED_STRINGS
Victor Stinner3549ca32020-07-03 16:59:12 +02001947 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1948 references (key and value) which were ignored by
1949 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1950 to prevent calling unicode_dealloc() again. Adjust refcnt after
1951 PyDict_DelItem(). */
1952 assert(Py_REFCNT(unicode) == 0);
1953 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001954 if (PyDict_DelItem(interned, unicode) != 0) {
1955 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1956 NULL);
1957 }
Victor Stinner3549ca32020-07-03 16:59:12 +02001958 assert(Py_REFCNT(unicode) == 1);
1959 Py_SET_REFCNT(unicode, 0);
Victor Stinner607b1022020-05-05 18:50:30 +02001960#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001961 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001962
Benjamin Peterson29060642009-01-31 22:14:21 +00001963 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1965 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001966
Benjamin Peterson29060642009-01-31 22:14:21 +00001967 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001968 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001969 }
1970
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001971 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001973 }
1974 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001975 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001976 }
1977 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001978 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001979 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001981 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982}
1983
#ifdef Py_DEBUG
/* Debug-only predicate: return 1 if `unicode` is the empty string singleton
   or one of the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();
    if (state->empty_string == unicode) {
        return 1;
    }

    /* Only a non-legacy string of length 1 can be a Latin-1 singleton. */
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256) {
        return 0;
    }
    return (state->latin1[ch] == unicode) ? 1 : 0;
}
#endif
2003
Alexander Belopolsky40018472011-02-26 01:02:56 +00002004static int
Victor Stinner488fa492011-12-12 00:01:39 +01002005unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002006{
Victor Stinner488fa492011-12-12 00:01:39 +01002007 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002008 if (Py_REFCNT(unicode) != 1)
2009 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002010 if (_PyUnicode_HASH(unicode) != -1)
2011 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002012 if (PyUnicode_CHECK_INTERNED(unicode))
2013 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002014 if (!PyUnicode_CheckExact(unicode))
2015 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002016#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002017 /* singleton refcount is greater than 1 */
2018 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002019#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002020 return 1;
2021}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002022
Victor Stinnerfe226c02011-10-03 03:52:20 +02002023static int
2024unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2025{
2026 PyObject *unicode;
2027 Py_ssize_t old_length;
2028
2029 assert(p_unicode != NULL);
2030 unicode = *p_unicode;
2031
2032 assert(unicode != NULL);
2033 assert(PyUnicode_Check(unicode));
2034 assert(0 <= length);
2035
Victor Stinner910337b2011-10-03 03:20:16 +02002036 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002037 old_length = PyUnicode_WSTR_LENGTH(unicode);
2038 else
2039 old_length = PyUnicode_GET_LENGTH(unicode);
2040 if (old_length == length)
2041 return 0;
2042
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002043 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002044 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002045 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002046 return 0;
2047 }
2048
Victor Stinner488fa492011-12-12 00:01:39 +01002049 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002050 PyObject *copy = resize_copy(unicode, length);
2051 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002052 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002053 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002054 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002055 }
2056
Victor Stinnerfe226c02011-10-03 03:52:20 +02002057 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002058 PyObject *new_unicode = resize_compact(unicode, length);
2059 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002060 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002061 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002062 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002063 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002064 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002065}
2066
Alexander Belopolsky40018472011-02-26 01:02:56 +00002067int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002068PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002069{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002070 PyObject *unicode;
2071 if (p_unicode == NULL) {
2072 PyErr_BadInternalCall();
2073 return -1;
2074 }
2075 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002076 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002077 {
2078 PyErr_BadInternalCall();
2079 return -1;
2080 }
2081 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002082}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002083
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002084/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002085
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002086 WARNING: The function doesn't copy the terminating null character and
2087 doesn't check the maximum character (may write a latin1 character in an
2088 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002089static void
2090unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2091 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002092{
2093 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002094 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002095 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002096
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002097 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002098 switch (kind) {
2099 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002100#ifdef Py_DEBUG
2101 if (PyUnicode_IS_ASCII(unicode)) {
2102 Py_UCS4 maxchar = ucs1lib_find_max_char(
2103 (const Py_UCS1*)str,
2104 (const Py_UCS1*)str + len);
2105 assert(maxchar < 128);
2106 }
2107#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002108 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002109 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002110 }
2111 case PyUnicode_2BYTE_KIND: {
2112 Py_UCS2 *start = (Py_UCS2 *)data + index;
2113 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002114
Victor Stinner184252a2012-06-16 02:57:41 +02002115 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002116 *ucs2 = (Py_UCS2)*str;
2117
2118 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002119 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002120 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002121 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002122 Py_UCS4 *start = (Py_UCS4 *)data + index;
2123 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002124
Victor Stinner184252a2012-06-16 02:57:41 +02002125 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002126 *ucs4 = (Py_UCS4)*str;
2127
2128 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002129 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002130 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002131 default:
2132 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002133 }
2134}
2135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002137get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002139 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002140
Victor Stinner2f9ada92020-06-24 02:22:21 +02002141 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002142 if (unicode) {
2143 Py_INCREF(unicode);
2144 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 }
Victor Stinner607b1022020-05-05 18:50:30 +02002146
2147 unicode = PyUnicode_New(1, ch);
2148 if (!unicode) {
2149 return NULL;
2150 }
2151
2152 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2153 assert(_PyUnicode_CheckConsistency(unicode, 1));
2154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002156 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002157 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158}
2159
Victor Stinner985a82a2014-01-03 12:53:47 +01002160static PyObject*
2161unicode_char(Py_UCS4 ch)
2162{
2163 PyObject *unicode;
2164
2165 assert(ch <= MAX_UNICODE);
2166
Victor Stinner2f9ada92020-06-24 02:22:21 +02002167 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002168 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002169 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002170
Victor Stinner985a82a2014-01-03 12:53:47 +01002171 unicode = PyUnicode_New(1, ch);
2172 if (unicode == NULL)
2173 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002174
2175 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2176 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002177 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002178 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002179 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2180 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2181 }
2182 assert(_PyUnicode_CheckConsistency(unicode, 1));
2183 return unicode;
2184}
2185
Alexander Belopolsky40018472011-02-26 01:02:56 +00002186PyObject *
2187PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002189 if (u == NULL) {
2190 if (size > 0) {
2191 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2192 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2193 "use PyUnicode_New() instead", 1) < 0) {
2194 return NULL;
2195 }
2196 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002197 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002198 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002199
2200 if (size < 0) {
2201 PyErr_BadInternalCall();
2202 return NULL;
2203 }
2204
2205 return PyUnicode_FromWideChar(u, size);
2206}
2207
2208PyObject *
2209PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2210{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002211 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 Py_UCS4 maxchar = 0;
2213 Py_ssize_t num_surrogates;
2214
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002215 if (u == NULL && size != 0) {
2216 PyErr_BadInternalCall();
2217 return NULL;
2218 }
2219
2220 if (size == -1) {
2221 size = wcslen(u);
2222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002224 /* If the Unicode data is known at construction time, we can apply
2225 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002228 if (size == 0)
2229 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 /* Single character Unicode objects in the Latin-1 range are
2232 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002233 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 return get_latin1_char((unsigned char)*u);
2235
2236 /* If not empty and not single character, copy the Unicode data
2237 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002238 if (find_maxchar_surrogates(u, u + size,
2239 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 return NULL;
2241
Victor Stinner8faf8212011-12-08 22:14:11 +01002242 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243 if (!unicode)
2244 return NULL;
2245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 switch (PyUnicode_KIND(unicode)) {
2247 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002248 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2250 break;
2251 case PyUnicode_2BYTE_KIND:
2252#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002253 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002255 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2257#endif
2258 break;
2259 case PyUnicode_4BYTE_KIND:
2260#if SIZEOF_WCHAR_T == 2
2261 /* This is the only case which has to process surrogates, thus
2262 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002263 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264#else
2265 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002266 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267#endif
2268 break;
2269 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002270 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002273 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274}
2275
Alexander Belopolsky40018472011-02-26 01:02:56 +00002276PyObject *
2277PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002278{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002279 if (size < 0) {
2280 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002281 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 return NULL;
2283 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002284 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002285 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002286 }
2287 else {
2288 if (size > 0) {
2289 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2290 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2291 "use PyUnicode_New() instead", 1) < 0) {
2292 return NULL;
2293 }
2294 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002295 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002296 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002297}
2298
Alexander Belopolsky40018472011-02-26 01:02:56 +00002299PyObject *
2300PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002301{
2302 size_t size = strlen(u);
2303 if (size > PY_SSIZE_T_MAX) {
2304 PyErr_SetString(PyExc_OverflowError, "input too long");
2305 return NULL;
2306 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002307 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002308}
2309
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002310PyObject *
2311_PyUnicode_FromId(_Py_Identifier *id)
2312{
Victor Stinner297257f2020-06-02 14:39:45 +02002313 if (id->object) {
2314 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002315 }
Victor Stinner297257f2020-06-02 14:39:45 +02002316
2317 PyObject *obj;
2318 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2319 strlen(id->string),
2320 NULL, NULL);
2321 if (!obj) {
2322 return NULL;
2323 }
2324 PyUnicode_InternInPlace(&obj);
2325
2326 assert(!id->next);
2327 id->object = obj;
2328 id->next = static_strings;
2329 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002330 return id->object;
2331}
2332
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002333static void
2334unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002335{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002336 _Py_Identifier *tmp, *s = static_strings;
2337 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002338 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002339 tmp = s->next;
2340 s->next = NULL;
2341 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002342 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002343 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002344}
2345
Benjamin Peterson0df54292012-03-26 14:50:32 -04002346/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002347
Victor Stinnerd3f08822012-05-29 12:57:52 +02002348PyObject*
2349_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002350{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002351 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002352 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002353 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002354#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002355 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002356#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002357 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002358 }
Victor Stinner785938e2011-12-11 20:09:03 +01002359 unicode = PyUnicode_New(size, 127);
2360 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002361 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002362 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2363 assert(_PyUnicode_CheckConsistency(unicode, 1));
2364 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002365}
2366
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002367static Py_UCS4
2368kind_maxchar_limit(unsigned int kind)
2369{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002370 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002371 case PyUnicode_1BYTE_KIND:
2372 return 0x80;
2373 case PyUnicode_2BYTE_KIND:
2374 return 0x100;
2375 case PyUnicode_4BYTE_KIND:
2376 return 0x10000;
2377 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002378 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002379 }
2380}
2381
Victor Stinner702c7342011-10-05 13:50:52 +02002382static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002383_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002386 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002387
Victor Stinner2f9ada92020-06-24 02:22:21 +02002388 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002389 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002390 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002391 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002392 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002393 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002394 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002395
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002396 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002397 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 if (!res)
2399 return NULL;
2400 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002401 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002403}
2404
Victor Stinnere57b1c02011-09-28 22:20:48 +02002405static PyObject*
2406_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407{
2408 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002409 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002410
Serhiy Storchaka678db842013-01-26 12:16:36 +02002411 if (size == 0)
2412 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002413 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002414 if (size == 1)
2415 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002416
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002417 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002418 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 if (!res)
2420 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002421 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002423 else {
2424 _PyUnicode_CONVERT_BYTES(
2425 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2426 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002427 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 return res;
2429}
2430
Victor Stinnere57b1c02011-09-28 22:20:48 +02002431static PyObject*
2432_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433{
2434 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002435 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002436
Serhiy Storchaka678db842013-01-26 12:16:36 +02002437 if (size == 0)
2438 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002439 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002440 if (size == 1)
2441 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002442
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002443 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002444 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 if (!res)
2446 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002447 if (max_char < 256)
2448 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2449 PyUnicode_1BYTE_DATA(res));
2450 else if (max_char < 0x10000)
2451 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2452 PyUnicode_2BYTE_DATA(res));
2453 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002455 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 return res;
2457}
2458
2459PyObject*
2460PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2461{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002462 if (size < 0) {
2463 PyErr_SetString(PyExc_ValueError, "size must be positive");
2464 return NULL;
2465 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002466 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002468 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002470 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002472 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002473 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002474 PyErr_SetString(PyExc_SystemError, "invalid kind");
2475 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477}
2478
Victor Stinnerece58de2012-04-23 23:36:38 +02002479Py_UCS4
2480_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2481{
2482 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002483 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002484
2485 assert(PyUnicode_IS_READY(unicode));
2486 assert(0 <= start);
2487 assert(end <= PyUnicode_GET_LENGTH(unicode));
2488 assert(start <= end);
2489
2490 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2491 return PyUnicode_MAX_CHAR_VALUE(unicode);
2492
2493 if (start == end)
2494 return 127;
2495
Victor Stinner94d558b2012-04-27 22:26:58 +02002496 if (PyUnicode_IS_ASCII(unicode))
2497 return 127;
2498
Victor Stinnerece58de2012-04-23 23:36:38 +02002499 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002500 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002501 endptr = (char *)startptr + end * kind;
2502 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002503 switch(kind) {
2504 case PyUnicode_1BYTE_KIND:
2505 return ucs1lib_find_max_char(startptr, endptr);
2506 case PyUnicode_2BYTE_KIND:
2507 return ucs2lib_find_max_char(startptr, endptr);
2508 case PyUnicode_4BYTE_KIND:
2509 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002510 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002511 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002512 }
2513}
2514
Victor Stinner25a4b292011-10-06 12:31:55 +02002515/* Ensure that a string uses the most efficient storage, if it is not the
2516 case: create a new string with of the right kind. Write NULL into *p_unicode
2517 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002518static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002519unicode_adjust_maxchar(PyObject **p_unicode)
2520{
2521 PyObject *unicode, *copy;
2522 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002523 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002524 unsigned int kind;
2525
2526 assert(p_unicode != NULL);
2527 unicode = *p_unicode;
2528 assert(PyUnicode_IS_READY(unicode));
2529 if (PyUnicode_IS_ASCII(unicode))
2530 return;
2531
2532 len = PyUnicode_GET_LENGTH(unicode);
2533 kind = PyUnicode_KIND(unicode);
2534 if (kind == PyUnicode_1BYTE_KIND) {
2535 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002536 max_char = ucs1lib_find_max_char(u, u + len);
2537 if (max_char >= 128)
2538 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002539 }
2540 else if (kind == PyUnicode_2BYTE_KIND) {
2541 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002542 max_char = ucs2lib_find_max_char(u, u + len);
2543 if (max_char >= 256)
2544 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002545 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002546 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002547 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002548 max_char = ucs4lib_find_max_char(u, u + len);
2549 if (max_char >= 0x10000)
2550 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002551 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002552 else
2553 Py_UNREACHABLE();
2554
Victor Stinner25a4b292011-10-06 12:31:55 +02002555 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002556 if (copy != NULL)
2557 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002558 Py_DECREF(unicode);
2559 *p_unicode = copy;
2560}
2561
Victor Stinner034f6cf2011-09-30 02:26:44 +02002562PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002563_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002564{
Victor Stinner87af4f22011-11-21 23:03:47 +01002565 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002566 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002567
Victor Stinner034f6cf2011-09-30 02:26:44 +02002568 if (!PyUnicode_Check(unicode)) {
2569 PyErr_BadInternalCall();
2570 return NULL;
2571 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002572 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002573 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002574
Victor Stinner87af4f22011-11-21 23:03:47 +01002575 length = PyUnicode_GET_LENGTH(unicode);
2576 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002577 if (!copy)
2578 return NULL;
2579 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2580
Christian Heimesf051e432016-09-13 20:22:02 +02002581 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002582 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002583 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002584 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002585}
2586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587
Victor Stinnerbc603d12011-10-02 01:00:40 +02002588/* Widen Unicode objects to larger buffers. Don't write terminating null
2589 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002591static void*
2592unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002594 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002595
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002596 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002597 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002598 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002599 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002600 if (!result)
2601 return PyErr_NoMemory();
2602 assert(skind == PyUnicode_1BYTE_KIND);
2603 _PyUnicode_CONVERT_BYTES(
2604 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002605 (const Py_UCS1 *)data,
2606 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002607 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002609 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002610 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002611 if (!result)
2612 return PyErr_NoMemory();
2613 if (skind == PyUnicode_2BYTE_KIND) {
2614 _PyUnicode_CONVERT_BYTES(
2615 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002616 (const Py_UCS2 *)data,
2617 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002618 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002620 else {
2621 assert(skind == PyUnicode_1BYTE_KIND);
2622 _PyUnicode_CONVERT_BYTES(
2623 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002624 (const Py_UCS1 *)data,
2625 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002626 result);
2627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002629 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002630 Py_UNREACHABLE();
2631 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633}
2634
2635static Py_UCS4*
2636as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2637 int copy_null)
2638{
2639 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002640 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641 Py_ssize_t len, targetlen;
2642 if (PyUnicode_READY(string) == -1)
2643 return NULL;
2644 kind = PyUnicode_KIND(string);
2645 data = PyUnicode_DATA(string);
2646 len = PyUnicode_GET_LENGTH(string);
2647 targetlen = len;
2648 if (copy_null)
2649 targetlen++;
2650 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002651 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 if (!target) {
2653 PyErr_NoMemory();
2654 return NULL;
2655 }
2656 }
2657 else {
2658 if (targetsize < targetlen) {
2659 PyErr_Format(PyExc_SystemError,
2660 "string is longer than the buffer");
2661 if (copy_null && 0 < targetsize)
2662 target[0] = 0;
2663 return NULL;
2664 }
2665 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002666 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002667 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002668 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002670 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002671 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002672 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2673 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002674 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002675 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002676 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002677 else {
2678 Py_UNREACHABLE();
2679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 if (copy_null)
2681 target[len] = 0;
2682 return target;
2683}
2684
2685Py_UCS4*
2686PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2687 int copy_null)
2688{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002689 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 PyErr_BadInternalCall();
2691 return NULL;
2692 }
2693 return as_ucs4(string, target, targetsize, copy_null);
2694}
2695
2696Py_UCS4*
2697PyUnicode_AsUCS4Copy(PyObject *string)
2698{
2699 return as_ucs4(string, NULL, 0, 1);
2700}
2701
/* Buffer size, in chars, sufficient for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits; 53/22 is an
   upper bound for log10(256).  The extra 2 leaves room for the sign and
   one more char (presumably the terminating null byte written by
   sprintf()). */
2705#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002706
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002707static int
2708unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2709 Py_ssize_t width, Py_ssize_t precision)
2710{
2711 Py_ssize_t length, fill, arglen;
2712 Py_UCS4 maxchar;
2713
2714 if (PyUnicode_READY(str) == -1)
2715 return -1;
2716
2717 length = PyUnicode_GET_LENGTH(str);
2718 if ((precision == -1 || precision >= length)
2719 && width <= length)
2720 return _PyUnicodeWriter_WriteStr(writer, str);
2721
2722 if (precision != -1)
2723 length = Py_MIN(precision, length);
2724
2725 arglen = Py_MAX(length, width);
2726 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2727 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2728 else
2729 maxchar = writer->maxchar;
2730
2731 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2732 return -1;
2733
2734 if (width > length) {
2735 fill = width - length;
2736 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2737 return -1;
2738 writer->pos += fill;
2739 }
2740
2741 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2742 str, 0, length);
2743 writer->pos += length;
2744 return 0;
2745}
2746
2747static int
Victor Stinner998b8062018-09-12 00:23:25 +02002748unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002749 Py_ssize_t width, Py_ssize_t precision)
2750{
2751 /* UTF-8 */
2752 Py_ssize_t length;
2753 PyObject *unicode;
2754 int res;
2755
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002756 if (precision == -1) {
2757 length = strlen(str);
2758 }
2759 else {
2760 length = 0;
2761 while (length < precision && str[length]) {
2762 length++;
2763 }
2764 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2766 if (unicode == NULL)
2767 return -1;
2768
2769 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2770 Py_DECREF(unicode);
2771 return res;
2772}
2773
Victor Stinner96865452011-03-01 23:44:09 +00002774static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002775unicode_fromformat_arg(_PyUnicodeWriter *writer,
2776 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002777{
Victor Stinnere215d962012-10-06 23:03:36 +02002778 const char *p;
2779 Py_ssize_t len;
2780 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002781 Py_ssize_t width;
2782 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002783 int longflag;
2784 int longlongflag;
2785 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002786 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002787
2788 p = f;
2789 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002790 zeropad = 0;
2791 if (*f == '0') {
2792 zeropad = 1;
2793 f++;
2794 }
Victor Stinner96865452011-03-01 23:44:09 +00002795
2796 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002797 width = -1;
2798 if (Py_ISDIGIT((unsigned)*f)) {
2799 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002800 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002801 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002802 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002803 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002804 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002805 return NULL;
2806 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002807 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002808 f++;
2809 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002810 }
2811 precision = -1;
2812 if (*f == '.') {
2813 f++;
2814 if (Py_ISDIGIT((unsigned)*f)) {
2815 precision = (*f - '0');
2816 f++;
2817 while (Py_ISDIGIT((unsigned)*f)) {
2818 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2819 PyErr_SetString(PyExc_ValueError,
2820 "precision too big");
2821 return NULL;
2822 }
2823 precision = (precision * 10) + (*f - '0');
2824 f++;
2825 }
2826 }
Victor Stinner96865452011-03-01 23:44:09 +00002827 if (*f == '%') {
2828 /* "%.3%s" => f points to "3" */
2829 f--;
2830 }
2831 }
2832 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002833 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002834 f--;
2835 }
Victor Stinner96865452011-03-01 23:44:09 +00002836
2837 /* Handle %ld, %lu, %lld and %llu. */
2838 longflag = 0;
2839 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002840 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002841 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002842 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002843 longflag = 1;
2844 ++f;
2845 }
Victor Stinner96865452011-03-01 23:44:09 +00002846 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002847 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002848 longlongflag = 1;
2849 f += 2;
2850 }
Victor Stinner96865452011-03-01 23:44:09 +00002851 }
2852 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002853 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002854 size_tflag = 1;
2855 ++f;
2856 }
Victor Stinnere215d962012-10-06 23:03:36 +02002857
2858 if (f[1] == '\0')
2859 writer->overallocate = 0;
2860
2861 switch (*f) {
2862 case 'c':
2863 {
2864 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002865 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002866 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002867 "character argument not in range(0x110000)");
2868 return NULL;
2869 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002870 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002871 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002872 break;
2873 }
2874
2875 case 'i':
2876 case 'd':
2877 case 'u':
2878 case 'x':
2879 {
2880 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002881 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002882 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002883
2884 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002885 if (longflag) {
2886 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2887 }
2888 else if (longlongflag) {
2889 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2890 }
2891 else if (size_tflag) {
2892 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2893 }
2894 else {
2895 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2896 }
Victor Stinnere215d962012-10-06 23:03:36 +02002897 }
2898 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002899 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002900 }
2901 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002902 if (longflag) {
2903 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2904 }
2905 else if (longlongflag) {
2906 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2907 }
2908 else if (size_tflag) {
2909 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2910 }
2911 else {
2912 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2913 }
Victor Stinnere215d962012-10-06 23:03:36 +02002914 }
2915 assert(len >= 0);
2916
Victor Stinnere215d962012-10-06 23:03:36 +02002917 if (precision < len)
2918 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002919
2920 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002921 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2922 return NULL;
2923
Victor Stinnere215d962012-10-06 23:03:36 +02002924 if (width > precision) {
2925 Py_UCS4 fillchar;
2926 fill = width - precision;
2927 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002928 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2929 return NULL;
2930 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002931 }
Victor Stinner15a11362012-10-06 23:48:20 +02002932 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002933 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002934 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2935 return NULL;
2936 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002937 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002938
Victor Stinner4a587072013-11-19 12:54:53 +01002939 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2940 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002941 break;
2942 }
2943
2944 case 'p':
2945 {
2946 char number[MAX_LONG_LONG_CHARS];
2947
2948 len = sprintf(number, "%p", va_arg(*vargs, void*));
2949 assert(len >= 0);
2950
2951 /* %p is ill-defined: ensure leading 0x. */
2952 if (number[1] == 'X')
2953 number[1] = 'x';
2954 else if (number[1] != 'x') {
2955 memmove(number + 2, number,
2956 strlen(number) + 1);
2957 number[0] = '0';
2958 number[1] = 'x';
2959 len += 2;
2960 }
2961
Victor Stinner4a587072013-11-19 12:54:53 +01002962 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002963 return NULL;
2964 break;
2965 }
2966
2967 case 's':
2968 {
2969 /* UTF-8 */
2970 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002971 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002972 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002973 break;
2974 }
2975
2976 case 'U':
2977 {
2978 PyObject *obj = va_arg(*vargs, PyObject *);
2979 assert(obj && _PyUnicode_CHECK(obj));
2980
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002981 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002982 return NULL;
2983 break;
2984 }
2985
2986 case 'V':
2987 {
2988 PyObject *obj = va_arg(*vargs, PyObject *);
2989 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002990 if (obj) {
2991 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002992 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002993 return NULL;
2994 }
2995 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002996 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002997 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002998 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002999 }
3000 break;
3001 }
3002
3003 case 'S':
3004 {
3005 PyObject *obj = va_arg(*vargs, PyObject *);
3006 PyObject *str;
3007 assert(obj);
3008 str = PyObject_Str(obj);
3009 if (!str)
3010 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003011 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003012 Py_DECREF(str);
3013 return NULL;
3014 }
3015 Py_DECREF(str);
3016 break;
3017 }
3018
3019 case 'R':
3020 {
3021 PyObject *obj = va_arg(*vargs, PyObject *);
3022 PyObject *repr;
3023 assert(obj);
3024 repr = PyObject_Repr(obj);
3025 if (!repr)
3026 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003027 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003028 Py_DECREF(repr);
3029 return NULL;
3030 }
3031 Py_DECREF(repr);
3032 break;
3033 }
3034
3035 case 'A':
3036 {
3037 PyObject *obj = va_arg(*vargs, PyObject *);
3038 PyObject *ascii;
3039 assert(obj);
3040 ascii = PyObject_ASCII(obj);
3041 if (!ascii)
3042 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003043 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003044 Py_DECREF(ascii);
3045 return NULL;
3046 }
3047 Py_DECREF(ascii);
3048 break;
3049 }
3050
3051 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003052 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003053 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003054 break;
3055
3056 default:
3057 /* if we stumble upon an unknown formatting code, copy the rest
3058 of the format string to the output string. (we cannot just
3059 skip the code, since there's no way to know what's in the
3060 argument list) */
3061 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003062 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003063 return NULL;
3064 f = p+len;
3065 return f;
3066 }
3067
3068 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003069 return f;
3070}
3071
Walter Dörwaldd2034312007-05-18 16:29:38 +00003072PyObject *
3073PyUnicode_FromFormatV(const char *format, va_list vargs)
3074{
Victor Stinnere215d962012-10-06 23:03:36 +02003075 va_list vargs2;
3076 const char *f;
3077 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003078
Victor Stinner8f674cc2013-04-17 23:02:17 +02003079 _PyUnicodeWriter_Init(&writer);
3080 writer.min_length = strlen(format) + 100;
3081 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003082
Benjamin Peterson0c212142016-09-20 20:39:33 -07003083 // Copy varags to be able to pass a reference to a subfunction.
3084 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003085
3086 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003087 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003088 f = unicode_fromformat_arg(&writer, f, &vargs2);
3089 if (f == NULL)
3090 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003092 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003093 const char *p;
3094 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003095
Victor Stinnere215d962012-10-06 23:03:36 +02003096 p = f;
3097 do
3098 {
3099 if ((unsigned char)*p > 127) {
3100 PyErr_Format(PyExc_ValueError,
3101 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3102 "string, got a non-ASCII byte: 0x%02x",
3103 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003104 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003105 }
3106 p++;
3107 }
3108 while (*p != '\0' && *p != '%');
3109 len = p - f;
3110
3111 if (*p == '\0')
3112 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003113
3114 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003115 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003116
3117 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003118 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003119 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003120 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003121 return _PyUnicodeWriter_Finish(&writer);
3122
3123 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003124 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003125 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003126 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003127}
3128
Walter Dörwaldd2034312007-05-18 16:29:38 +00003129PyObject *
3130PyUnicode_FromFormat(const char *format, ...)
3131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003132 PyObject* ret;
3133 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003134
3135#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003136 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003137#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003138 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003139#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003140 ret = PyUnicode_FromFormatV(format, vargs);
3141 va_end(vargs);
3142 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003143}
3144
Serhiy Storchakac46db922018-10-23 22:58:24 +03003145static Py_ssize_t
3146unicode_get_widechar_size(PyObject *unicode)
3147{
3148 Py_ssize_t res;
3149
3150 assert(unicode != NULL);
3151 assert(_PyUnicode_CHECK(unicode));
3152
3153 if (_PyUnicode_WSTR(unicode) != NULL) {
3154 return PyUnicode_WSTR_LENGTH(unicode);
3155 }
3156 assert(PyUnicode_IS_READY(unicode));
3157
3158 res = _PyUnicode_LENGTH(unicode);
3159#if SIZEOF_WCHAR_T == 2
3160 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3161 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3162 const Py_UCS4 *end = s + res;
3163 for (; s < end; ++s) {
3164 if (*s > 0xFFFF) {
3165 ++res;
3166 }
3167 }
3168 }
3169#endif
3170 return res;
3171}
3172
3173static void
3174unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3175{
3176 const wchar_t *wstr;
3177
3178 assert(unicode != NULL);
3179 assert(_PyUnicode_CHECK(unicode));
3180
3181 wstr = _PyUnicode_WSTR(unicode);
3182 if (wstr != NULL) {
3183 memcpy(w, wstr, size * sizeof(wchar_t));
3184 return;
3185 }
3186 assert(PyUnicode_IS_READY(unicode));
3187
3188 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3189 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3190 for (; size--; ++s, ++w) {
3191 *w = *s;
3192 }
3193 }
3194 else {
3195#if SIZEOF_WCHAR_T == 4
3196 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3197 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3198 for (; size--; ++s, ++w) {
3199 *w = *s;
3200 }
3201#else
3202 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3203 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3204 for (; size--; ++s, ++w) {
3205 Py_UCS4 ch = *s;
3206 if (ch > 0xFFFF) {
3207 assert(ch <= MAX_UNICODE);
3208 /* encode surrogate pair in this case */
3209 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3210 if (!size--)
3211 break;
3212 *w = Py_UNICODE_LOW_SURROGATE(ch);
3213 }
3214 else {
3215 *w = ch;
3216 }
3217 }
3218#endif
3219 }
3220}
3221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003222#ifdef HAVE_WCHAR_H
3223
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003224/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003225
Victor Stinnerd88d9832011-09-06 02:00:05 +02003226 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003227 character) required to convert the unicode object. Ignore size argument.
3228
Victor Stinnerd88d9832011-09-06 02:00:05 +02003229 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003230 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003231 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003232Py_ssize_t
3233PyUnicode_AsWideChar(PyObject *unicode,
3234 wchar_t *w,
3235 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003236{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003237 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003238
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003239 if (unicode == NULL) {
3240 PyErr_BadInternalCall();
3241 return -1;
3242 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003243 if (!PyUnicode_Check(unicode)) {
3244 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003245 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003246 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003247
3248 res = unicode_get_widechar_size(unicode);
3249 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003250 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003251 }
3252
3253 if (size > res) {
3254 size = res + 1;
3255 }
3256 else {
3257 res = size;
3258 }
3259 unicode_copy_as_widechar(unicode, w, size);
3260 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003261}
3262
Victor Stinner137c34c2010-09-29 10:25:54 +00003263wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003264PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003265 Py_ssize_t *size)
3266{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003267 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003268 Py_ssize_t buflen;
3269
3270 if (unicode == NULL) {
3271 PyErr_BadInternalCall();
3272 return NULL;
3273 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003274 if (!PyUnicode_Check(unicode)) {
3275 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003276 return NULL;
3277 }
3278
Serhiy Storchakac46db922018-10-23 22:58:24 +03003279 buflen = unicode_get_widechar_size(unicode);
3280 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003281 if (buffer == NULL) {
3282 PyErr_NoMemory();
3283 return NULL;
3284 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003285 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3286 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003287 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003288 }
3289 else if (wcslen(buffer) != (size_t)buflen) {
3290 PyMem_FREE(buffer);
3291 PyErr_SetString(PyExc_ValueError,
3292 "embedded null character");
3293 return NULL;
3294 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003295 return buffer;
3296}
3297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003298#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003300int
3301_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3302{
3303 wchar_t **p = (wchar_t **)ptr;
3304 if (obj == NULL) {
3305#if !USE_UNICODE_WCHAR_CACHE
3306 PyMem_Free(*p);
3307#endif /* USE_UNICODE_WCHAR_CACHE */
3308 *p = NULL;
3309 return 1;
3310 }
3311 if (PyUnicode_Check(obj)) {
3312#if USE_UNICODE_WCHAR_CACHE
3313_Py_COMP_DIAG_PUSH
3314_Py_COMP_DIAG_IGNORE_DEPR_DECLS
3315 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3316 if (*p == NULL) {
3317 return 0;
3318 }
3319 return 1;
3320_Py_COMP_DIAG_POP
3321#else /* USE_UNICODE_WCHAR_CACHE */
3322 *p = PyUnicode_AsWideCharString(obj, NULL);
3323 if (*p == NULL) {
3324 return 0;
3325 }
3326 return Py_CLEANUP_SUPPORTED;
3327#endif /* USE_UNICODE_WCHAR_CACHE */
3328 }
3329 PyErr_Format(PyExc_TypeError,
3330 "argument must be str, not %.50s",
3331 obj->ob_type->tp_name);
3332 return 0;
3333}
3334
3335int
3336_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3337{
3338 wchar_t **p = (wchar_t **)ptr;
3339 if (obj == NULL) {
3340#if !USE_UNICODE_WCHAR_CACHE
3341 PyMem_Free(*p);
3342#endif /* USE_UNICODE_WCHAR_CACHE */
3343 *p = NULL;
3344 return 1;
3345 }
3346 if (obj == Py_None) {
3347 *p = NULL;
3348 return 1;
3349 }
3350 if (PyUnicode_Check(obj)) {
3351#if USE_UNICODE_WCHAR_CACHE
3352_Py_COMP_DIAG_PUSH
3353_Py_COMP_DIAG_IGNORE_DEPR_DECLS
3354 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3355 if (*p == NULL) {
3356 return 0;
3357 }
3358 return 1;
3359_Py_COMP_DIAG_POP
3360#else /* USE_UNICODE_WCHAR_CACHE */
3361 *p = PyUnicode_AsWideCharString(obj, NULL);
3362 if (*p == NULL) {
3363 return 0;
3364 }
3365 return Py_CLEANUP_SUPPORTED;
3366#endif /* USE_UNICODE_WCHAR_CACHE */
3367 }
3368 PyErr_Format(PyExc_TypeError,
3369 "argument must be str or None, not %.50s",
3370 obj->ob_type->tp_name);
3371 return 0;
3372}
3373
Alexander Belopolsky40018472011-02-26 01:02:56 +00003374PyObject *
3375PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003376{
Victor Stinner8faf8212011-12-08 22:14:11 +01003377 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003378 PyErr_SetString(PyExc_ValueError,
3379 "chr() arg not in range(0x110000)");
3380 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003381 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003382
Victor Stinner985a82a2014-01-03 12:53:47 +01003383 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003384}
3385
Alexander Belopolsky40018472011-02-26 01:02:56 +00003386PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003387PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003389 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003390 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003391 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003392 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003393 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003394 Py_INCREF(obj);
3395 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003396 }
3397 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003398 /* For a Unicode subtype that's not a Unicode object,
3399 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003400 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003401 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003402 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003403 "Can't convert '%.100s' object to str implicitly",
3404 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003405 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003406}
3407
Alexander Belopolsky40018472011-02-26 01:02:56 +00003408PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003409PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003410 const char *encoding,
3411 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003412{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003413 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003414 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003415
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003417 PyErr_BadInternalCall();
3418 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003420
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003421 /* Decoding bytes objects is the most common case and should be fast */
3422 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003423 if (PyBytes_GET_SIZE(obj) == 0) {
3424 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3425 return NULL;
3426 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003427 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003428 }
3429 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003430 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3431 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003432 }
3433
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003434 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003435 PyErr_SetString(PyExc_TypeError,
3436 "decoding str is not supported");
3437 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003438 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003439
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003440 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3441 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3442 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003443 "decoding to str: need a bytes-like object, %.80s found",
3444 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003445 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003446 }
Tim Petersced69f82003-09-16 20:30:58 +00003447
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003448 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003449 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003450 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3451 return NULL;
3452 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003453 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003455
Serhiy Storchaka05997252013-01-26 12:14:02 +02003456 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003457 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003458 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459}
3460
/* Write a normalized copy of an encoding name into the buffer `lower`.

   Alphanumeric characters and '.' are copied in lowercase.  Any run of
   other characters that sits between two copied characters is replaced by
   a single '_'; such runs at the very start or at the end of the name are
   dropped.  This is similar to encodings.normalize_encoding(), plus the
   lowercase conversion.

   Return 1 on success (`lower` is then NUL-terminated), or 0 if the
   normalized name needs more than lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out = lower;
    /* Last slot of the buffer: it is reserved for the terminating NUL. */
    char *const out_limit = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (in = encoding; *in != 0; in++) {
        char ch = *in;

        if (!(Py_ISALNUM(ch) || ch == '.')) {
            /* Separator character: remember it, emit nothing yet. */
            pending_sep = 1;
            continue;
        }

        /* Emit one '_' for the preceding separator run, unless nothing
           has been written yet (leading separators are dropped). */
        if (pending_sep && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(ch);
    }

    *out = '\0';
    return 1;
}
3509
Alexander Belopolsky40018472011-02-26 01:02:56 +00003510PyObject *
3511PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003512 Py_ssize_t size,
3513 const char *encoding,
3514 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003515{
3516 PyObject *buffer = NULL, *unicode;
3517 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003518 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3519
Victor Stinner22eb6892019-06-26 00:51:05 +02003520 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3521 return NULL;
3522 }
3523
Victor Stinnered076ed2019-06-26 01:49:32 +02003524 if (size == 0) {
3525 _Py_RETURN_UNICODE_EMPTY();
3526 }
3527
Victor Stinner942889a2016-09-05 15:40:10 -07003528 if (encoding == NULL) {
3529 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3530 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003531
Fred Drakee4315f52000-05-09 19:53:39 +00003532 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003533 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3534 char *lower = buflower;
3535
3536 /* Fast paths */
3537 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3538 lower += 3;
3539 if (*lower == '_') {
3540 /* Match "utf8" and "utf_8" */
3541 lower++;
3542 }
3543
3544 if (lower[0] == '8' && lower[1] == 0) {
3545 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3546 }
3547 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3548 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3549 }
3550 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3551 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3552 }
3553 }
3554 else {
3555 if (strcmp(lower, "ascii") == 0
3556 || strcmp(lower, "us_ascii") == 0) {
3557 return PyUnicode_DecodeASCII(s, size, errors);
3558 }
Steve Dowercc16be82016-09-08 10:35:16 -07003559 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003560 else if (strcmp(lower, "mbcs") == 0) {
3561 return PyUnicode_DecodeMBCS(s, size, errors);
3562 }
3563 #endif
3564 else if (strcmp(lower, "latin1") == 0
3565 || strcmp(lower, "latin_1") == 0
3566 || strcmp(lower, "iso_8859_1") == 0
3567 || strcmp(lower, "iso8859_1") == 0) {
3568 return PyUnicode_DecodeLatin1(s, size, errors);
3569 }
3570 }
Victor Stinner37296e82010-06-10 13:36:23 +00003571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003572
3573 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003574 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003575 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003576 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003577 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 if (buffer == NULL)
3579 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003580 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003581 if (unicode == NULL)
3582 goto onError;
3583 if (!PyUnicode_Check(unicode)) {
3584 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003585 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003586 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003587 encoding,
3588 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 Py_DECREF(unicode);
3590 goto onError;
3591 }
3592 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003593 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003594
Benjamin Peterson29060642009-01-31 22:14:21 +00003595 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 Py_XDECREF(buffer);
3597 return NULL;
3598}
3599
Alexander Belopolsky40018472011-02-26 01:02:56 +00003600PyObject *
3601PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003602 const char *encoding,
3603 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003604{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003605 if (!PyUnicode_Check(unicode)) {
3606 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003607 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003608 }
3609
Serhiy Storchaka00939072016-10-27 21:05:49 +03003610 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3611 "PyUnicode_AsDecodedObject() is deprecated; "
3612 "use PyCodec_Decode() to decode from str", 1) < 0)
3613 return NULL;
3614
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003615 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003616 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003617
3618 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003619 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003620}
3621
Alexander Belopolsky40018472011-02-26 01:02:56 +00003622PyObject *
3623PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003624 const char *encoding,
3625 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003626{
3627 PyObject *v;
3628
3629 if (!PyUnicode_Check(unicode)) {
3630 PyErr_BadArgument();
3631 goto onError;
3632 }
3633
Serhiy Storchaka00939072016-10-27 21:05:49 +03003634 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3635 "PyUnicode_AsDecodedUnicode() is deprecated; "
3636 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3637 return NULL;
3638
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003639 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003640 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003641
3642 /* Decode via the codec registry */
3643 v = PyCodec_Decode(unicode, encoding, errors);
3644 if (v == NULL)
3645 goto onError;
3646 if (!PyUnicode_Check(v)) {
3647 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003648 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003649 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003650 encoding,
3651 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003652 Py_DECREF(v);
3653 goto onError;
3654 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003655 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003656
Benjamin Peterson29060642009-01-31 22:14:21 +00003657 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003658 return NULL;
3659}
3660
Alexander Belopolsky40018472011-02-26 01:02:56 +00003661PyObject *
3662PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003663 Py_ssize_t size,
3664 const char *encoding,
3665 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003666{
3667 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003668
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003669 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3673 Py_DECREF(unicode);
3674 return v;
3675}
3676
Alexander Belopolsky40018472011-02-26 01:02:56 +00003677PyObject *
3678PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003679 const char *encoding,
3680 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003681{
3682 PyObject *v;
3683
3684 if (!PyUnicode_Check(unicode)) {
3685 PyErr_BadArgument();
3686 goto onError;
3687 }
3688
Serhiy Storchaka00939072016-10-27 21:05:49 +03003689 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3690 "PyUnicode_AsEncodedObject() is deprecated; "
3691 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3692 "or PyCodec_Encode() for generic encoding", 1) < 0)
3693 return NULL;
3694
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003695 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003696 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003697
3698 /* Encode via the codec registry */
3699 v = PyCodec_Encode(unicode, encoding, errors);
3700 if (v == NULL)
3701 goto onError;
3702 return v;
3703
Benjamin Peterson29060642009-01-31 22:14:21 +00003704 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003705 return NULL;
3706}
3707
Victor Stinner1b579672011-12-17 05:47:23 +01003708
Victor Stinner2cba6b82018-01-10 22:46:15 +01003709static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003710unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003711 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003712{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003713 Py_ssize_t wlen;
3714 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3715 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003716 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003717 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003718
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003719 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003720 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003721 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003722 return NULL;
3723 }
3724
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003725 char *str;
3726 size_t error_pos;
3727 const char *reason;
3728 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003729 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003730 PyMem_Free(wstr);
3731
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003732 if (res != 0) {
3733 if (res == -2) {
3734 PyObject *exc;
3735 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3736 "locale", unicode,
3737 (Py_ssize_t)error_pos,
3738 (Py_ssize_t)(error_pos+1),
3739 reason);
3740 if (exc != NULL) {
3741 PyCodec_StrictErrors(exc);
3742 Py_DECREF(exc);
3743 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003744 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003745 else if (res == -3) {
3746 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3747 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003748 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003749 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003750 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003751 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003752 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003753
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003754 PyObject *bytes = PyBytes_FromString(str);
3755 PyMem_RawFree(str);
3756 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003757}
3758
Victor Stinnerad158722010-10-27 00:25:46 +00003759PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003760PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3761{
Victor Stinner709d23d2019-05-02 14:56:30 -04003762 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3763 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003764}
3765
3766PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003767PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003768{
Victor Stinner81a7be32020-04-14 15:14:01 +02003769 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003770 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3771 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003772 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003773 fs_codec->error_handler,
3774 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003775 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003776#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003777 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003778 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003779 fs_codec->encoding,
3780 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003781 }
Victor Stinnerad158722010-10-27 00:25:46 +00003782#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003783 else {
3784 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3785 machinery is not ready and so cannot be used:
3786 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003787 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3788 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003789 assert(filesystem_errors != NULL);
3790 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3791 assert(errors != _Py_ERROR_UNKNOWN);
3792#ifdef _Py_FORCE_UTF8_FS_ENCODING
3793 return unicode_encode_utf8(unicode, errors, NULL);
3794#else
3795 return unicode_encode_locale(unicode, errors, 0);
3796#endif
3797 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003798}
3799
Alexander Belopolsky40018472011-02-26 01:02:56 +00003800PyObject *
3801PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003802 const char *encoding,
3803 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804{
3805 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003806 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003807
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 if (!PyUnicode_Check(unicode)) {
3809 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 }
Fred Drakee4315f52000-05-09 19:53:39 +00003812
Victor Stinner22eb6892019-06-26 00:51:05 +02003813 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3814 return NULL;
3815 }
3816
Victor Stinner942889a2016-09-05 15:40:10 -07003817 if (encoding == NULL) {
3818 return _PyUnicode_AsUTF8String(unicode, errors);
3819 }
3820
Fred Drakee4315f52000-05-09 19:53:39 +00003821 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003822 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3823 char *lower = buflower;
3824
3825 /* Fast paths */
3826 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3827 lower += 3;
3828 if (*lower == '_') {
3829 /* Match "utf8" and "utf_8" */
3830 lower++;
3831 }
3832
3833 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003835 }
3836 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3837 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3838 }
3839 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3840 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3841 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003842 }
Victor Stinner942889a2016-09-05 15:40:10 -07003843 else {
3844 if (strcmp(lower, "ascii") == 0
3845 || strcmp(lower, "us_ascii") == 0) {
3846 return _PyUnicode_AsASCIIString(unicode, errors);
3847 }
Steve Dowercc16be82016-09-08 10:35:16 -07003848#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003849 else if (strcmp(lower, "mbcs") == 0) {
3850 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3851 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003852#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003853 else if (strcmp(lower, "latin1") == 0 ||
3854 strcmp(lower, "latin_1") == 0 ||
3855 strcmp(lower, "iso_8859_1") == 0 ||
3856 strcmp(lower, "iso8859_1") == 0) {
3857 return _PyUnicode_AsLatin1String(unicode, errors);
3858 }
3859 }
Victor Stinner37296e82010-06-10 13:36:23 +00003860 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861
3862 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003863 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003865 return NULL;
3866
3867 /* The normal path */
3868 if (PyBytes_Check(v))
3869 return v;
3870
3871 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003872 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003873 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003874 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003875
3876 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003877 "encoder %s returned bytearray instead of bytes; "
3878 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003879 encoding);
3880 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003881 Py_DECREF(v);
3882 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003883 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003884
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003885 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3886 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003887 Py_DECREF(v);
3888 return b;
3889 }
3890
3891 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003892 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003893 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003894 encoding,
3895 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003896 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003897 return NULL;
3898}
3899
Alexander Belopolsky40018472011-02-26 01:02:56 +00003900PyObject *
3901PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003902 const char *encoding,
3903 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003904{
3905 PyObject *v;
3906
3907 if (!PyUnicode_Check(unicode)) {
3908 PyErr_BadArgument();
3909 goto onError;
3910 }
3911
Serhiy Storchaka00939072016-10-27 21:05:49 +03003912 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3913 "PyUnicode_AsEncodedUnicode() is deprecated; "
3914 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3915 return NULL;
3916
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003917 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003918 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003919
3920 /* Encode via the codec registry */
3921 v = PyCodec_Encode(unicode, encoding, errors);
3922 if (v == NULL)
3923 goto onError;
3924 if (!PyUnicode_Check(v)) {
3925 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003926 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003927 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003928 encoding,
3929 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003930 Py_DECREF(v);
3931 goto onError;
3932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003934
Benjamin Peterson29060642009-01-31 22:14:21 +00003935 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 return NULL;
3937}
3938
Victor Stinner2cba6b82018-01-10 22:46:15 +01003939static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003940unicode_decode_locale(const char *str, Py_ssize_t len,
3941 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003942{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003943 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3944 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003945 return NULL;
3946 }
3947
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003948 wchar_t *wstr;
3949 size_t wlen;
3950 const char *reason;
3951 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003952 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003953 if (res != 0) {
3954 if (res == -2) {
3955 PyObject *exc;
3956 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3957 "locale", str, len,
3958 (Py_ssize_t)wlen,
3959 (Py_ssize_t)(wlen + 1),
3960 reason);
3961 if (exc != NULL) {
3962 PyCodec_StrictErrors(exc);
3963 Py_DECREF(exc);
3964 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003965 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003966 else if (res == -3) {
3967 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3968 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003969 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003970 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003971 }
Victor Stinner2f197072011-12-17 07:08:30 +01003972 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003973 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003974
3975 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3976 PyMem_RawFree(wstr);
3977 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003978}
3979
3980PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003981PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3982 const char *errors)
3983{
Victor Stinner709d23d2019-05-02 14:56:30 -04003984 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3985 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003986}
3987
3988PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003989PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003990{
3991 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003992 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3993 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003994}
3995
3996
3997PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003998PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003999 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00004000 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4001}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004002
Christian Heimes5894ba72007-11-04 11:43:14 +00004003PyObject*
4004PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4005{
Victor Stinner81a7be32020-04-14 15:14:01 +02004006 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004007 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4008 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004009 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004010 fs_codec->error_handler,
4011 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004012 NULL);
4013 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004014#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004015 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004016 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004017 fs_codec->encoding,
4018 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004019 }
Victor Stinnerad158722010-10-27 00:25:46 +00004020#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004021 else {
4022 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4023 machinery is not ready and so cannot be used:
4024 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004025 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4026 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004027 assert(filesystem_errors != NULL);
4028 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4029 assert(errors != _Py_ERROR_UNKNOWN);
4030#ifdef _Py_FORCE_UTF8_FS_ENCODING
4031 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4032#else
4033 return unicode_decode_locale(s, size, errors, 0);
4034#endif
4035 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004036}
4037
Martin v. Löwis011e8422009-05-05 04:43:17 +00004038
4039int
4040PyUnicode_FSConverter(PyObject* arg, void* addr)
4041{
Brett Cannonec6ce872016-09-06 15:50:29 -07004042 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004043 PyObject *output = NULL;
4044 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004045 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004046 if (arg == NULL) {
4047 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004048 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004049 return 1;
4050 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004051 path = PyOS_FSPath(arg);
4052 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004053 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004054 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004055 if (PyBytes_Check(path)) {
4056 output = path;
4057 }
4058 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4059 output = PyUnicode_EncodeFSDefault(path);
4060 Py_DECREF(path);
4061 if (!output) {
4062 return 0;
4063 }
4064 assert(PyBytes_Check(output));
4065 }
4066
Victor Stinner0ea2a462010-04-30 00:22:08 +00004067 size = PyBytes_GET_SIZE(output);
4068 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004069 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004070 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004071 Py_DECREF(output);
4072 return 0;
4073 }
4074 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004075 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004076}
4077
4078
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004079int
4080PyUnicode_FSDecoder(PyObject* arg, void* addr)
4081{
Brett Cannona5711202016-09-06 19:36:01 -07004082 int is_buffer = 0;
4083 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004084 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004085 if (arg == NULL) {
4086 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004087 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004088 return 1;
4089 }
Brett Cannona5711202016-09-06 19:36:01 -07004090
4091 is_buffer = PyObject_CheckBuffer(arg);
4092 if (!is_buffer) {
4093 path = PyOS_FSPath(arg);
4094 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004095 return 0;
4096 }
Brett Cannona5711202016-09-06 19:36:01 -07004097 }
4098 else {
4099 path = arg;
4100 Py_INCREF(arg);
4101 }
4102
4103 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004104 output = path;
4105 }
4106 else if (PyBytes_Check(path) || is_buffer) {
4107 PyObject *path_bytes = NULL;
4108
4109 if (!PyBytes_Check(path) &&
4110 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004111 "path should be string, bytes, or os.PathLike, not %.200s",
4112 Py_TYPE(arg)->tp_name)) {
4113 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004114 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004115 }
4116 path_bytes = PyBytes_FromObject(path);
4117 Py_DECREF(path);
4118 if (!path_bytes) {
4119 return 0;
4120 }
4121 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4122 PyBytes_GET_SIZE(path_bytes));
4123 Py_DECREF(path_bytes);
4124 if (!output) {
4125 return 0;
4126 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004127 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004128 else {
4129 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004130 "path should be string, bytes, or os.PathLike, not %.200s",
4131 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004132 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004133 return 0;
4134 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004135 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004136 Py_DECREF(output);
4137 return 0;
4138 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004140 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004141 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004142 Py_DECREF(output);
4143 return 0;
4144 }
4145 *(PyObject**)addr = output;
4146 return Py_CLEANUP_SUPPORTED;
4147}
4148
4149
Inada Naoki02a4d572020-02-27 13:48:59 +09004150static int unicode_fill_utf8(PyObject *unicode);
4151
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004152const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004153PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004154{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004155 if (!PyUnicode_Check(unicode)) {
4156 PyErr_BadArgument();
4157 return NULL;
4158 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004159 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004160 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004161
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004162 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004163 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004164 return NULL;
4165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166 }
4167
4168 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004169 *psize = PyUnicode_UTF8_LENGTH(unicode);
4170 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004171}
4172
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004173const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004174PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004175{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004176 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4177}
4178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004179Py_UNICODE *
4180PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4181{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004182 if (!PyUnicode_Check(unicode)) {
4183 PyErr_BadArgument();
4184 return NULL;
4185 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004186 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4187 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004188 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004189 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004190 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004191
Serhiy Storchakac46db922018-10-23 22:58:24 +03004192 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4193 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4194 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004195 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004196 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004197 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4198 if (w == NULL) {
4199 PyErr_NoMemory();
4200 return NULL;
4201 }
4202 unicode_copy_as_widechar(unicode, w, wlen + 1);
4203 _PyUnicode_WSTR(unicode) = w;
4204 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4205 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004206 }
4207 }
4208 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004209 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004210 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004211}
4212
Inada Naoki2c4928d2020-06-17 20:09:44 +09004213/* Deprecated APIs */
4214
4215_Py_COMP_DIAG_PUSH
4216_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4217
Alexander Belopolsky40018472011-02-26 01:02:56 +00004218Py_UNICODE *
4219PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004220{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004221 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222}
4223
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004224const Py_UNICODE *
4225_PyUnicode_AsUnicode(PyObject *unicode)
4226{
4227 Py_ssize_t size;
4228 const Py_UNICODE *wstr;
4229
4230 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4231 if (wstr && wcslen(wstr) != (size_t)size) {
4232 PyErr_SetString(PyExc_ValueError, "embedded null character");
4233 return NULL;
4234 }
4235 return wstr;
4236}
4237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238
Alexander Belopolsky40018472011-02-26 01:02:56 +00004239Py_ssize_t
4240PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004241{
4242 if (!PyUnicode_Check(unicode)) {
4243 PyErr_BadArgument();
4244 goto onError;
4245 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004246 if (_PyUnicode_WSTR(unicode) == NULL) {
4247 if (PyUnicode_AsUnicode(unicode) == NULL)
4248 goto onError;
4249 }
4250 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253 return -1;
4254}
4255
Inada Naoki2c4928d2020-06-17 20:09:44 +09004256_Py_COMP_DIAG_POP
4257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004258Py_ssize_t
4259PyUnicode_GetLength(PyObject *unicode)
4260{
Victor Stinner07621332012-06-16 04:53:46 +02004261 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004262 PyErr_BadArgument();
4263 return -1;
4264 }
Victor Stinner07621332012-06-16 04:53:46 +02004265 if (PyUnicode_READY(unicode) == -1)
4266 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004267 return PyUnicode_GET_LENGTH(unicode);
4268}
4269
4270Py_UCS4
4271PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4272{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004273 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004274 int kind;
4275
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004276 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004277 PyErr_BadArgument();
4278 return (Py_UCS4)-1;
4279 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004280 if (PyUnicode_READY(unicode) == -1) {
4281 return (Py_UCS4)-1;
4282 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004283 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004284 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004285 return (Py_UCS4)-1;
4286 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004287 data = PyUnicode_DATA(unicode);
4288 kind = PyUnicode_KIND(unicode);
4289 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004290}
4291
4292int
4293PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4294{
4295 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004296 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004297 return -1;
4298 }
Victor Stinner488fa492011-12-12 00:01:39 +01004299 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004300 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004301 PyErr_SetString(PyExc_IndexError, "string index out of range");
4302 return -1;
4303 }
Victor Stinner488fa492011-12-12 00:01:39 +01004304 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004305 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004306 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4307 PyErr_SetString(PyExc_ValueError, "character out of range");
4308 return -1;
4309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004310 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4311 index, ch);
4312 return 0;
4313}
4314
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4320
Victor Stinner554f3f02010-06-16 23:33:54 +00004321/* create or adjust a UnicodeDecodeError */
4322static void
4323make_decode_exception(PyObject **exceptionObject,
4324 const char *encoding,
4325 const char *input, Py_ssize_t length,
4326 Py_ssize_t startpos, Py_ssize_t endpos,
4327 const char *reason)
4328{
4329 if (*exceptionObject == NULL) {
4330 *exceptionObject = PyUnicodeDecodeError_Create(
4331 encoding, input, length, startpos, endpos, reason);
4332 }
4333 else {
4334 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4335 goto onError;
4336 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4337 goto onError;
4338 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4339 goto onError;
4340 }
4341 return;
4342
4343onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004344 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004345}
4346
Steve Dowercc16be82016-09-08 10:35:16 -07004347#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004348static int
4349widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4350{
4351 if (newsize > *size) {
4352 wchar_t *newbuf = *buf;
4353 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4354 PyErr_NoMemory();
4355 return -1;
4356 }
4357 *buf = newbuf;
4358 }
4359 *size = newsize;
4360 return 0;
4361}
4362
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363/* error handling callback helper:
4364 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004365 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366 and adjust various state variables.
4367 return 0 on success, -1 on error
4368*/
4369
Alexander Belopolsky40018472011-02-26 01:02:56 +00004370static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004371unicode_decode_call_errorhandler_wchar(
4372 const char *errors, PyObject **errorHandler,
4373 const char *encoding, const char *reason,
4374 const char **input, const char **inend, Py_ssize_t *startinpos,
4375 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004376 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004377{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004378 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379
4380 PyObject *restuple = NULL;
4381 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004382 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004383 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004384 Py_ssize_t requiredsize;
4385 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004386 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004387 wchar_t *repwstr;
4388 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389
4390 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004391 *errorHandler = PyCodec_LookupError(errors);
4392 if (*errorHandler == NULL)
4393 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 }
4395
Victor Stinner554f3f02010-06-16 23:33:54 +00004396 make_decode_exception(exceptionObject,
4397 encoding,
4398 *input, *inend - *input,
4399 *startinpos, *endinpos,
4400 reason);
4401 if (*exceptionObject == NULL)
4402 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403
Petr Viktorinffd97532020-02-11 17:46:57 +01004404 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004408 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004409 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004411 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004413
4414 /* Copy back the bytes variables, which might have been modified by the
4415 callback */
4416 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4417 if (!inputobj)
4418 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419 *input = PyBytes_AS_STRING(inputobj);
4420 insize = PyBytes_GET_SIZE(inputobj);
4421 *inend = *input + insize;
4422 /* we can DECREF safely, as the exception has another reference,
4423 so the object won't go away. */
4424 Py_DECREF(inputobj);
4425
4426 if (newpos<0)
4427 newpos = insize+newpos;
4428 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004429 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004430 goto onError;
4431 }
4432
4433 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4434 if (repwstr == NULL)
4435 goto onError;
4436 /* need more space? (at least enough for what we
4437 have+the replacement+the rest of the string (starting
4438 at the new input position), so we won't have to check space
4439 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004440 requiredsize = *outpos;
4441 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4442 goto overflow;
4443 requiredsize += repwlen;
4444 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4445 goto overflow;
4446 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004447 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004448 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004449 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004450 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004451 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004452 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004453 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004454 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004455 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004456 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004457 *endinpos = newpos;
4458 *inptr = *input + newpos;
4459
4460 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004461 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004462 return 0;
4463
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004464 overflow:
4465 PyErr_SetString(PyExc_OverflowError,
4466 "decoded result is too long for a Python string");
4467
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004468 onError:
4469 Py_XDECREF(restuple);
4470 return -1;
4471}
Steve Dowercc16be82016-09-08 10:35:16 -07004472#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004473
4474static int
4475unicode_decode_call_errorhandler_writer(
4476 const char *errors, PyObject **errorHandler,
4477 const char *encoding, const char *reason,
4478 const char **input, const char **inend, Py_ssize_t *startinpos,
4479 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4480 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4481{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004482 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004483
4484 PyObject *restuple = NULL;
4485 PyObject *repunicode = NULL;
4486 Py_ssize_t insize;
4487 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004488 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004489 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004490 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004491 int need_to_grow = 0;
4492 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004493
4494 if (*errorHandler == NULL) {
4495 *errorHandler = PyCodec_LookupError(errors);
4496 if (*errorHandler == NULL)
4497 goto onError;
4498 }
4499
4500 make_decode_exception(exceptionObject,
4501 encoding,
4502 *input, *inend - *input,
4503 *startinpos, *endinpos,
4504 reason);
4505 if (*exceptionObject == NULL)
4506 goto onError;
4507
Petr Viktorinffd97532020-02-11 17:46:57 +01004508 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004509 if (restuple == NULL)
4510 goto onError;
4511 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004512 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004513 goto onError;
4514 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004515 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004516 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004517
4518 /* Copy back the bytes variables, which might have been modified by the
4519 callback */
4520 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4521 if (!inputobj)
4522 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004523 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004524 *input = PyBytes_AS_STRING(inputobj);
4525 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004526 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004527 /* we can DECREF safely, as the exception has another reference,
4528 so the object won't go away. */
4529 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004530
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004532 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004533 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004534 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004535 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004536 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537
Victor Stinner170ca6f2013-04-18 00:25:28 +02004538 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004539 if (replen > 1) {
4540 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004541 need_to_grow = 1;
4542 }
4543 new_inptr = *input + newpos;
4544 if (*inend - new_inptr > remain) {
4545 /* We don't know the decoding algorithm here so we make the worst
4546 assumption that one byte decodes to one unicode character.
4547 If unfortunately one byte could decode to more unicode characters,
4548 the decoder may write out-of-bound then. Is it possible for the
4549 algorithms using this function? */
4550 writer->min_length += *inend - new_inptr - remain;
4551 need_to_grow = 1;
4552 }
4553 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004554 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004555 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004556 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4557 goto onError;
4558 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004559 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004560 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004561
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004563 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004564
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004566 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004567 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004571 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572}
4573
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004574/* --- UTF-7 Codec -------------------------------------------------------- */
4575
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576/* See RFC2152 for details. We encode conservatively and decode liberally. */
4577
/* Three simple macros defining base-64.  Each one may evaluate its
   argument more than once, so pass side-effect-free expressions only. */

/* Is c a base-64 character?  (letters, digits, '+' and '/') */

#define IS_BASE64(c)                      \
    (((c) >= 'A' && (c) <= 'Z') ||        \
     ((c) >= 'a' && (c) <= 'z') ||        \
     ((c) >= '0' && (c) <= '9') ||        \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62,
   anything else (i.e. '/') -> 63 */

#define FROM_BASE64(c)                                \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :         \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :    \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :    \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4600
/* DECODE_DIRECT: true when a byte met in a UTF-7 string outside a base64
 * section should be decoded as itself.  Decoding is permissive: every
 * value up to 127 qualifies except '+', which begins a base64
 * string. */

#define DECODE_DIRECT(c)  \
    ((c) <= 127 && (c) != '+')
4608
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c:        character code; only 1..127 can ever be direct.
 * directO:  non-zero to emit Set O characters unencoded.
 * directWS: non-zero to emit whitespace unencoded.
 *
 * Both flags are parenthesized so that compound expressions (for
 * example `a || b`) are evaluated as a whole before being combined
 * with the category test.  c may be evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004654
Alexander Belopolsky40018472011-02-26 01:02:56 +00004655PyObject *
4656PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004657 Py_ssize_t size,
4658 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004659{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004660 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4661}
4662
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663/* The decoder. The only state we preserve is our read position,
4664 * i.e. how many characters we have consumed. So if we end in the
4665 * middle of a shift sequence we have to back off the read position
4666 * and the output to the beginning of the sequence, otherwise we lose
4667 * all the shift state (seen bits, number of bits seen, high
4668 * surrogate). */
4669
Alexander Belopolsky40018472011-02-26 01:02:56 +00004670PyObject *
4671PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004672 Py_ssize_t size,
4673 const char *errors,
4674 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004675{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004677 Py_ssize_t startinpos;
4678 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004679 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004680 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004681 const char *errmsg = "";
4682 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004683 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004684 unsigned int base64bits = 0;
4685 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004686 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004687 PyObject *errorHandler = NULL;
4688 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004689
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004690 if (size == 0) {
4691 if (consumed)
4692 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004693 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004694 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004696 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004697 _PyUnicodeWriter_Init(&writer);
4698 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004699
4700 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004701 e = s + size;
4702
4703 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004704 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004705 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004706 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004707
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708 if (inShift) { /* in a base-64 section */
4709 if (IS_BASE64(ch)) { /* consume a base-64 character */
4710 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4711 base64bits += 6;
4712 s++;
4713 if (base64bits >= 16) {
4714 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004715 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004716 base64bits -= 16;
4717 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004718 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004719 if (surrogate) {
4720 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004721 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4722 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004723 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004724 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004725 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004726 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004727 }
4728 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004729 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004730 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004731 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004732 }
4733 }
Victor Stinner551ac952011-11-29 22:58:13 +01004734 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004735 /* first surrogate */
4736 surrogate = outCh;
4737 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004738 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004739 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004740 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004741 }
4742 }
4743 }
4744 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004746 if (base64bits > 0) { /* left-over bits */
4747 if (base64bits >= 6) {
4748 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004749 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004750 errmsg = "partial character in shift sequence";
4751 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004752 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004753 else {
4754 /* Some bits remain; they should be zero */
4755 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004756 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004757 errmsg = "non-zero padding bits in shift sequence";
4758 goto utf7Error;
4759 }
4760 }
4761 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004762 if (surrogate && DECODE_DIRECT(ch)) {
4763 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4764 goto onError;
4765 }
4766 surrogate = 0;
4767 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004768 /* '-' is absorbed; other terminating
4769 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004770 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004771 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004772 }
4773 }
4774 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004776 s++; /* consume '+' */
4777 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004778 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004779 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004780 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004782 else if (s < e && !IS_BASE64(*s)) {
4783 s++;
4784 errmsg = "ill-formed sequence";
4785 goto utf7Error;
4786 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004787 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004788 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004789 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004790 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004792 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004793 }
4794 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004795 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004796 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004797 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004798 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004799 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004800 else {
4801 startinpos = s-starts;
4802 s++;
4803 errmsg = "unexpected special character";
4804 goto utf7Error;
4805 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004806 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004807utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004809 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004810 errors, &errorHandler,
4811 "utf7", errmsg,
4812 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004813 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004814 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004815 }
4816
Antoine Pitrou244651a2009-05-04 18:56:13 +00004817 /* end of string */
4818
4819 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4820 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004821 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004822 if (surrogate ||
4823 (base64bits >= 6) ||
4824 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004825 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004826 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004827 errors, &errorHandler,
4828 "utf7", "unterminated shift sequence",
4829 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004830 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004831 goto onError;
4832 if (s < e)
4833 goto restart;
4834 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004835 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004836
4837 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004838 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004839 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004840 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004841 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004842 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004843 writer.kind, writer.data, shiftOutStart);
4844 Py_XDECREF(errorHandler);
4845 Py_XDECREF(exc);
4846 _PyUnicodeWriter_Dealloc(&writer);
4847 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004848 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004849 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004850 }
4851 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004852 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004853 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004854 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004855
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 Py_XDECREF(errorHandler);
4857 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004858 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004859
Benjamin Peterson29060642009-01-31 22:14:21 +00004860 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 Py_XDECREF(errorHandler);
4862 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004863 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004864 return NULL;
4865}
4866
4867
Alexander Belopolsky40018472011-02-26 01:02:56 +00004868PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004869_PyUnicode_EncodeUTF7(PyObject *str,
4870 int base64SetO,
4871 int base64WhiteSpace,
4872 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004873{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004874 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004875 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004876 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004877 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004878 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004879 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004880 unsigned int base64bits = 0;
4881 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004882 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004883 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004884
Benjamin Petersonbac79492012-01-14 13:34:47 -05004885 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004886 return NULL;
4887 kind = PyUnicode_KIND(str);
4888 data = PyUnicode_DATA(str);
4889 len = PyUnicode_GET_LENGTH(str);
4890
4891 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004893
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004894 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004895 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004896 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004897 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004898 if (v == NULL)
4899 return NULL;
4900
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004901 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004902 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004903 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004904
Antoine Pitrou244651a2009-05-04 18:56:13 +00004905 if (inShift) {
4906 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4907 /* shifting out */
4908 if (base64bits) { /* output remaining bits */
4909 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4910 base64buffer = 0;
4911 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004912 }
4913 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004914 /* Characters not in the BASE64 set implicitly unshift the sequence
4915 so no '-' is required, except if the character is itself a '-' */
4916 if (IS_BASE64(ch) || ch == '-') {
4917 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004918 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004919 *out++ = (char) ch;
4920 }
4921 else {
4922 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004923 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004924 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004925 else { /* not in a shift sequence */
4926 if (ch == '+') {
4927 *out++ = '+';
4928 *out++ = '-';
4929 }
4930 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4931 *out++ = (char) ch;
4932 }
4933 else {
4934 *out++ = '+';
4935 inShift = 1;
4936 goto encode_char;
4937 }
4938 }
4939 continue;
4940encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004941 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004942 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004943
Antoine Pitrou244651a2009-05-04 18:56:13 +00004944 /* code first surrogate */
4945 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004946 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004947 while (base64bits >= 6) {
4948 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4949 base64bits -= 6;
4950 }
4951 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004952 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004953 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004954 base64bits += 16;
4955 base64buffer = (base64buffer << 16) | ch;
4956 while (base64bits >= 6) {
4957 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4958 base64bits -= 6;
4959 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004960 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004961 if (base64bits)
4962 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4963 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004964 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004965 if (_PyBytes_Resize(&v, out - start) < 0)
4966 return NULL;
4967 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004968}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004969PyObject *
4970PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4971 Py_ssize_t size,
4972 int base64SetO,
4973 int base64WhiteSpace,
4974 const char *errors)
4975{
4976 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004977 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004978 if (tmp == NULL)
4979 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004980 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004981 base64WhiteSpace, errors);
4982 Py_DECREF(tmp);
4983 return result;
4984}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004985
Antoine Pitrou244651a2009-05-04 18:56:13 +00004986#undef IS_BASE64
4987#undef FROM_BASE64
4988#undef TO_BASE64
4989#undef DECODE_DIRECT
4990#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004991
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992/* --- UTF-8 Codec -------------------------------------------------------- */
4993
Alexander Belopolsky40018472011-02-26 01:02:56 +00004994PyObject *
4995PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004996 Py_ssize_t size,
4997 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998{
Walter Dörwald69652032004-09-07 20:24:22 +00004999 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5000}
5001
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005002#include "stringlib/asciilib.h"
5003#include "stringlib/codecs.h"
5004#include "stringlib/undef.h"
5005
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005006#include "stringlib/ucs1lib.h"
5007#include "stringlib/codecs.h"
5008#include "stringlib/undef.h"
5009
5010#include "stringlib/ucs2lib.h"
5011#include "stringlib/codecs.h"
5012#include "stringlib/undef.h"
5013
5014#include "stringlib/ucs4lib.h"
5015#include "stringlib/codecs.h"
5016#include "stringlib/undef.h"
5017
Antoine Pitrouab868312009-01-10 15:40:25 +00005018/* Mask to quickly check whether a C 'long' contains a
5019 non-ASCII, UTF8-encoded char. */
5020#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02005021# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00005022#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02005023# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00005024#else
5025# error C 'long' size should be either 4 or 8!
5026#endif
5027
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005028static Py_ssize_t
5029ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005030{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005031 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005032 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005033
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005034 /*
5035 * Issue #17237: m68k is a bit different from most architectures in
5036 * that objects do not use "natural alignment" - for example, int and
5037 * long are only aligned at 2-byte boundaries. Therefore the assert()
5038 * won't work; also, tests have shown that skipping the "optimised
5039 * version" will even speed up m68k.
5040 */
5041#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005042#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005043 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
5044 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005045 /* Fast path, see in STRINGLIB(utf8_decode) for
5046 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005047 /* Help allocation */
5048 const char *_p = p;
5049 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005050 while (_p < aligned_end) {
5051 unsigned long value = *(const unsigned long *) _p;
5052 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005054 *((unsigned long *)q) = value;
5055 _p += SIZEOF_LONG;
5056 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005057 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005058 p = _p;
5059 while (p < end) {
5060 if ((unsigned char)*p & 0x80)
5061 break;
5062 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005064 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005066#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005067#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005068 while (p < end) {
5069 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5070 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005071 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005072 /* Help allocation */
5073 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005074 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06005075 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076 if (value & ASCII_CHAR_MASK)
5077 break;
5078 _p += SIZEOF_LONG;
5079 }
5080 p = _p;
5081 if (_p == end)
5082 break;
5083 }
5084 if ((unsigned char)*p & 0x80)
5085 break;
5086 ++p;
5087 }
5088 memcpy(dest, start, p - start);
5089 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090}
Antoine Pitrouab868312009-01-10 15:40:25 +00005091
Victor Stinner709d23d2019-05-02 14:56:30 -04005092static PyObject *
5093unicode_decode_utf8(const char *s, Py_ssize_t size,
5094 _Py_error_handler error_handler, const char *errors,
5095 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005096{
Victor Stinner785938e2011-12-11 20:09:03 +01005097 if (size == 0) {
5098 if (consumed)
5099 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005100 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005101 }
5102
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005103 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5104 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005105 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005106 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005107 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005108 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005109 }
5110
Inada Naoki770847a2019-06-24 12:30:24 +09005111 const char *starts = s;
5112 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005113
Inada Naoki770847a2019-06-24 12:30:24 +09005114 // fast path: try ASCII string.
5115 PyObject *u = PyUnicode_New(size, 127);
5116 if (u == NULL) {
5117 return NULL;
5118 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005119 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005120 if (s == end) {
5121 return u;
5122 }
5123
5124 // Use _PyUnicodeWriter after fast path is failed.
5125 _PyUnicodeWriter writer;
5126 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5127 writer.pos = s - starts;
5128
5129 Py_ssize_t startinpos, endinpos;
5130 const char *errmsg = "";
5131 PyObject *error_handler_obj = NULL;
5132 PyObject *exc = NULL;
5133
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005134 while (s < end) {
5135 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005136 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005137
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005138 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005139 if (PyUnicode_IS_ASCII(writer.buffer))
5140 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005141 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005142 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005143 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005144 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005145 } else {
5146 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005147 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005148 }
5149
5150 switch (ch) {
5151 case 0:
5152 if (s == end || consumed)
5153 goto End;
5154 errmsg = "unexpected end of data";
5155 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005156 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005157 break;
5158 case 1:
5159 errmsg = "invalid start byte";
5160 startinpos = s - starts;
5161 endinpos = startinpos + 1;
5162 break;
5163 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005164 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5165 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5166 {
5167 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005168 goto End;
5169 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005170 /* fall through */
5171 case 3:
5172 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005173 errmsg = "invalid continuation byte";
5174 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005175 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005176 break;
5177 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005178 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005179 goto onError;
5180 continue;
5181 }
5182
Victor Stinner1d65d912015-10-05 13:43:50 +02005183 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005184 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005185
5186 switch (error_handler) {
5187 case _Py_ERROR_IGNORE:
5188 s += (endinpos - startinpos);
5189 break;
5190
5191 case _Py_ERROR_REPLACE:
5192 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5193 goto onError;
5194 s += (endinpos - startinpos);
5195 break;
5196
5197 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005198 {
5199 Py_ssize_t i;
5200
Victor Stinner1d65d912015-10-05 13:43:50 +02005201 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5202 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005203 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005204 ch = (Py_UCS4)(unsigned char)(starts[i]);
5205 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5206 ch + 0xdc00);
5207 writer.pos++;
5208 }
5209 s += (endinpos - startinpos);
5210 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005211 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005212
5213 default:
5214 if (unicode_decode_call_errorhandler_writer(
5215 errors, &error_handler_obj,
5216 "utf-8", errmsg,
5217 &starts, &end, &startinpos, &endinpos, &exc, &s,
5218 &writer))
5219 goto onError;
5220 }
Victor Stinner785938e2011-12-11 20:09:03 +01005221 }
5222
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005223End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005224 if (consumed)
5225 *consumed = s - starts;
5226
Victor Stinner1d65d912015-10-05 13:43:50 +02005227 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005228 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005229 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005230
5231onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005232 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005233 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005234 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005235 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005236}
5237
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005238
Victor Stinner709d23d2019-05-02 14:56:30 -04005239PyObject *
5240PyUnicode_DecodeUTF8Stateful(const char *s,
5241 Py_ssize_t size,
5242 const char *errors,
5243 Py_ssize_t *consumed)
5244{
5245 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5246}
5247
5248
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005249/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5250 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005251
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005252 On success, write a pointer to a newly allocated wide character string into
5253 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5254 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005255
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005256 On memory allocation failure, return -1.
5257
5258 On decoding error (if surrogateescape is zero), return -2. If wlen is
5259 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5260 is not NULL, write the decoding error message into *reason. */
5261int
5262_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005263 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005264{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005265 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005266 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005267 wchar_t *unicode;
5268 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005269
Victor Stinner3d4226a2018-08-29 22:21:32 +02005270 int surrogateescape = 0;
5271 int surrogatepass = 0;
5272 switch (errors)
5273 {
5274 case _Py_ERROR_STRICT:
5275 break;
5276 case _Py_ERROR_SURROGATEESCAPE:
5277 surrogateescape = 1;
5278 break;
5279 case _Py_ERROR_SURROGATEPASS:
5280 surrogatepass = 1;
5281 break;
5282 default:
5283 return -3;
5284 }
5285
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005286 /* Note: size will always be longer than the resulting Unicode
5287 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005288 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005289 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005290 }
5291
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005292 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005293 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005294 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005295 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005296
5297 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005298 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005299 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005300 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005301 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005302#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005303 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005304#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005305 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005306#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005307 if (ch > 0xFF) {
5308#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005309 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005310#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005311 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005312 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005313 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5314 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5315#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005316 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005317 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005318 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005319 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005320 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005321
5322 if (surrogateescape) {
5323 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5324 }
5325 else {
5326 /* Is it a valid three-byte code? */
5327 if (surrogatepass
5328 && (e - s) >= 3
5329 && (s[0] & 0xf0) == 0xe0
5330 && (s[1] & 0xc0) == 0x80
5331 && (s[2] & 0xc0) == 0x80)
5332 {
5333 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5334 s += 3;
5335 unicode[outpos++] = ch;
5336 }
5337 else {
5338 PyMem_RawFree(unicode );
5339 if (reason != NULL) {
5340 switch (ch) {
5341 case 0:
5342 *reason = "unexpected end of data";
5343 break;
5344 case 1:
5345 *reason = "invalid start byte";
5346 break;
5347 /* 2, 3, 4 */
5348 default:
5349 *reason = "invalid continuation byte";
5350 break;
5351 }
5352 }
5353 if (wlen != NULL) {
5354 *wlen = s - orig_s;
5355 }
5356 return -2;
5357 }
5358 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005359 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005360 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005361 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005362 if (wlen) {
5363 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005364 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005365 *wstr = unicode;
5366 return 0;
5367}
5368
Victor Stinner5f9cf232019-03-19 01:46:25 +01005369
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005370wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005371_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5372 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005373{
5374 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005375 int res = _Py_DecodeUTF8Ex(arg, arglen,
5376 &wstr, wlen,
5377 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005378 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005379 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5380 assert(res != -3);
5381 if (wlen) {
5382 *wlen = (size_t)res;
5383 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005384 return NULL;
5385 }
5386 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005387}
5388
Antoine Pitrouab868312009-01-10 15:40:25 +00005389
Victor Stinnere47e6982017-12-21 15:45:16 +01005390/* UTF-8 encoder using the surrogateescape error handler .
5391
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005392 On success, return 0 and write the newly allocated character string (use
5393 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005394
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005395 On encoding failure, return -2 and write the position of the invalid
5396 surrogate character into *error_pos (if error_pos is set) and the decoding
5397 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005398
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005399 On memory allocation failure, return -1. */
5400int
5401_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005402 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005403{
5404 const Py_ssize_t max_char_size = 4;
5405 Py_ssize_t len = wcslen(text);
5406
5407 assert(len >= 0);
5408
Victor Stinner3d4226a2018-08-29 22:21:32 +02005409 int surrogateescape = 0;
5410 int surrogatepass = 0;
5411 switch (errors)
5412 {
5413 case _Py_ERROR_STRICT:
5414 break;
5415 case _Py_ERROR_SURROGATEESCAPE:
5416 surrogateescape = 1;
5417 break;
5418 case _Py_ERROR_SURROGATEPASS:
5419 surrogatepass = 1;
5420 break;
5421 default:
5422 return -3;
5423 }
5424
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005425 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5426 return -1;
5427 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005428 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005429 if (raw_malloc) {
5430 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005431 }
5432 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005433 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005434 }
5435 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005436 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005437 }
5438
5439 char *p = bytes;
5440 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005441 for (i = 0; i < len; ) {
5442 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005443 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005444 i++;
5445#if Py_UNICODE_SIZE == 2
5446 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5447 && i < len
5448 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5449 {
5450 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5451 i++;
5452 }
5453#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005454
5455 if (ch < 0x80) {
5456 /* Encode ASCII */
5457 *p++ = (char) ch;
5458
5459 }
5460 else if (ch < 0x0800) {
5461 /* Encode Latin-1 */
5462 *p++ = (char)(0xc0 | (ch >> 6));
5463 *p++ = (char)(0x80 | (ch & 0x3f));
5464 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005465 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005466 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005467 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005468 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005469 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005470 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005471 if (reason != NULL) {
5472 *reason = "encoding error";
5473 }
5474 if (raw_malloc) {
5475 PyMem_RawFree(bytes);
5476 }
5477 else {
5478 PyMem_Free(bytes);
5479 }
5480 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005481 }
5482 *p++ = (char)(ch & 0xff);
5483 }
5484 else if (ch < 0x10000) {
5485 *p++ = (char)(0xe0 | (ch >> 12));
5486 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5487 *p++ = (char)(0x80 | (ch & 0x3f));
5488 }
5489 else { /* ch >= 0x10000 */
5490 assert(ch <= MAX_UNICODE);
5491 /* Encode UCS4 Unicode ordinals */
5492 *p++ = (char)(0xf0 | (ch >> 18));
5493 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5494 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5495 *p++ = (char)(0x80 | (ch & 0x3f));
5496 }
5497 }
5498 *p++ = '\0';
5499
5500 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005501 char *bytes2;
5502 if (raw_malloc) {
5503 bytes2 = PyMem_RawRealloc(bytes, final_size);
5504 }
5505 else {
5506 bytes2 = PyMem_Realloc(bytes, final_size);
5507 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005508 if (bytes2 == NULL) {
5509 if (error_pos != NULL) {
5510 *error_pos = (size_t)-1;
5511 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005512 if (raw_malloc) {
5513 PyMem_RawFree(bytes);
5514 }
5515 else {
5516 PyMem_Free(bytes);
5517 }
5518 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005519 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005520 *str = bytes2;
5521 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005522}
5523
5524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005525/* Primary internal function which creates utf8 encoded bytes objects.
5526
5527 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005528 and allocate exactly as much space needed at the end. Else allocate the
5529 maximum possible needed (4 result bytes per Unicode character), and return
5530 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005531*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005532static PyObject *
5533unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5534 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005536 if (!PyUnicode_Check(unicode)) {
5537 PyErr_BadArgument();
5538 return NULL;
5539 }
5540
5541 if (PyUnicode_READY(unicode) == -1)
5542 return NULL;
5543
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005544 if (PyUnicode_UTF8(unicode))
5545 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5546 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005547
Inada Naoki02a4d572020-02-27 13:48:59 +09005548 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005549 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005550 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5551
5552 _PyBytesWriter writer;
5553 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005554
Benjamin Petersonead6b532011-12-20 17:23:42 -06005555 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005556 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005557 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005558 case PyUnicode_1BYTE_KIND:
5559 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5560 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005561 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5562 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005563 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005564 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5565 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005566 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005567 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5568 break;
Tim Peters602f7402002-04-27 18:03:26 +00005569 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005570
5571 if (end == NULL) {
5572 _PyBytesWriter_Dealloc(&writer);
5573 return NULL;
5574 }
5575 return _PyBytesWriter_Finish(&writer, end);
5576}
5577
5578static int
5579unicode_fill_utf8(PyObject *unicode)
5580{
5581 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5582 assert(!PyUnicode_IS_ASCII(unicode));
5583
5584 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005585 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005586 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5587
5588 _PyBytesWriter writer;
5589 char *end;
5590
5591 switch (kind) {
5592 default:
5593 Py_UNREACHABLE();
5594 case PyUnicode_1BYTE_KIND:
5595 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5596 _Py_ERROR_STRICT, NULL);
5597 break;
5598 case PyUnicode_2BYTE_KIND:
5599 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5600 _Py_ERROR_STRICT, NULL);
5601 break;
5602 case PyUnicode_4BYTE_KIND:
5603 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5604 _Py_ERROR_STRICT, NULL);
5605 break;
5606 }
5607 if (end == NULL) {
5608 _PyBytesWriter_Dealloc(&writer);
5609 return -1;
5610 }
5611
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005612 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005613 PyBytes_AS_STRING(writer.buffer);
5614 Py_ssize_t len = end - start;
5615
5616 char *cache = PyObject_MALLOC(len + 1);
5617 if (cache == NULL) {
5618 _PyBytesWriter_Dealloc(&writer);
5619 PyErr_NoMemory();
5620 return -1;
5621 }
5622 _PyUnicode_UTF8(unicode) = cache;
5623 _PyUnicode_UTF8_LENGTH(unicode) = len;
5624 memcpy(cache, start, len);
5625 cache[len] = '\0';
5626 _PyBytesWriter_Dealloc(&writer);
5627 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628}
5629
Alexander Belopolsky40018472011-02-26 01:02:56 +00005630PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005631_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5632{
5633 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5634}
5635
5636
5637PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005638PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5639 Py_ssize_t size,
5640 const char *errors)
5641{
5642 PyObject *v, *unicode;
5643
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005644 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005645 if (unicode == NULL)
5646 return NULL;
5647 v = _PyUnicode_AsUTF8String(unicode, errors);
5648 Py_DECREF(unicode);
5649 return v;
5650}
5651
5652PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005653PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005655 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656}
5657
Walter Dörwald41980ca2007-08-16 21:55:45 +00005658/* --- UTF-32 Codec ------------------------------------------------------- */
5659
5660PyObject *
5661PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005662 Py_ssize_t size,
5663 const char *errors,
5664 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005665{
5666 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5667}
5668
5669PyObject *
5670PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 Py_ssize_t size,
5672 const char *errors,
5673 int *byteorder,
5674 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005675{
5676 const char *starts = s;
5677 Py_ssize_t startinpos;
5678 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005679 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005680 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005681 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005682 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005683 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005684 PyObject *errorHandler = NULL;
5685 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005686
Andy Lestere6be9b52020-02-11 20:28:35 -06005687 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005688 e = q + size;
5689
5690 if (byteorder)
5691 bo = *byteorder;
5692
5693 /* Check for BOM marks (U+FEFF) in the input and adjust current
5694 byte order setting accordingly. In native mode, the leading BOM
5695 mark is skipped, in all other modes, it is copied to the output
5696 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005697 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005698 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005699 if (bom == 0x0000FEFF) {
5700 bo = -1;
5701 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005703 else if (bom == 0xFFFE0000) {
5704 bo = 1;
5705 q += 4;
5706 }
5707 if (byteorder)
5708 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005709 }
5710
Victor Stinnere64322e2012-10-30 23:12:47 +01005711 if (q == e) {
5712 if (consumed)
5713 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005714 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005715 }
5716
Victor Stinnere64322e2012-10-30 23:12:47 +01005717#ifdef WORDS_BIGENDIAN
5718 le = bo < 0;
5719#else
5720 le = bo <= 0;
5721#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005722 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005723
Victor Stinner8f674cc2013-04-17 23:02:17 +02005724 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005725 writer.min_length = (e - q + 3) / 4;
5726 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005727 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005728
Victor Stinnere64322e2012-10-30 23:12:47 +01005729 while (1) {
5730 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005731 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005732
Victor Stinnere64322e2012-10-30 23:12:47 +01005733 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005734 enum PyUnicode_Kind kind = writer.kind;
5735 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005736 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005737 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005738 if (le) {
5739 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005740 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005741 if (ch > maxch)
5742 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005743 if (kind != PyUnicode_1BYTE_KIND &&
5744 Py_UNICODE_IS_SURROGATE(ch))
5745 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005746 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005747 q += 4;
5748 } while (q <= last);
5749 }
5750 else {
5751 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005752 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005753 if (ch > maxch)
5754 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005755 if (kind != PyUnicode_1BYTE_KIND &&
5756 Py_UNICODE_IS_SURROGATE(ch))
5757 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005758 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005759 q += 4;
5760 } while (q <= last);
5761 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005762 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005763 }
5764
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005765 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005766 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 startinpos = ((const char *)q) - starts;
5768 endinpos = startinpos + 4;
5769 }
5770 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005771 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005773 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005775 startinpos = ((const char *)q) - starts;
5776 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005778 else {
5779 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005780 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005781 goto onError;
5782 q += 4;
5783 continue;
5784 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005785 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005786 startinpos = ((const char *)q) - starts;
5787 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005789
5790 /* The remaining input chars are ignored if the callback
5791 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005792 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005794 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005796 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005797 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005798 }
5799
Walter Dörwald41980ca2007-08-16 21:55:45 +00005800 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005802
Walter Dörwald41980ca2007-08-16 21:55:45 +00005803 Py_XDECREF(errorHandler);
5804 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005805 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005806
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005808 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005809 Py_XDECREF(errorHandler);
5810 Py_XDECREF(exc);
5811 return NULL;
5812}
5813
5814PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005815_PyUnicode_EncodeUTF32(PyObject *str,
5816 const char *errors,
5817 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005818{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005819 enum PyUnicode_Kind kind;
5820 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005821 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005822 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005823 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005824#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005825 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005826#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005827 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005828#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005829 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005830 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005831 PyObject *errorHandler = NULL;
5832 PyObject *exc = NULL;
5833 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005834
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005835 if (!PyUnicode_Check(str)) {
5836 PyErr_BadArgument();
5837 return NULL;
5838 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005839 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005840 return NULL;
5841 kind = PyUnicode_KIND(str);
5842 data = PyUnicode_DATA(str);
5843 len = PyUnicode_GET_LENGTH(str);
5844
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005845 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005846 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005847 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005848 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005849 if (v == NULL)
5850 return NULL;
5851
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005852 /* output buffer is 4-bytes aligned */
5853 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005854 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005855 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005856 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005857 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005858 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005859
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005860 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005861 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005862 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005863 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005864 else
5865 encoding = "utf-32";
5866
5867 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005868 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5869 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005870 }
5871
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005872 pos = 0;
5873 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005874 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005875
5876 if (kind == PyUnicode_2BYTE_KIND) {
5877 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5878 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005879 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005880 else {
5881 assert(kind == PyUnicode_4BYTE_KIND);
5882 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5883 &out, native_ordering);
5884 }
5885 if (pos == len)
5886 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005887
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005888 rep = unicode_encode_call_errorhandler(
5889 errors, &errorHandler,
5890 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005891 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005892 if (!rep)
5893 goto error;
5894
5895 if (PyBytes_Check(rep)) {
5896 repsize = PyBytes_GET_SIZE(rep);
5897 if (repsize & 3) {
5898 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005899 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005900 "surrogates not allowed");
5901 goto error;
5902 }
5903 moreunits = repsize / 4;
5904 }
5905 else {
5906 assert(PyUnicode_Check(rep));
5907 if (PyUnicode_READY(rep) < 0)
5908 goto error;
5909 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5910 if (!PyUnicode_IS_ASCII(rep)) {
5911 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005912 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005913 "surrogates not allowed");
5914 goto error;
5915 }
5916 }
5917
5918 /* four bytes are reserved for each surrogate */
5919 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005920 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005921 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005922 /* integer overflow */
5923 PyErr_NoMemory();
5924 goto error;
5925 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005926 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005927 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005928 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005929 }
5930
5931 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005932 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005933 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005934 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005935 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005936 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5937 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005938 }
5939
5940 Py_CLEAR(rep);
5941 }
5942
5943 /* Cut back to size actually needed. This is necessary for, for example,
5944 encoding of a string containing isolated surrogates and the 'ignore'
5945 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005946 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005947 if (nsize != PyBytes_GET_SIZE(v))
5948 _PyBytes_Resize(&v, nsize);
5949 Py_XDECREF(errorHandler);
5950 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005951 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005952 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005953 error:
5954 Py_XDECREF(rep);
5955 Py_XDECREF(errorHandler);
5956 Py_XDECREF(exc);
5957 Py_XDECREF(v);
5958 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005959}
5960
Alexander Belopolsky40018472011-02-26 01:02:56 +00005961PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005962PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5963 Py_ssize_t size,
5964 const char *errors,
5965 int byteorder)
5966{
5967 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005968 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005969 if (tmp == NULL)
5970 return NULL;
5971 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5972 Py_DECREF(tmp);
5973 return result;
5974}
5975
5976PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005977PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005978{
Victor Stinnerb960b342011-11-20 19:12:52 +01005979 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005980}
5981
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982/* --- UTF-16 Codec ------------------------------------------------------- */
5983
Tim Peters772747b2001-08-09 22:21:55 +00005984PyObject *
5985PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 Py_ssize_t size,
5987 const char *errors,
5988 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989{
Walter Dörwald69652032004-09-07 20:24:22 +00005990 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5991}
5992
5993PyObject *
5994PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 Py_ssize_t size,
5996 const char *errors,
5997 int *byteorder,
5998 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005999{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006001 Py_ssize_t startinpos;
6002 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006003 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006004 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00006005 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006006 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00006007 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006008 PyObject *errorHandler = NULL;
6009 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006010 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011
Andy Lestere6be9b52020-02-11 20:28:35 -06006012 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006013 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014
6015 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00006016 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006018 /* Check for BOM marks (U+FEFF) in the input and adjust current
6019 byte order setting accordingly. In native mode, the leading BOM
6020 mark is skipped, in all other modes, it is copied to the output
6021 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006022 if (bo == 0 && size >= 2) {
6023 const Py_UCS4 bom = (q[1] << 8) | q[0];
6024 if (bom == 0xFEFF) {
6025 q += 2;
6026 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006028 else if (bom == 0xFFFE) {
6029 q += 2;
6030 bo = 1;
6031 }
6032 if (byteorder)
6033 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006034 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035
Antoine Pitrou63065d72012-05-15 23:48:04 +02006036 if (q == e) {
6037 if (consumed)
6038 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006039 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006040 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006041
Christian Heimes743e0cd2012-10-17 23:52:17 +02006042#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006043 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006044 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006045#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006046 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006047 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006048#endif
Tim Peters772747b2001-08-09 22:21:55 +00006049
Antoine Pitrou63065d72012-05-15 23:48:04 +02006050 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006051 character count normally. Error handler will take care of
6052 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006053 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006054 writer.min_length = (e - q + 1) / 2;
6055 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006056 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006057
Antoine Pitrou63065d72012-05-15 23:48:04 +02006058 while (1) {
6059 Py_UCS4 ch = 0;
6060 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006061 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006062 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006063 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006064 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006065 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006066 native_ordering);
6067 else
6068 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006069 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006070 native_ordering);
6071 } else if (kind == PyUnicode_2BYTE_KIND) {
6072 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006073 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006074 native_ordering);
6075 } else {
6076 assert(kind == PyUnicode_4BYTE_KIND);
6077 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006078 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006079 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006080 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006081 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082
Antoine Pitrou63065d72012-05-15 23:48:04 +02006083 switch (ch)
6084 {
6085 case 0:
6086 /* remaining byte at the end? (size should be even) */
6087 if (q == e || consumed)
6088 goto End;
6089 errmsg = "truncated data";
6090 startinpos = ((const char *)q) - starts;
6091 endinpos = ((const char *)e) - starts;
6092 break;
6093 /* The remaining input chars are ignored if the callback
6094 chooses to skip the input */
6095 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006096 q -= 2;
6097 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006098 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006099 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006100 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006101 endinpos = ((const char *)e) - starts;
6102 break;
6103 case 2:
6104 errmsg = "illegal encoding";
6105 startinpos = ((const char *)q) - 2 - starts;
6106 endinpos = startinpos + 2;
6107 break;
6108 case 3:
6109 errmsg = "illegal UTF-16 surrogate";
6110 startinpos = ((const char *)q) - 4 - starts;
6111 endinpos = startinpos + 2;
6112 break;
6113 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006114 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006115 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006116 continue;
6117 }
6118
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006119 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006120 errors,
6121 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006122 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006123 &starts,
6124 (const char **)&e,
6125 &startinpos,
6126 &endinpos,
6127 &exc,
6128 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006129 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 }
6132
Antoine Pitrou63065d72012-05-15 23:48:04 +02006133End:
Walter Dörwald69652032004-09-07 20:24:22 +00006134 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006136
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006137 Py_XDECREF(errorHandler);
6138 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006139 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006142 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 Py_XDECREF(errorHandler);
6144 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 return NULL;
6146}
6147
Tim Peters772747b2001-08-09 22:21:55 +00006148PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006149_PyUnicode_EncodeUTF16(PyObject *str,
6150 const char *errors,
6151 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006153 enum PyUnicode_Kind kind;
6154 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006155 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006156 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006157 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006158 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006159#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006160 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006161#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006162 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006163#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006164 const char *encoding;
6165 Py_ssize_t nsize, pos;
6166 PyObject *errorHandler = NULL;
6167 PyObject *exc = NULL;
6168 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006169
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170 if (!PyUnicode_Check(str)) {
6171 PyErr_BadArgument();
6172 return NULL;
6173 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006174 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 return NULL;
6176 kind = PyUnicode_KIND(str);
6177 data = PyUnicode_DATA(str);
6178 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006181 if (kind == PyUnicode_4BYTE_KIND) {
6182 const Py_UCS4 *in = (const Py_UCS4 *)data;
6183 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006184 while (in < end) {
6185 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006186 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006187 }
6188 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006189 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006190 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006192 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006193 nsize = len + pairs + (byteorder == 0);
6194 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006195 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006199 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006200 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006201 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006202 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006203 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006204 }
6205 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006206 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006207 }
Tim Peters772747b2001-08-09 22:21:55 +00006208
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006209 if (kind == PyUnicode_1BYTE_KIND) {
6210 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6211 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006212 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006213
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006214 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006215 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006216 }
6217 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006218 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006219 }
6220 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006221 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006222 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006223
6224 pos = 0;
6225 while (pos < len) {
6226 Py_ssize_t repsize, moreunits;
6227
6228 if (kind == PyUnicode_2BYTE_KIND) {
6229 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6230 &out, native_ordering);
6231 }
6232 else {
6233 assert(kind == PyUnicode_4BYTE_KIND);
6234 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6235 &out, native_ordering);
6236 }
6237 if (pos == len)
6238 break;
6239
6240 rep = unicode_encode_call_errorhandler(
6241 errors, &errorHandler,
6242 encoding, "surrogates not allowed",
6243 str, &exc, pos, pos + 1, &pos);
6244 if (!rep)
6245 goto error;
6246
6247 if (PyBytes_Check(rep)) {
6248 repsize = PyBytes_GET_SIZE(rep);
6249 if (repsize & 1) {
6250 raise_encode_exception(&exc, encoding,
6251 str, pos - 1, pos,
6252 "surrogates not allowed");
6253 goto error;
6254 }
6255 moreunits = repsize / 2;
6256 }
6257 else {
6258 assert(PyUnicode_Check(rep));
6259 if (PyUnicode_READY(rep) < 0)
6260 goto error;
6261 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6262 if (!PyUnicode_IS_ASCII(rep)) {
6263 raise_encode_exception(&exc, encoding,
6264 str, pos - 1, pos,
6265 "surrogates not allowed");
6266 goto error;
6267 }
6268 }
6269
6270 /* two bytes are reserved for each surrogate */
6271 if (moreunits > 1) {
6272 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006273 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006274 /* integer overflow */
6275 PyErr_NoMemory();
6276 goto error;
6277 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006278 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006279 goto error;
6280 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6281 }
6282
6283 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006284 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006285 out += moreunits;
6286 } else /* rep is unicode */ {
6287 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6288 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6289 &out, native_ordering);
6290 }
6291
6292 Py_CLEAR(rep);
6293 }
6294
6295 /* Cut back to size actually needed. This is necessary for, for example,
6296 encoding of a string containing isolated surrogates and the 'ignore' handler
6297 is used. */
6298 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6299 if (nsize != PyBytes_GET_SIZE(v))
6300 _PyBytes_Resize(&v, nsize);
6301 Py_XDECREF(errorHandler);
6302 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006303 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006304 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006305 error:
6306 Py_XDECREF(rep);
6307 Py_XDECREF(errorHandler);
6308 Py_XDECREF(exc);
6309 Py_XDECREF(v);
6310 return NULL;
6311#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312}
6313
Alexander Belopolsky40018472011-02-26 01:02:56 +00006314PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006315PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6316 Py_ssize_t size,
6317 const char *errors,
6318 int byteorder)
6319{
6320 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006321 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006322 if (tmp == NULL)
6323 return NULL;
6324 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6325 Py_DECREF(tmp);
6326 return result;
6327}
6328
6329PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006330PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006332 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333}
6334
6335/* --- Unicode Escape Codec ----------------------------------------------- */
6336
Fredrik Lundh06d12682001-01-24 07:59:11 +00006337static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006338
Alexander Belopolsky40018472011-02-26 01:02:56 +00006339PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006340_PyUnicode_DecodeUnicodeEscape(const char *s,
6341 Py_ssize_t size,
6342 const char *errors,
6343 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006346 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006348 PyObject *errorHandler = NULL;
6349 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006350
Eric V. Smith42454af2016-10-31 09:22:08 -04006351 // so we can remember if we've seen an invalid escape char or not
6352 *first_invalid_escape = NULL;
6353
Victor Stinner62ec3312016-09-06 17:04:34 -07006354 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006355 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006356 }
6357 /* Escaped strings will always be longer than the resulting
6358 Unicode string, so we start with size here and then reduce the
6359 length after conversion to the true value.
6360 (but if the error callback returns a long replacement string
6361 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006362 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006363 writer.min_length = size;
6364 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6365 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006366 }
6367
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 end = s + size;
6369 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006370 unsigned char c = (unsigned char) *s++;
6371 Py_UCS4 ch;
6372 int count;
6373 Py_ssize_t startinpos;
6374 Py_ssize_t endinpos;
6375 const char *message;
6376
6377#define WRITE_ASCII_CHAR(ch) \
6378 do { \
6379 assert(ch <= 127); \
6380 assert(writer.pos < writer.size); \
6381 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6382 } while(0)
6383
6384#define WRITE_CHAR(ch) \
6385 do { \
6386 if (ch <= writer.maxchar) { \
6387 assert(writer.pos < writer.size); \
6388 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6389 } \
6390 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6391 goto onError; \
6392 } \
6393 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394
6395 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006396 if (c != '\\') {
6397 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 continue;
6399 }
6400
Victor Stinner62ec3312016-09-06 17:04:34 -07006401 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006403 if (s >= end) {
6404 message = "\\ at end of string";
6405 goto error;
6406 }
6407 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006408
Victor Stinner62ec3312016-09-06 17:04:34 -07006409 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006410 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006413 case '\n': continue;
6414 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6415 case '\'': WRITE_ASCII_CHAR('\''); continue;
6416 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6417 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006418 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6420 case 't': WRITE_ASCII_CHAR('\t'); continue;
6421 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6422 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006423 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006425 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 case '0': case '1': case '2': case '3':
6430 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006431 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006432 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 ch = (ch<<3) + *s++ - '0';
6434 if (s < end && '0' <= *s && *s <= '7') {
6435 ch = (ch<<3) + *s++ - '0';
6436 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006438 WRITE_CHAR(ch);
6439 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 /* hex escapes */
6442 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006444 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006445 message = "truncated \\xXX escape";
6446 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006450 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006451 message = "truncated \\uXXXX escape";
6452 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006455 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006456 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006457 message = "truncated \\UXXXXXXXX escape";
6458 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006459 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006460 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006461 ch <<= 4;
6462 if (c >= '0' && c <= '9') {
6463 ch += c - '0';
6464 }
6465 else if (c >= 'a' && c <= 'f') {
6466 ch += c - ('a' - 10);
6467 }
6468 else if (c >= 'A' && c <= 'F') {
6469 ch += c - ('A' - 10);
6470 }
6471 else {
6472 break;
6473 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006474 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006475 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006476 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006477 }
6478
6479 /* when we get here, ch is a 32-bit unicode character */
6480 if (ch > MAX_UNICODE) {
6481 message = "illegal Unicode character";
6482 goto error;
6483 }
6484
6485 WRITE_CHAR(ch);
6486 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006487
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006489 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006490 if (ucnhash_CAPI == NULL) {
6491 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006492 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6493 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006494 if (ucnhash_CAPI == NULL) {
6495 PyErr_SetString(
6496 PyExc_UnicodeError,
6497 "\\N escapes not supported (can't load unicodedata module)"
6498 );
6499 goto onError;
6500 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006501 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006502
6503 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006504 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006505 const char *start = ++s;
6506 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006507 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006508 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006509 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006510 namelen = s - start;
6511 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006512 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006513 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006514 ch = 0xffffffff; /* in case 'getcode' messes up */
6515 if (namelen <= INT_MAX &&
6516 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6517 &ch, 0)) {
6518 assert(ch <= MAX_UNICODE);
6519 WRITE_CHAR(ch);
6520 continue;
6521 }
6522 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006523 }
6524 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006525 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006526
6527 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006528 if (*first_invalid_escape == NULL) {
6529 *first_invalid_escape = s-1; /* Back up one char, since we've
6530 already incremented s. */
6531 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006532 WRITE_ASCII_CHAR('\\');
6533 WRITE_CHAR(c);
6534 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006536
6537 error:
6538 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006539 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006540 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006541 errors, &errorHandler,
6542 "unicodeescape", message,
6543 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006544 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006545 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006546 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006547 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006548
6549#undef WRITE_ASCII_CHAR
6550#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006552
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006553 Py_XDECREF(errorHandler);
6554 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006555 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006556
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006558 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006559 Py_XDECREF(errorHandler);
6560 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 return NULL;
6562}
6563
Eric V. Smith42454af2016-10-31 09:22:08 -04006564PyObject *
6565PyUnicode_DecodeUnicodeEscape(const char *s,
6566 Py_ssize_t size,
6567 const char *errors)
6568{
6569 const char *first_invalid_escape;
6570 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6571 &first_invalid_escape);
6572 if (result == NULL)
6573 return NULL;
6574 if (first_invalid_escape != NULL) {
6575 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6576 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006577 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006578 Py_DECREF(result);
6579 return NULL;
6580 }
6581 }
6582 return result;
6583}
6584
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006585/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586
Alexander Belopolsky40018472011-02-26 01:02:56 +00006587PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006588PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006590 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006591 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006593 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006594 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006595 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
Ezio Melottie7f90372012-10-05 03:33:31 +03006597 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006598 escape.
6599
Ezio Melottie7f90372012-10-05 03:33:31 +03006600 For UCS1 strings it's '\xxx', 4 bytes per source character.
6601 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6602 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006603 */
6604
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006605 if (!PyUnicode_Check(unicode)) {
6606 PyErr_BadArgument();
6607 return NULL;
6608 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006609 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006610 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006611 }
Victor Stinner358af132015-10-12 22:36:57 +02006612
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006613 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006614 if (len == 0) {
6615 return PyBytes_FromStringAndSize(NULL, 0);
6616 }
6617
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006618 kind = PyUnicode_KIND(unicode);
6619 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006620 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6621 bytes, and 1 byte characters 4. */
6622 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006623 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006624 return PyErr_NoMemory();
6625 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006626 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006627 if (repr == NULL) {
6628 return NULL;
6629 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006630
Victor Stinner62ec3312016-09-06 17:04:34 -07006631 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006632 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006633 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006634
Victor Stinner62ec3312016-09-06 17:04:34 -07006635 /* U+0000-U+00ff range */
6636 if (ch < 0x100) {
6637 if (ch >= ' ' && ch < 127) {
6638 if (ch != '\\') {
6639 /* Copy printable US ASCII as-is */
6640 *p++ = (char) ch;
6641 }
6642 /* Escape backslashes */
6643 else {
6644 *p++ = '\\';
6645 *p++ = '\\';
6646 }
6647 }
Victor Stinner358af132015-10-12 22:36:57 +02006648
Victor Stinner62ec3312016-09-06 17:04:34 -07006649 /* Map special whitespace to '\t', \n', '\r' */
6650 else if (ch == '\t') {
6651 *p++ = '\\';
6652 *p++ = 't';
6653 }
6654 else if (ch == '\n') {
6655 *p++ = '\\';
6656 *p++ = 'n';
6657 }
6658 else if (ch == '\r') {
6659 *p++ = '\\';
6660 *p++ = 'r';
6661 }
6662
6663 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6664 else {
6665 *p++ = '\\';
6666 *p++ = 'x';
6667 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6668 *p++ = Py_hexdigits[ch & 0x000F];
6669 }
Tim Petersced69f82003-09-16 20:30:58 +00006670 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006671 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006672 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673 *p++ = '\\';
6674 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006675 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6676 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6677 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6678 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006680 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6681 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006682
Victor Stinner62ec3312016-09-06 17:04:34 -07006683 /* Make sure that the first two digits are zero */
6684 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006685 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006686 *p++ = 'U';
6687 *p++ = '0';
6688 *p++ = '0';
6689 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6690 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6691 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6692 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6693 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6694 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697
Victor Stinner62ec3312016-09-06 17:04:34 -07006698 assert(p - PyBytes_AS_STRING(repr) > 0);
6699 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6700 return NULL;
6701 }
6702 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703}
6704
Alexander Belopolsky40018472011-02-26 01:02:56 +00006705PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006706PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6707 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006709 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006710 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006711 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006713 }
6714
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006715 result = PyUnicode_AsUnicodeEscapeString(tmp);
6716 Py_DECREF(tmp);
6717 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718}
6719
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6721
Alexander Belopolsky40018472011-02-26 01:02:56 +00006722PyObject *
6723PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006724 Py_ssize_t size,
6725 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006727 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006728 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730 PyObject *errorHandler = NULL;
6731 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006732
Victor Stinner62ec3312016-09-06 17:04:34 -07006733 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006734 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006735 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006736
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 /* Escaped strings will always be longer than the resulting
6738 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 length after conversion to the true value. (But decoding error
6740 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006741 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006742 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006743 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6744 goto onError;
6745 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006746
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 end = s + size;
6748 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006749 unsigned char c = (unsigned char) *s++;
6750 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006751 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006752 Py_ssize_t startinpos;
6753 Py_ssize_t endinpos;
6754 const char *message;
6755
6756#define WRITE_CHAR(ch) \
6757 do { \
6758 if (ch <= writer.maxchar) { \
6759 assert(writer.pos < writer.size); \
6760 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6761 } \
6762 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6763 goto onError; \
6764 } \
6765 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006768 if (c != '\\' || s >= end) {
6769 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006771 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006772
Victor Stinner62ec3312016-09-06 17:04:34 -07006773 c = (unsigned char) *s++;
6774 if (c == 'u') {
6775 count = 4;
6776 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006778 else if (c == 'U') {
6779 count = 8;
6780 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006781 }
6782 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006783 assert(writer.pos < writer.size);
6784 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6785 WRITE_CHAR(c);
6786 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006787 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006788 startinpos = s - starts - 2;
6789
6790 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6791 for (ch = 0; count && s < end; ++s, --count) {
6792 c = (unsigned char)*s;
6793 ch <<= 4;
6794 if (c >= '0' && c <= '9') {
6795 ch += c - '0';
6796 }
6797 else if (c >= 'a' && c <= 'f') {
6798 ch += c - ('a' - 10);
6799 }
6800 else if (c >= 'A' && c <= 'F') {
6801 ch += c - ('A' - 10);
6802 }
6803 else {
6804 break;
6805 }
6806 }
6807 if (!count) {
6808 if (ch <= MAX_UNICODE) {
6809 WRITE_CHAR(ch);
6810 continue;
6811 }
6812 message = "\\Uxxxxxxxx out of range";
6813 }
6814
6815 endinpos = s-starts;
6816 writer.min_length = end - s + writer.pos;
6817 if (unicode_decode_call_errorhandler_writer(
6818 errors, &errorHandler,
6819 "rawunicodeescape", message,
6820 &starts, &end, &startinpos, &endinpos, &exc, &s,
6821 &writer)) {
6822 goto onError;
6823 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006824 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006825
6826#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006828 Py_XDECREF(errorHandler);
6829 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006830 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006831
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006833 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006834 Py_XDECREF(errorHandler);
6835 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006837
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838}
6839
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006840
Alexander Belopolsky40018472011-02-26 01:02:56 +00006841PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006842PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843{
Victor Stinner62ec3312016-09-06 17:04:34 -07006844 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006846 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006847 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006848 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006849 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006851 if (!PyUnicode_Check(unicode)) {
6852 PyErr_BadArgument();
6853 return NULL;
6854 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006855 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006856 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006857 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006858 kind = PyUnicode_KIND(unicode);
6859 data = PyUnicode_DATA(unicode);
6860 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006861 if (kind == PyUnicode_1BYTE_KIND) {
6862 return PyBytes_FromStringAndSize(data, len);
6863 }
Victor Stinner0e368262011-11-10 20:12:49 +01006864
Victor Stinner62ec3312016-09-06 17:04:34 -07006865 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6866 bytes, and 1 byte characters 4. */
6867 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006868
Victor Stinner62ec3312016-09-06 17:04:34 -07006869 if (len > PY_SSIZE_T_MAX / expandsize) {
6870 return PyErr_NoMemory();
6871 }
6872 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6873 if (repr == NULL) {
6874 return NULL;
6875 }
6876 if (len == 0) {
6877 return repr;
6878 }
6879
6880 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006881 for (pos = 0; pos < len; pos++) {
6882 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006883
Victor Stinner62ec3312016-09-06 17:04:34 -07006884 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6885 if (ch < 0x100) {
6886 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006887 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006888 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006889 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 *p++ = '\\';
6891 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006892 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6893 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6894 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6895 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006897 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6898 else {
6899 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6900 *p++ = '\\';
6901 *p++ = 'U';
6902 *p++ = '0';
6903 *p++ = '0';
6904 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6905 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6906 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6907 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6908 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6909 *p++ = Py_hexdigits[ch & 15];
6910 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006912
Victor Stinner62ec3312016-09-06 17:04:34 -07006913 assert(p > PyBytes_AS_STRING(repr));
6914 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6915 return NULL;
6916 }
6917 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918}
6919
Alexander Belopolsky40018472011-02-26 01:02:56 +00006920PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006921PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6922 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006924 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006925 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006926 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006927 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006928 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6929 Py_DECREF(tmp);
6930 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931}
6932
6933/* --- Latin-1 Codec ------------------------------------------------------ */
6934
Alexander Belopolsky40018472011-02-26 01:02:56 +00006935PyObject *
6936PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006937 Py_ssize_t size,
6938 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006941 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942}
6943
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006944/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006945static void
6946make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006947 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006948 PyObject *unicode,
6949 Py_ssize_t startpos, Py_ssize_t endpos,
6950 const char *reason)
6951{
6952 if (*exceptionObject == NULL) {
6953 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006954 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006955 encoding, unicode, startpos, endpos, reason);
6956 }
6957 else {
6958 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6959 goto onError;
6960 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6961 goto onError;
6962 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6963 goto onError;
6964 return;
6965 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006966 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006967 }
6968}
6969
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006970/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006971static void
6972raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006973 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006974 PyObject *unicode,
6975 Py_ssize_t startpos, Py_ssize_t endpos,
6976 const char *reason)
6977{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006978 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006979 encoding, unicode, startpos, endpos, reason);
6980 if (*exceptionObject != NULL)
6981 PyCodec_StrictErrors(*exceptionObject);
6982}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006983
6984/* error handling callback helper:
6985 build arguments, call the callback and check the arguments,
6986 put the result into newpos and return the replacement string, which
6987 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006988static PyObject *
6989unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006990 PyObject **errorHandler,
6991 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006992 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006993 Py_ssize_t startpos, Py_ssize_t endpos,
6994 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006995{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006996 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006997 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006998 PyObject *restuple;
6999 PyObject *resunicode;
7000
7001 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007002 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007003 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007004 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007005 }
7006
Benjamin Petersonbac79492012-01-14 13:34:47 -05007007 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007008 return NULL;
7009 len = PyUnicode_GET_LENGTH(unicode);
7010
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007011 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007012 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007015
Petr Viktorinffd97532020-02-11 17:46:57 +01007016 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007017 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007019 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007020 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 Py_DECREF(restuple);
7022 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007023 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007024 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 &resunicode, newpos)) {
7026 Py_DECREF(restuple);
7027 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007028 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007029 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7030 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7031 Py_DECREF(restuple);
7032 return NULL;
7033 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007034 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007035 *newpos = len + *newpos;
7036 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007037 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 Py_DECREF(restuple);
7039 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007040 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007041 Py_INCREF(resunicode);
7042 Py_DECREF(restuple);
7043 return resunicode;
7044}
7045
Alexander Belopolsky40018472011-02-26 01:02:56 +00007046static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007047unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007048 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007049 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007050{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007051 /* input state */
7052 Py_ssize_t pos=0, size;
7053 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007054 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007055 /* pointer into the output */
7056 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007057 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7058 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007059 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007060 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007061 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007062 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007063 /* output object */
7064 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007065
Benjamin Petersonbac79492012-01-14 13:34:47 -05007066 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007067 return NULL;
7068 size = PyUnicode_GET_LENGTH(unicode);
7069 kind = PyUnicode_KIND(unicode);
7070 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007071 /* allocate enough for a simple encoding without
7072 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007073 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007074 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007075
7076 _PyBytesWriter_Init(&writer);
7077 str = _PyBytesWriter_Alloc(&writer, size);
7078 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007079 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007080
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007081 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007082 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007083
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007085 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007087 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007088 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007089 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007091 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007093 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007094 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007096
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007097 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007098 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007099
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007100 /* Only overallocate the buffer if it's not the last write */
7101 writer.overallocate = (collend < size);
7102
Benjamin Peterson29060642009-01-31 22:14:21 +00007103 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007104 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007105 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007106
7107 switch (error_handler) {
7108 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007109 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007110 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007111
7112 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007113 memset(str, '?', collend - collstart);
7114 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007115 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007116 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007117 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007118 break;
Victor Stinner50149202015-09-22 00:26:54 +02007119
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007120 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007121 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007122 writer.min_size -= (collend - collstart);
7123 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007124 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007125 if (str == NULL)
7126 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007127 pos = collend;
7128 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007129
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007130 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007131 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007132 writer.min_size -= (collend - collstart);
7133 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007134 unicode, collstart, collend);
7135 if (str == NULL)
7136 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007137 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 break;
Victor Stinner50149202015-09-22 00:26:54 +02007139
Victor Stinnerc3713e92015-09-29 12:32:13 +02007140 case _Py_ERROR_SURROGATEESCAPE:
7141 for (i = collstart; i < collend; ++i) {
7142 ch = PyUnicode_READ(kind, data, i);
7143 if (ch < 0xdc80 || 0xdcff < ch) {
7144 /* Not a UTF-8b surrogate */
7145 break;
7146 }
7147 *str++ = (char)(ch - 0xdc00);
7148 ++pos;
7149 }
7150 if (i >= collend)
7151 break;
7152 collstart = pos;
7153 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007154 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007155
Benjamin Peterson29060642009-01-31 22:14:21 +00007156 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007157 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7158 encoding, reason, unicode, &exc,
7159 collstart, collend, &newpos);
7160 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007161 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007162
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007163 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007164 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007165
Victor Stinner6bd525b2015-10-09 13:10:05 +02007166 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007167 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007168 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007169 PyBytes_AS_STRING(rep),
7170 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007171 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007172 else {
7173 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007174
Victor Stinner6bd525b2015-10-09 13:10:05 +02007175 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007177
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007178 if (limit == 256 ?
7179 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7180 !PyUnicode_IS_ASCII(rep))
7181 {
7182 /* Not all characters are smaller than limit */
7183 raise_encode_exception(&exc, encoding, unicode,
7184 collstart, collend, reason);
7185 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007187 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7188 str = _PyBytesWriter_WriteBytes(&writer, str,
7189 PyUnicode_DATA(rep),
7190 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007192 if (str == NULL)
7193 goto onError;
7194
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007195 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007196 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007197 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007198
7199 /* If overallocation was disabled, ensure that it was the last
7200 write. Otherwise, we missed an optimization */
7201 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007202 }
7203 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007204
Victor Stinner50149202015-09-22 00:26:54 +02007205 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007206 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007207 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007208
7209 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007210 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007211 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007212 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007213 Py_XDECREF(exc);
7214 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007215}
7216
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007217/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007218PyObject *
7219PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007220 Py_ssize_t size,
7221 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007223 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007224 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007225 if (unicode == NULL)
7226 return NULL;
7227 result = unicode_encode_ucs1(unicode, errors, 256);
7228 Py_DECREF(unicode);
7229 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230}
7231
Alexander Belopolsky40018472011-02-26 01:02:56 +00007232PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007233_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007234{
7235 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 PyErr_BadArgument();
7237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007239 if (PyUnicode_READY(unicode) == -1)
7240 return NULL;
7241 /* Fast path: if it is a one-byte string, construct
7242 bytes object directly. */
7243 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7244 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7245 PyUnicode_GET_LENGTH(unicode));
7246 /* Non-Latin-1 characters present. Defer to above function to
7247 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007248 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007249}
7250
7251PyObject*
7252PyUnicode_AsLatin1String(PyObject *unicode)
7253{
7254 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255}
7256
7257/* --- 7-bit ASCII Codec -------------------------------------------------- */
7258
Alexander Belopolsky40018472011-02-26 01:02:56 +00007259PyObject *
7260PyUnicode_DecodeASCII(const char *s,
7261 Py_ssize_t size,
7262 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007264 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007265 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007266 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007267 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007268 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007269
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007271 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007272
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007274 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007275 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007277
Inada Naoki770847a2019-06-24 12:30:24 +09007278 // Shortcut for simple case
7279 PyObject *u = PyUnicode_New(size, 127);
7280 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007281 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007282 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007283 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007284 if (outpos == size) {
7285 return u;
7286 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007287
Inada Naoki770847a2019-06-24 12:30:24 +09007288 _PyUnicodeWriter writer;
7289 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007290 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007291
Inada Naoki770847a2019-06-24 12:30:24 +09007292 s += outpos;
7293 int kind = writer.kind;
7294 void *data = writer.data;
7295 Py_ssize_t startinpos, endinpos;
7296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007297 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007298 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007299 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007300 PyUnicode_WRITE(kind, data, writer.pos, c);
7301 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007302 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007303 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007305
7306 /* byte outsize range 0x00..0x7f: call the error handler */
7307
7308 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007309 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007310
7311 switch (error_handler)
7312 {
7313 case _Py_ERROR_REPLACE:
7314 case _Py_ERROR_SURROGATEESCAPE:
7315 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007316 but we may switch to UCS2 at the first write */
7317 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7318 goto onError;
7319 kind = writer.kind;
7320 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007321
7322 if (error_handler == _Py_ERROR_REPLACE)
7323 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7324 else
7325 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7326 writer.pos++;
7327 ++s;
7328 break;
7329
7330 case _Py_ERROR_IGNORE:
7331 ++s;
7332 break;
7333
7334 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 startinpos = s-starts;
7336 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007337 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007338 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 "ascii", "ordinal not in range(128)",
7340 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007341 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007342 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007343 kind = writer.kind;
7344 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007347 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007348 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007349 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007350
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007352 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007353 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007354 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 return NULL;
7356}
7357
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007358/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007359PyObject *
7360PyUnicode_EncodeASCII(const Py_UNICODE *p,
7361 Py_ssize_t size,
7362 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007364 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007365 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007366 if (unicode == NULL)
7367 return NULL;
7368 result = unicode_encode_ucs1(unicode, errors, 128);
7369 Py_DECREF(unicode);
7370 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371}
7372
Alexander Belopolsky40018472011-02-26 01:02:56 +00007373PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007374_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375{
7376 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 PyErr_BadArgument();
7378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007380 if (PyUnicode_READY(unicode) == -1)
7381 return NULL;
7382 /* Fast path: if it is an ASCII-only string, construct bytes object
7383 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007384 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007385 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7386 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007387 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007388}
7389
7390PyObject *
7391PyUnicode_AsASCIIString(PyObject *unicode)
7392{
7393 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394}
7395
Steve Dowercc16be82016-09-08 10:35:16 -07007396#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007397
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007398/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007399
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007400#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007401#define NEED_RETRY
7402#endif
7403
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
7408#define DECODING_CHUNK_SIZE (INT_MAX/4)
7409
Victor Stinner3a50e702011-10-18 21:21:00 +02007410#ifndef WC_ERR_INVALID_CHARS
7411# define WC_ERR_INVALID_CHARS 0x0080
7412#endif
7413
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007414static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007415code_page_name(UINT code_page, PyObject **obj)
7416{
7417 *obj = NULL;
7418 if (code_page == CP_ACP)
7419 return "mbcs";
7420 if (code_page == CP_UTF7)
7421 return "CP_UTF7";
7422 if (code_page == CP_UTF8)
7423 return "CP_UTF8";
7424
7425 *obj = PyBytes_FromFormat("cp%u", code_page);
7426 if (*obj == NULL)
7427 return NULL;
7428 return PyBytes_AS_STRING(*obj);
7429}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007430
Victor Stinner3a50e702011-10-18 21:21:00 +02007431static DWORD
7432decode_code_page_flags(UINT code_page)
7433{
7434 if (code_page == CP_UTF7) {
7435 /* The CP_UTF7 decoder only supports flags=0 */
7436 return 0;
7437 }
7438 else
7439 return MB_ERR_INVALID_CHARS;
7440}
7441
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007442/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 * Decode a byte string from a Windows code page into unicode object in strict
7444 * mode.
7445 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007446 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7447 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007449static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007450decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007451 wchar_t **buf,
7452 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 const char *in,
7454 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007455{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007456 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007457 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007458 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007459
7460 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007462 while ((outsize = MultiByteToWideChar(code_page, flags,
7463 in, insize, NULL, 0)) <= 0)
7464 {
7465 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7466 goto error;
7467 }
7468 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7469 flags = 0;
7470 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007471
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007472 /* Extend a wchar_t* buffer */
7473 Py_ssize_t n = *bufsize; /* Get the current length */
7474 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7475 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007476 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007477 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007478
7479 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7481 if (outsize <= 0)
7482 goto error;
7483 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007484
Victor Stinner3a50e702011-10-18 21:21:00 +02007485error:
7486 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7487 return -2;
7488 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007489 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007490}
7491
Victor Stinner3a50e702011-10-18 21:21:00 +02007492/*
7493 * Decode a byte string from a code page into unicode object with an error
7494 * handler.
7495 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007496 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 * UnicodeDecodeError exception and returns -1 on error.
7498 */
7499static int
7500decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007501 wchar_t **buf,
7502 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007503 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007504 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007505{
7506 const char *startin = in;
7507 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007508 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 /* Ideally, we should get reason from FormatMessage. This is the Windows
7510 2000 English version of the message. */
7511 const char *reason = "No mapping for the Unicode character exists "
7512 "in the target code page.";
7513 /* each step cannot decode more than 1 character, but a character can be
7514 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007515 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007516 int insize;
7517 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 PyObject *errorHandler = NULL;
7519 PyObject *exc = NULL;
7520 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007521 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007522 DWORD err;
7523 int ret = -1;
7524
7525 assert(size > 0);
7526
7527 encoding = code_page_name(code_page, &encoding_obj);
7528 if (encoding == NULL)
7529 return -1;
7530
Victor Stinner7d00cc12014-03-17 23:08:06 +01007531 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7533 UnicodeDecodeError. */
7534 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7535 if (exc != NULL) {
7536 PyCodec_StrictErrors(exc);
7537 Py_CLEAR(exc);
7538 }
7539 goto error;
7540 }
7541
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007542 /* Extend a wchar_t* buffer */
7543 Py_ssize_t n = *bufsize; /* Get the current length */
7544 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7545 PyErr_NoMemory();
7546 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007548 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7549 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007550 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007551 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007552
7553 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 while (in < endin)
7555 {
7556 /* Decode a character */
7557 insize = 1;
7558 do
7559 {
7560 outsize = MultiByteToWideChar(code_page, flags,
7561 in, insize,
7562 buffer, Py_ARRAY_LENGTH(buffer));
7563 if (outsize > 0)
7564 break;
7565 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007566 if (err == ERROR_INVALID_FLAGS && flags) {
7567 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7568 flags = 0;
7569 continue;
7570 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007571 if (err != ERROR_NO_UNICODE_TRANSLATION
7572 && err != ERROR_INSUFFICIENT_BUFFER)
7573 {
7574 PyErr_SetFromWindowsErr(0);
7575 goto error;
7576 }
7577 insize++;
7578 }
7579 /* 4=maximum length of a UTF-8 sequence */
7580 while (insize <= 4 && (in + insize) <= endin);
7581
7582 if (outsize <= 0) {
7583 Py_ssize_t startinpos, endinpos, outpos;
7584
Victor Stinner7d00cc12014-03-17 23:08:06 +01007585 /* last character in partial decode? */
7586 if (in + insize >= endin && !final)
7587 break;
7588
Victor Stinner3a50e702011-10-18 21:21:00 +02007589 startinpos = in - startin;
7590 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007591 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007592 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007593 errors, &errorHandler,
7594 encoding, reason,
7595 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007596 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007597 {
7598 goto error;
7599 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007600 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007601 }
7602 else {
7603 in += insize;
7604 memcpy(out, buffer, outsize * sizeof(wchar_t));
7605 out += outsize;
7606 }
7607 }
7608
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007609 /* Shrink the buffer */
7610 assert(out - *buf <= *bufsize);
7611 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007612 /* (in - startin) <= size and size is an int */
7613 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007614
7615error:
7616 Py_XDECREF(encoding_obj);
7617 Py_XDECREF(errorHandler);
7618 Py_XDECREF(exc);
7619 return ret;
7620}
7621
Victor Stinner3a50e702011-10-18 21:21:00 +02007622static PyObject *
7623decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007624 const char *s, Py_ssize_t size,
7625 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007626{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007627 wchar_t *buf = NULL;
7628 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007629 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007630
Victor Stinner3a50e702011-10-18 21:21:00 +02007631 if (code_page < 0) {
7632 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7633 return NULL;
7634 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007635 if (size < 0) {
7636 PyErr_BadInternalCall();
7637 return NULL;
7638 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007639
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007640 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007642
Victor Stinner76a31a62011-11-04 00:05:13 +01007643 do
7644 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007645#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007646 if (size > DECODING_CHUNK_SIZE) {
7647 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007648 final = 0;
7649 done = 0;
7650 }
7651 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007652#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007653 {
7654 chunk_size = (int)size;
7655 final = (consumed == NULL);
7656 done = 1;
7657 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007658
Victor Stinner76a31a62011-11-04 00:05:13 +01007659 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007660 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007661 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007662 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007663 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007664
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007665 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007666 s, chunk_size);
7667 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007668 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007669 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007670 errors, final);
7671 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007672
7673 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007674 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007675 return NULL;
7676 }
7677
7678 if (consumed)
7679 *consumed += converted;
7680
7681 s += converted;
7682 size -= converted;
7683 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007684
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007685 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7686 PyMem_Free(buf);
7687 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007688}
7689
Alexander Belopolsky40018472011-02-26 01:02:56 +00007690PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007691PyUnicode_DecodeCodePageStateful(int code_page,
7692 const char *s,
7693 Py_ssize_t size,
7694 const char *errors,
7695 Py_ssize_t *consumed)
7696{
7697 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7698}
7699
7700PyObject *
7701PyUnicode_DecodeMBCSStateful(const char *s,
7702 Py_ssize_t size,
7703 const char *errors,
7704 Py_ssize_t *consumed)
7705{
7706 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7707}
7708
7709PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007710PyUnicode_DecodeMBCS(const char *s,
7711 Py_ssize_t size,
7712 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007713{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007714 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7715}
7716
Victor Stinner3a50e702011-10-18 21:21:00 +02007717static DWORD
7718encode_code_page_flags(UINT code_page, const char *errors)
7719{
7720 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007721 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007722 }
7723 else if (code_page == CP_UTF7) {
7724 /* CP_UTF7 only supports flags=0 */
7725 return 0;
7726 }
7727 else {
7728 if (errors != NULL && strcmp(errors, "replace") == 0)
7729 return 0;
7730 else
7731 return WC_NO_BEST_FIT_CHARS;
7732 }
7733}
7734
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007735/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007736 * Encode a Unicode string to a Windows code page into a byte string in strict
7737 * mode.
7738 *
7739 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007740 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007741 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007742static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007743encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007744 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007745 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007746{
Victor Stinner554f3f02010-06-16 23:33:54 +00007747 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007748 BOOL *pusedDefaultChar = &usedDefaultChar;
7749 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007750 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007751 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007752 const DWORD flags = encode_code_page_flags(code_page, NULL);
7753 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007754 /* Create a substring so that we can get the UTF-16 representation
7755 of just the slice under consideration. */
7756 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007757
Martin v. Löwis3d325192011-11-04 18:23:06 +01007758 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007759
Victor Stinner3a50e702011-10-18 21:21:00 +02007760 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007761 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007762 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007763 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007764
Victor Stinner2fc507f2011-11-04 20:06:39 +01007765 substring = PyUnicode_Substring(unicode, offset, offset+len);
7766 if (substring == NULL)
7767 return -1;
7768 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7769 if (p == NULL) {
7770 Py_DECREF(substring);
7771 return -1;
7772 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007773 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007774
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007775 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007776 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007777 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007778 NULL, 0,
7779 NULL, pusedDefaultChar);
7780 if (outsize <= 0)
7781 goto error;
7782 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007783 if (pusedDefaultChar && *pusedDefaultChar) {
7784 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007785 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007786 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007787
Victor Stinner3a50e702011-10-18 21:21:00 +02007788 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007790 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007791 if (*outbytes == NULL) {
7792 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007794 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007795 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007796 }
7797 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007799 const Py_ssize_t n = PyBytes_Size(*outbytes);
7800 if (outsize > PY_SSIZE_T_MAX - n) {
7801 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007802 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007804 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007805 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7806 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007807 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007808 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007809 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007810 }
7811
7812 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007813 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007814 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007815 out, outsize,
7816 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007817 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007818 if (outsize <= 0)
7819 goto error;
7820 if (pusedDefaultChar && *pusedDefaultChar)
7821 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007822 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007823
Victor Stinner3a50e702011-10-18 21:21:00 +02007824error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007825 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007826 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7827 return -2;
7828 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007829 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007830}
7831
Victor Stinner3a50e702011-10-18 21:21:00 +02007832/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007833 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007834 * error handler.
7835 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007836 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007837 * -1 on other error.
7838 */
7839static int
7840encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007841 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007842 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007843{
Victor Stinner3a50e702011-10-18 21:21:00 +02007844 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007845 Py_ssize_t pos = unicode_offset;
7846 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007847 /* Ideally, we should get reason from FormatMessage. This is the Windows
7848 2000 English version of the message. */
7849 const char *reason = "invalid character";
7850 /* 4=maximum length of a UTF-8 sequence */
7851 char buffer[4];
7852 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7853 Py_ssize_t outsize;
7854 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007855 PyObject *errorHandler = NULL;
7856 PyObject *exc = NULL;
7857 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007858 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007859 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007860 PyObject *rep;
7861 int ret = -1;
7862
7863 assert(insize > 0);
7864
7865 encoding = code_page_name(code_page, &encoding_obj);
7866 if (encoding == NULL)
7867 return -1;
7868
7869 if (errors == NULL || strcmp(errors, "strict") == 0) {
7870 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7871 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007872 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007873 if (exc != NULL) {
7874 PyCodec_StrictErrors(exc);
7875 Py_DECREF(exc);
7876 }
7877 Py_XDECREF(encoding_obj);
7878 return -1;
7879 }
7880
7881 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7882 pusedDefaultChar = &usedDefaultChar;
7883 else
7884 pusedDefaultChar = NULL;
7885
7886 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7887 PyErr_NoMemory();
7888 goto error;
7889 }
7890 outsize = insize * Py_ARRAY_LENGTH(buffer);
7891
7892 if (*outbytes == NULL) {
7893 /* Create string object */
7894 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7895 if (*outbytes == NULL)
7896 goto error;
7897 out = PyBytes_AS_STRING(*outbytes);
7898 }
7899 else {
7900 /* Extend string object */
7901 Py_ssize_t n = PyBytes_Size(*outbytes);
7902 if (n > PY_SSIZE_T_MAX - outsize) {
7903 PyErr_NoMemory();
7904 goto error;
7905 }
7906 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7907 goto error;
7908 out = PyBytes_AS_STRING(*outbytes) + n;
7909 }
7910
7911 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007912 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007913 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007914 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7915 wchar_t chars[2];
7916 int charsize;
7917 if (ch < 0x10000) {
7918 chars[0] = (wchar_t)ch;
7919 charsize = 1;
7920 }
7921 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007922 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7923 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007924 charsize = 2;
7925 }
7926
Victor Stinner3a50e702011-10-18 21:21:00 +02007927 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007928 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007929 buffer, Py_ARRAY_LENGTH(buffer),
7930 NULL, pusedDefaultChar);
7931 if (outsize > 0) {
7932 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7933 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007934 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007935 memcpy(out, buffer, outsize);
7936 out += outsize;
7937 continue;
7938 }
7939 }
7940 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7941 PyErr_SetFromWindowsErr(0);
7942 goto error;
7943 }
7944
Victor Stinner3a50e702011-10-18 21:21:00 +02007945 rep = unicode_encode_call_errorhandler(
7946 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007947 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007948 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007949 if (rep == NULL)
7950 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007951 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007952
7953 if (PyBytes_Check(rep)) {
7954 outsize = PyBytes_GET_SIZE(rep);
7955 if (outsize != 1) {
7956 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7957 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7958 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7959 Py_DECREF(rep);
7960 goto error;
7961 }
7962 out = PyBytes_AS_STRING(*outbytes) + offset;
7963 }
7964 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7965 out += outsize;
7966 }
7967 else {
7968 Py_ssize_t i;
7969 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007970 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007971
Benjamin Petersonbac79492012-01-14 13:34:47 -05007972 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007973 Py_DECREF(rep);
7974 goto error;
7975 }
7976
7977 outsize = PyUnicode_GET_LENGTH(rep);
7978 if (outsize != 1) {
7979 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7980 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7981 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7982 Py_DECREF(rep);
7983 goto error;
7984 }
7985 out = PyBytes_AS_STRING(*outbytes) + offset;
7986 }
7987 kind = PyUnicode_KIND(rep);
7988 data = PyUnicode_DATA(rep);
7989 for (i=0; i < outsize; i++) {
7990 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7991 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007992 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007993 encoding, unicode,
7994 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007995 "unable to encode error handler result to ASCII");
7996 Py_DECREF(rep);
7997 goto error;
7998 }
7999 *out = (unsigned char)ch;
8000 out++;
8001 }
8002 }
8003 Py_DECREF(rep);
8004 }
8005 /* write a NUL byte */
8006 *out = 0;
8007 outsize = out - PyBytes_AS_STRING(*outbytes);
8008 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8009 if (_PyBytes_Resize(outbytes, outsize) < 0)
8010 goto error;
8011 ret = 0;
8012
8013error:
8014 Py_XDECREF(encoding_obj);
8015 Py_XDECREF(errorHandler);
8016 Py_XDECREF(exc);
8017 return ret;
8018}
8019
Victor Stinner3a50e702011-10-18 21:21:00 +02008020static PyObject *
8021encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008022 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008023 const char *errors)
8024{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008025 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008026 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008027 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008028 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008029
Victor Stinner29dacf22015-01-26 16:41:32 +01008030 if (!PyUnicode_Check(unicode)) {
8031 PyErr_BadArgument();
8032 return NULL;
8033 }
8034
Benjamin Petersonbac79492012-01-14 13:34:47 -05008035 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008036 return NULL;
8037 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008038
Victor Stinner3a50e702011-10-18 21:21:00 +02008039 if (code_page < 0) {
8040 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8041 return NULL;
8042 }
8043
Martin v. Löwis3d325192011-11-04 18:23:06 +01008044 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008045 return PyBytes_FromStringAndSize(NULL, 0);
8046
Victor Stinner7581cef2011-11-03 22:32:33 +01008047 offset = 0;
8048 do
8049 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008050#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008051 if (len > DECODING_CHUNK_SIZE) {
8052 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008053 done = 0;
8054 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008055 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008056#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008057 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008058 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008059 done = 1;
8060 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008061
Victor Stinner76a31a62011-11-04 00:05:13 +01008062 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008063 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008064 errors);
8065 if (ret == -2)
8066 ret = encode_code_page_errors(code_page, &outbytes,
8067 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008068 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008069 if (ret < 0) {
8070 Py_XDECREF(outbytes);
8071 return NULL;
8072 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008073
Victor Stinner7581cef2011-11-03 22:32:33 +01008074 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008075 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008076 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008077
Victor Stinner3a50e702011-10-18 21:21:00 +02008078 return outbytes;
8079}
8080
8081PyObject *
8082PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8083 Py_ssize_t size,
8084 const char *errors)
8085{
Victor Stinner7581cef2011-11-03 22:32:33 +01008086 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008087 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008088 if (unicode == NULL)
8089 return NULL;
8090 res = encode_code_page(CP_ACP, unicode, errors);
8091 Py_DECREF(unicode);
8092 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008093}
8094
8095PyObject *
8096PyUnicode_EncodeCodePage(int code_page,
8097 PyObject *unicode,
8098 const char *errors)
8099{
Victor Stinner7581cef2011-11-03 22:32:33 +01008100 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008101}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008102
Alexander Belopolsky40018472011-02-26 01:02:56 +00008103PyObject *
8104PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008105{
Victor Stinner7581cef2011-11-03 22:32:33 +01008106 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008107}
8108
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008109#undef NEED_RETRY
8110
Steve Dowercc16be82016-09-08 10:35:16 -07008111#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008112
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113/* --- Character Mapping Codec -------------------------------------------- */
8114
Victor Stinnerfb161b12013-04-18 01:44:27 +02008115static int
8116charmap_decode_string(const char *s,
8117 Py_ssize_t size,
8118 PyObject *mapping,
8119 const char *errors,
8120 _PyUnicodeWriter *writer)
8121{
8122 const char *starts = s;
8123 const char *e;
8124 Py_ssize_t startinpos, endinpos;
8125 PyObject *errorHandler = NULL, *exc = NULL;
8126 Py_ssize_t maplen;
8127 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008128 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008129 Py_UCS4 x;
8130 unsigned char ch;
8131
8132 if (PyUnicode_READY(mapping) == -1)
8133 return -1;
8134
8135 maplen = PyUnicode_GET_LENGTH(mapping);
8136 mapdata = PyUnicode_DATA(mapping);
8137 mapkind = PyUnicode_KIND(mapping);
8138
8139 e = s + size;
8140
8141 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8142 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8143 * is disabled in encoding aliases, latin1 is preferred because
8144 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008145 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008146 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8147 Py_UCS4 maxchar = writer->maxchar;
8148
8149 assert (writer->kind == PyUnicode_1BYTE_KIND);
8150 while (s < e) {
8151 ch = *s;
8152 x = mapdata_ucs1[ch];
8153 if (x > maxchar) {
8154 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8155 goto onError;
8156 maxchar = writer->maxchar;
8157 outdata = (Py_UCS1 *)writer->data;
8158 }
8159 outdata[writer->pos] = x;
8160 writer->pos++;
8161 ++s;
8162 }
8163 return 0;
8164 }
8165
8166 while (s < e) {
8167 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8168 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008169 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008170 if (outkind == PyUnicode_1BYTE_KIND) {
8171 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8172 Py_UCS4 maxchar = writer->maxchar;
8173 while (s < e) {
8174 ch = *s;
8175 x = mapdata_ucs2[ch];
8176 if (x > maxchar)
8177 goto Error;
8178 outdata[writer->pos] = x;
8179 writer->pos++;
8180 ++s;
8181 }
8182 break;
8183 }
8184 else if (outkind == PyUnicode_2BYTE_KIND) {
8185 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8186 while (s < e) {
8187 ch = *s;
8188 x = mapdata_ucs2[ch];
8189 if (x == 0xFFFE)
8190 goto Error;
8191 outdata[writer->pos] = x;
8192 writer->pos++;
8193 ++s;
8194 }
8195 break;
8196 }
8197 }
8198 ch = *s;
8199
8200 if (ch < maplen)
8201 x = PyUnicode_READ(mapkind, mapdata, ch);
8202 else
8203 x = 0xfffe; /* invalid value */
8204Error:
8205 if (x == 0xfffe)
8206 {
8207 /* undefined mapping */
8208 startinpos = s-starts;
8209 endinpos = startinpos+1;
8210 if (unicode_decode_call_errorhandler_writer(
8211 errors, &errorHandler,
8212 "charmap", "character maps to <undefined>",
8213 &starts, &e, &startinpos, &endinpos, &exc, &s,
8214 writer)) {
8215 goto onError;
8216 }
8217 continue;
8218 }
8219
8220 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8221 goto onError;
8222 ++s;
8223 }
8224 Py_XDECREF(errorHandler);
8225 Py_XDECREF(exc);
8226 return 0;
8227
8228onError:
8229 Py_XDECREF(errorHandler);
8230 Py_XDECREF(exc);
8231 return -1;
8232}
8233
8234static int
8235charmap_decode_mapping(const char *s,
8236 Py_ssize_t size,
8237 PyObject *mapping,
8238 const char *errors,
8239 _PyUnicodeWriter *writer)
8240{
8241 const char *starts = s;
8242 const char *e;
8243 Py_ssize_t startinpos, endinpos;
8244 PyObject *errorHandler = NULL, *exc = NULL;
8245 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008246 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008247
8248 e = s + size;
8249
8250 while (s < e) {
8251 ch = *s;
8252
8253 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8254 key = PyLong_FromLong((long)ch);
8255 if (key == NULL)
8256 goto onError;
8257
8258 item = PyObject_GetItem(mapping, key);
8259 Py_DECREF(key);
8260 if (item == NULL) {
8261 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8262 /* No mapping found means: mapping is undefined. */
8263 PyErr_Clear();
8264 goto Undefined;
8265 } else
8266 goto onError;
8267 }
8268
8269 /* Apply mapping */
8270 if (item == Py_None)
8271 goto Undefined;
8272 if (PyLong_Check(item)) {
8273 long value = PyLong_AS_LONG(item);
8274 if (value == 0xFFFE)
8275 goto Undefined;
8276 if (value < 0 || value > MAX_UNICODE) {
8277 PyErr_Format(PyExc_TypeError,
8278 "character mapping must be in range(0x%lx)",
8279 (unsigned long)MAX_UNICODE + 1);
8280 goto onError;
8281 }
8282
8283 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8284 goto onError;
8285 }
8286 else if (PyUnicode_Check(item)) {
8287 if (PyUnicode_READY(item) == -1)
8288 goto onError;
8289 if (PyUnicode_GET_LENGTH(item) == 1) {
8290 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8291 if (value == 0xFFFE)
8292 goto Undefined;
8293 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8294 goto onError;
8295 }
8296 else {
8297 writer->overallocate = 1;
8298 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8299 goto onError;
8300 }
8301 }
8302 else {
8303 /* wrong return value */
8304 PyErr_SetString(PyExc_TypeError,
8305 "character mapping must return integer, None or str");
8306 goto onError;
8307 }
8308 Py_CLEAR(item);
8309 ++s;
8310 continue;
8311
8312Undefined:
8313 /* undefined mapping */
8314 Py_CLEAR(item);
8315 startinpos = s-starts;
8316 endinpos = startinpos+1;
8317 if (unicode_decode_call_errorhandler_writer(
8318 errors, &errorHandler,
8319 "charmap", "character maps to <undefined>",
8320 &starts, &e, &startinpos, &endinpos, &exc, &s,
8321 writer)) {
8322 goto onError;
8323 }
8324 }
8325 Py_XDECREF(errorHandler);
8326 Py_XDECREF(exc);
8327 return 0;
8328
8329onError:
8330 Py_XDECREF(item);
8331 Py_XDECREF(errorHandler);
8332 Py_XDECREF(exc);
8333 return -1;
8334}
8335
Alexander Belopolsky40018472011-02-26 01:02:56 +00008336PyObject *
8337PyUnicode_DecodeCharmap(const char *s,
8338 Py_ssize_t size,
8339 PyObject *mapping,
8340 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008342 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008343
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344 /* Default to Latin-1 */
8345 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008349 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008350 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008351 writer.min_length = size;
8352 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008354
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008355 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008356 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8357 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008358 }
8359 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008360 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8361 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008363 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008364
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008366 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367 return NULL;
8368}
8369
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008370/* Charmap encoding: the lookup table */
8371
Alexander Belopolsky40018472011-02-26 01:02:56 +00008372struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 PyObject_HEAD
8374 unsigned char level1[32];
8375 int count2, count3;
8376 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008377};
8378
8379static PyObject*
8380encoding_map_size(PyObject *obj, PyObject* args)
8381{
8382 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008383 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008385}
8386
8387static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008388 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 PyDoc_STR("Return the size (in bytes) of this object") },
8390 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008391};
8392
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008393static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008394 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 "EncodingMap", /*tp_name*/
8396 sizeof(struct encoding_map), /*tp_basicsize*/
8397 0, /*tp_itemsize*/
8398 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008399 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008400 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 0, /*tp_getattr*/
8402 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008403 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 0, /*tp_repr*/
8405 0, /*tp_as_number*/
8406 0, /*tp_as_sequence*/
8407 0, /*tp_as_mapping*/
8408 0, /*tp_hash*/
8409 0, /*tp_call*/
8410 0, /*tp_str*/
8411 0, /*tp_getattro*/
8412 0, /*tp_setattro*/
8413 0, /*tp_as_buffer*/
8414 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8415 0, /*tp_doc*/
8416 0, /*tp_traverse*/
8417 0, /*tp_clear*/
8418 0, /*tp_richcompare*/
8419 0, /*tp_weaklistoffset*/
8420 0, /*tp_iter*/
8421 0, /*tp_iternext*/
8422 encoding_map_methods, /*tp_methods*/
8423 0, /*tp_members*/
8424 0, /*tp_getset*/
8425 0, /*tp_base*/
8426 0, /*tp_dict*/
8427 0, /*tp_descr_get*/
8428 0, /*tp_descr_set*/
8429 0, /*tp_dictoffset*/
8430 0, /*tp_init*/
8431 0, /*tp_alloc*/
8432 0, /*tp_new*/
8433 0, /*tp_free*/
8434 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008435};
8436
8437PyObject*
8438PyUnicode_BuildEncodingMap(PyObject* string)
8439{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008440 PyObject *result;
8441 struct encoding_map *mresult;
8442 int i;
8443 int need_dict = 0;
8444 unsigned char level1[32];
8445 unsigned char level2[512];
8446 unsigned char *mlevel1, *mlevel2, *mlevel3;
8447 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008449 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008450 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008452
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008453 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008454 PyErr_BadArgument();
8455 return NULL;
8456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 kind = PyUnicode_KIND(string);
8458 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008459 length = PyUnicode_GET_LENGTH(string);
8460 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008461 memset(level1, 0xFF, sizeof level1);
8462 memset(level2, 0xFF, sizeof level2);
8463
8464 /* If there isn't a one-to-one mapping of NULL to \0,
8465 or if there are non-BMP characters, we need to use
8466 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008468 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008469 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008470 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 ch = PyUnicode_READ(kind, data, i);
8472 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008473 need_dict = 1;
8474 break;
8475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008476 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008477 /* unmapped character */
8478 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008479 l1 = ch >> 11;
8480 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008481 if (level1[l1] == 0xFF)
8482 level1[l1] = count2++;
8483 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008484 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008485 }
8486
8487 if (count2 >= 0xFF || count3 >= 0xFF)
8488 need_dict = 1;
8489
8490 if (need_dict) {
8491 PyObject *result = PyDict_New();
8492 PyObject *key, *value;
8493 if (!result)
8494 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008495 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008497 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008498 if (!key || !value)
8499 goto failed1;
8500 if (PyDict_SetItem(result, key, value) == -1)
8501 goto failed1;
8502 Py_DECREF(key);
8503 Py_DECREF(value);
8504 }
8505 return result;
8506 failed1:
8507 Py_XDECREF(key);
8508 Py_XDECREF(value);
8509 Py_DECREF(result);
8510 return NULL;
8511 }
8512
8513 /* Create a three-level trie */
8514 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8515 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008516 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008517 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008518 }
8519
8520 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008521 mresult = (struct encoding_map*)result;
8522 mresult->count2 = count2;
8523 mresult->count3 = count3;
8524 mlevel1 = mresult->level1;
8525 mlevel2 = mresult->level23;
8526 mlevel3 = mresult->level23 + 16*count2;
8527 memcpy(mlevel1, level1, 32);
8528 memset(mlevel2, 0xFF, 16*count2);
8529 memset(mlevel3, 0, 128*count3);
8530 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008531 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008532 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008533 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8534 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008535 /* unmapped character */
8536 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008537 o1 = ch>>11;
8538 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008539 i2 = 16*mlevel1[o1] + o2;
8540 if (mlevel2[i2] == 0xFF)
8541 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008542 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008543 i3 = 128*mlevel2[i2] + o3;
8544 mlevel3[i3] = i;
8545 }
8546 return result;
8547}
8548
8549static int
Victor Stinner22168992011-11-20 17:09:18 +01008550encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008551{
8552 struct encoding_map *map = (struct encoding_map*)mapping;
8553 int l1 = c>>11;
8554 int l2 = (c>>7) & 0xF;
8555 int l3 = c & 0x7F;
8556 int i;
8557
Victor Stinner22168992011-11-20 17:09:18 +01008558 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008560 if (c == 0)
8561 return 0;
8562 /* level 1*/
8563 i = map->level1[l1];
8564 if (i == 0xFF) {
8565 return -1;
8566 }
8567 /* level 2*/
8568 i = map->level23[16*i+l2];
8569 if (i == 0xFF) {
8570 return -1;
8571 }
8572 /* level 3 */
8573 i = map->level23[16*map->count2 + 128*i + l3];
8574 if (i == 0) {
8575 return -1;
8576 }
8577 return i;
8578}
8579
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008580/* Lookup the character ch in the mapping. If the character
8581 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008582 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008583static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008584charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585{
Christian Heimes217cfd12007-12-02 14:31:20 +00008586 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 PyObject *x;
8588
8589 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008591 x = PyObject_GetItem(mapping, w);
8592 Py_DECREF(w);
8593 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8595 /* No mapping found means: mapping is undefined. */
8596 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008597 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 } else
8599 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008601 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008603 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 long value = PyLong_AS_LONG(x);
8605 if (value < 0 || value > 255) {
8606 PyErr_SetString(PyExc_TypeError,
8607 "character mapping must be in range(256)");
8608 Py_DECREF(x);
8609 return NULL;
8610 }
8611 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008613 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 /* wrong return value */
8617 PyErr_Format(PyExc_TypeError,
8618 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008619 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 Py_DECREF(x);
8621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 }
8623}
8624
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008625static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008626charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008627{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008628 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8629 /* exponentially overallocate to minimize reallocations */
8630 if (requiredsize < 2*outsize)
8631 requiredsize = 2*outsize;
8632 if (_PyBytes_Resize(outobj, requiredsize))
8633 return -1;
8634 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008635}
8636
/* Outcome of trying to encode a single character. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008640/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008641 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642 space is available. Return a new reference to the object that
8643 was put in the output buffer, or Py_None, if the mapping was undefined
8644 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008645 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008646static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008647charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008648 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008650 PyObject *rep;
8651 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008652 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653
Andy Lesterdffe4c02020-03-04 07:15:20 -06008654 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008655 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008657 if (res == -1)
8658 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 if (outsize<requiredsize)
8660 if (charmapencode_resize(outobj, outpos, requiredsize))
8661 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008662 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 outstart[(*outpos)++] = (char)res;
8664 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008665 }
8666
8667 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008670 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 Py_DECREF(rep);
8672 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008673 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 if (PyLong_Check(rep)) {
8675 Py_ssize_t requiredsize = *outpos+1;
8676 if (outsize<requiredsize)
8677 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8678 Py_DECREF(rep);
8679 return enc_EXCEPTION;
8680 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008681 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008683 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 else {
8685 const char *repchars = PyBytes_AS_STRING(rep);
8686 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8687 Py_ssize_t requiredsize = *outpos+repsize;
8688 if (outsize<requiredsize)
8689 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8690 Py_DECREF(rep);
8691 return enc_EXCEPTION;
8692 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008693 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 memcpy(outstart + *outpos, repchars, repsize);
8695 *outpos += repsize;
8696 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008698 Py_DECREF(rep);
8699 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700}
8701
8702/* handle an error in PyUnicode_EncodeCharmap
8703 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008704static int
8705charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008706 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008708 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008709 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710{
8711 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008712 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008713 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008714 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008715 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008716 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008718 Py_ssize_t collstartpos = *inpos;
8719 Py_ssize_t collendpos = *inpos+1;
8720 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008721 const char *encoding = "charmap";
8722 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008723 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008724 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008725 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008726
Benjamin Petersonbac79492012-01-14 13:34:47 -05008727 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008728 return -1;
8729 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730 /* find all unencodable characters */
8731 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008732 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008733 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008734 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008735 val = encoding_map_lookup(ch, mapping);
8736 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 break;
8738 ++collendpos;
8739 continue;
8740 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008741
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008742 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8743 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 if (rep==NULL)
8745 return -1;
8746 else if (rep!=Py_None) {
8747 Py_DECREF(rep);
8748 break;
8749 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008750 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008751 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008752 }
8753 /* cache callback name lookup
8754 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008755 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008756 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008757
8758 switch (*error_handler) {
8759 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008760 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008761 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008762
8763 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008764 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 x = charmapencode_output('?', mapping, res, respos);
8766 if (x==enc_EXCEPTION) {
8767 return -1;
8768 }
8769 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008770 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 return -1;
8772 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008773 }
8774 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008775 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008776 *inpos = collendpos;
8777 break;
Victor Stinner50149202015-09-22 00:26:54 +02008778
8779 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008780 /* generate replacement (temporarily (mis)uses p) */
8781 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 char buffer[2+29+1+1];
8783 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008784 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 for (cp = buffer; *cp; ++cp) {
8786 x = charmapencode_output(*cp, mapping, res, respos);
8787 if (x==enc_EXCEPTION)
8788 return -1;
8789 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008790 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 return -1;
8792 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008793 }
8794 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008795 *inpos = collendpos;
8796 break;
Victor Stinner50149202015-09-22 00:26:54 +02008797
Benjamin Peterson14339b62009-01-31 16:36:08 +00008798 default:
Victor Stinner50149202015-09-22 00:26:54 +02008799 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008800 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008802 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008804 if (PyBytes_Check(repunicode)) {
8805 /* Directly copy bytes result to output. */
8806 Py_ssize_t outsize = PyBytes_Size(*res);
8807 Py_ssize_t requiredsize;
8808 repsize = PyBytes_Size(repunicode);
8809 requiredsize = *respos + repsize;
8810 if (requiredsize > outsize)
8811 /* Make room for all additional bytes. */
8812 if (charmapencode_resize(res, respos, requiredsize)) {
8813 Py_DECREF(repunicode);
8814 return -1;
8815 }
8816 memcpy(PyBytes_AsString(*res) + *respos,
8817 PyBytes_AsString(repunicode), repsize);
8818 *respos += repsize;
8819 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008820 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008821 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008822 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008823 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008824 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008825 Py_DECREF(repunicode);
8826 return -1;
8827 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008828 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008829 data = PyUnicode_DATA(repunicode);
8830 kind = PyUnicode_KIND(repunicode);
8831 for (index = 0; index < repsize; index++) {
8832 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8833 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008835 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 return -1;
8837 }
8838 else if (x==enc_FAILED) {
8839 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008840 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 return -1;
8842 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008843 }
8844 *inpos = newpos;
8845 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008846 }
8847 return 0;
8848}
8849
Alexander Belopolsky40018472011-02-26 01:02:56 +00008850PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008851_PyUnicode_EncodeCharmap(PyObject *unicode,
8852 PyObject *mapping,
8853 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008855 /* output object */
8856 PyObject *res = NULL;
8857 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008858 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008859 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008860 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008861 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008862 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008863 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008864 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008865 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008866 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867
Benjamin Petersonbac79492012-01-14 13:34:47 -05008868 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008869 return NULL;
8870 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008871 data = PyUnicode_DATA(unicode);
8872 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008873
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 /* Default to Latin-1 */
8875 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008876 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008878 /* allocate enough for a simple encoding without
8879 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008880 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008881 if (res == NULL)
8882 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008883 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008886 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008887 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008889 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 if (x==enc_EXCEPTION) /* error */
8891 goto onError;
8892 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008893 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008895 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008896 &res, &respos)) {
8897 goto onError;
8898 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008899 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 else
8901 /* done with this character => adjust input position */
8902 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008905 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008906 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008907 if (_PyBytes_Resize(&res, respos) < 0)
8908 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008909
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008910 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008911 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008912 return res;
8913
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008915 Py_XDECREF(res);
8916 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008917 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918 return NULL;
8919}
8920
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008921/* Deprecated */
8922PyObject *
8923PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8924 Py_ssize_t size,
8925 PyObject *mapping,
8926 const char *errors)
8927{
8928 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008929 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008930 if (unicode == NULL)
8931 return NULL;
8932 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8933 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008934 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008935}
8936
Alexander Belopolsky40018472011-02-26 01:02:56 +00008937PyObject *
8938PyUnicode_AsCharmapString(PyObject *unicode,
8939 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940{
8941 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 PyErr_BadArgument();
8943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008945 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946}
8947
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008948/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008949static void
8950make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008952 Py_ssize_t startpos, Py_ssize_t endpos,
8953 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008955 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 *exceptionObject = _PyUnicodeTranslateError_Create(
8957 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958 }
8959 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8961 goto onError;
8962 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8963 goto onError;
8964 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8965 goto onError;
8966 return;
8967 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008968 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 }
8970}
8971
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008972/* error handling callback helper:
8973 build arguments, call the callback and check the arguments,
8974 put the result into newpos and return the replacement string, which
8975 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008976static PyObject *
8977unicode_translate_call_errorhandler(const char *errors,
8978 PyObject **errorHandler,
8979 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008981 Py_ssize_t startpos, Py_ssize_t endpos,
8982 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008983{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008984 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008985
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008986 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008987 PyObject *restuple;
8988 PyObject *resunicode;
8989
8990 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008992 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008993 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008994 }
8995
8996 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008998 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009000
Petr Viktorinffd97532020-02-11 17:46:57 +01009001 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009002 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009004 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009005 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 Py_DECREF(restuple);
9007 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009008 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009009 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 &resunicode, &i_newpos)) {
9011 Py_DECREF(restuple);
9012 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009013 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009014 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009015 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009016 else
9017 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009019 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 Py_DECREF(restuple);
9021 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009022 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009023 Py_INCREF(resunicode);
9024 Py_DECREF(restuple);
9025 return resunicode;
9026}
9027
9028/* Lookup the character ch in the mapping and put the result in result,
9029 which must be decrefed by the caller.
9030 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009031static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009033{
Christian Heimes217cfd12007-12-02 14:31:20 +00009034 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009035 PyObject *x;
9036
9037 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009038 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009039 x = PyObject_GetItem(mapping, w);
9040 Py_DECREF(w);
9041 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009042 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9043 /* No mapping found means: use 1:1 mapping. */
9044 PyErr_Clear();
9045 *result = NULL;
9046 return 0;
9047 } else
9048 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009049 }
9050 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009051 *result = x;
9052 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009053 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009054 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009055 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009056 if (value < 0 || value > MAX_UNICODE) {
9057 PyErr_Format(PyExc_ValueError,
9058 "character mapping must be in range(0x%x)",
9059 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009060 Py_DECREF(x);
9061 return -1;
9062 }
9063 *result = x;
9064 return 0;
9065 }
9066 else if (PyUnicode_Check(x)) {
9067 *result = x;
9068 return 0;
9069 }
9070 else {
9071 /* wrong return value */
9072 PyErr_SetString(PyExc_TypeError,
9073 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009074 Py_DECREF(x);
9075 return -1;
9076 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009077}
Victor Stinner1194ea02014-04-04 19:37:40 +02009078
9079/* lookup the character, write the result into the writer.
9080 Return 1 if the result was written into the writer, return 0 if the mapping
9081 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009082static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009083charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9084 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009085{
Victor Stinner1194ea02014-04-04 19:37:40 +02009086 PyObject *item;
9087
9088 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009090
9091 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009093 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009095 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009096 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009097 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009098
9099 if (item == Py_None) {
9100 Py_DECREF(item);
9101 return 0;
9102 }
9103
9104 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009105 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9106 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9107 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009108 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9109 Py_DECREF(item);
9110 return -1;
9111 }
9112 Py_DECREF(item);
9113 return 1;
9114 }
9115
9116 if (!PyUnicode_Check(item)) {
9117 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009118 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009119 }
9120
9121 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9122 Py_DECREF(item);
9123 return -1;
9124 }
9125
9126 Py_DECREF(item);
9127 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009128}
9129
Victor Stinner89a76ab2014-04-05 11:44:04 +02009130static int
9131unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9132 Py_UCS1 *translate)
9133{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009134 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009135 int ret = 0;
9136
Victor Stinner89a76ab2014-04-05 11:44:04 +02009137 if (charmaptranslate_lookup(ch, mapping, &item)) {
9138 return -1;
9139 }
9140
9141 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009142 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009143 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009144 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009145 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009146 /* not found => default to 1:1 mapping */
9147 translate[ch] = ch;
9148 return 1;
9149 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009150 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009151 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009152 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9153 used it */
9154 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009155 /* invalid character or character outside ASCII:
9156 skip the fast translate */
9157 goto exit;
9158 }
9159 translate[ch] = (Py_UCS1)replace;
9160 }
9161 else if (PyUnicode_Check(item)) {
9162 Py_UCS4 replace;
9163
9164 if (PyUnicode_READY(item) == -1) {
9165 Py_DECREF(item);
9166 return -1;
9167 }
9168 if (PyUnicode_GET_LENGTH(item) != 1)
9169 goto exit;
9170
9171 replace = PyUnicode_READ_CHAR(item, 0);
9172 if (replace > 127)
9173 goto exit;
9174 translate[ch] = (Py_UCS1)replace;
9175 }
9176 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009177 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009178 goto exit;
9179 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009180 ret = 1;
9181
Benjamin Peterson1365de72014-04-07 20:15:41 -04009182 exit:
9183 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009184 return ret;
9185}
9186
9187/* Fast path for ascii => ascii translation. Return 1 if the whole string
9188 was translated into writer, return 0 if the input string was partially
9189 translated into writer, raise an exception and return -1 on error. */
9190static int
9191unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009192 _PyUnicodeWriter *writer, int ignore,
9193 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009194{
Victor Stinner872b2912014-04-05 14:27:07 +02009195 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009196 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009197 const Py_UCS1 *in, *end;
9198 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009199 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009200
Victor Stinner89a76ab2014-04-05 11:44:04 +02009201 len = PyUnicode_GET_LENGTH(input);
9202
Victor Stinner872b2912014-04-05 14:27:07 +02009203 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009204
9205 in = PyUnicode_1BYTE_DATA(input);
9206 end = in + len;
9207
9208 assert(PyUnicode_IS_ASCII(writer->buffer));
9209 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9210 out = PyUnicode_1BYTE_DATA(writer->buffer);
9211
Victor Stinner872b2912014-04-05 14:27:07 +02009212 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009213 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009214 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009215 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009216 int translate = unicode_fast_translate_lookup(mapping, ch,
9217 ascii_table);
9218 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009219 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009220 if (translate == 0)
9221 goto exit;
9222 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009223 }
Victor Stinner872b2912014-04-05 14:27:07 +02009224 if (ch2 == 0xfe) {
9225 if (ignore)
9226 continue;
9227 goto exit;
9228 }
9229 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009230 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009231 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009232 }
Victor Stinner872b2912014-04-05 14:27:07 +02009233 res = 1;
9234
9235exit:
9236 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009237 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009238 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009239}
9240
Victor Stinner3222da22015-10-01 22:07:32 +02009241static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242_PyUnicode_TranslateCharmap(PyObject *input,
9243 PyObject *mapping,
9244 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009247 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 Py_ssize_t size, i;
9249 int kind;
9250 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009251 _PyUnicodeWriter writer;
9252 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009253 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009254 PyObject *errorHandler = NULL;
9255 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009256 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009257 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009258
Guido van Rossumd57fd912000-03-10 22:53:23 +00009259 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009260 PyErr_BadArgument();
9261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264 if (PyUnicode_READY(input) == -1)
9265 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009266 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 kind = PyUnicode_KIND(input);
9268 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009270 if (size == 0)
9271 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009273 /* allocate enough for a simple 1:1 translation without
9274 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009275 _PyUnicodeWriter_Init(&writer);
9276 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009277 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278
Victor Stinner872b2912014-04-05 14:27:07 +02009279 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9280
Victor Stinner33798672016-03-01 21:59:58 +01009281 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009282 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009283 if (PyUnicode_IS_ASCII(input)) {
9284 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9285 if (res < 0) {
9286 _PyUnicodeWriter_Dealloc(&writer);
9287 return NULL;
9288 }
9289 if (res == 1)
9290 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009291 }
Victor Stinner33798672016-03-01 21:59:58 +01009292 else {
9293 i = 0;
9294 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009297 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009298 int translate;
9299 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9300 Py_ssize_t newpos;
9301 /* startpos for collecting untranslatable chars */
9302 Py_ssize_t collstart;
9303 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009304 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305
Victor Stinner1194ea02014-04-04 19:37:40 +02009306 ch = PyUnicode_READ(kind, data, i);
9307 translate = charmaptranslate_output(ch, mapping, &writer);
9308 if (translate < 0)
9309 goto onError;
9310
9311 if (translate != 0) {
9312 /* it worked => adjust input pointer */
9313 ++i;
9314 continue;
9315 }
9316
9317 /* untranslatable character */
9318 collstart = i;
9319 collend = i+1;
9320
9321 /* find all untranslatable characters */
9322 while (collend < size) {
9323 PyObject *x;
9324 ch = PyUnicode_READ(kind, data, collend);
9325 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009326 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009327 Py_XDECREF(x);
9328 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009329 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009330 ++collend;
9331 }
9332
9333 if (ignore) {
9334 i = collend;
9335 }
9336 else {
9337 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9338 reason, input, &exc,
9339 collstart, collend, &newpos);
9340 if (repunicode == NULL)
9341 goto onError;
9342 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009344 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009345 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009346 Py_DECREF(repunicode);
9347 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009348 }
9349 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009350 Py_XDECREF(exc);
9351 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009352 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353
Benjamin Peterson29060642009-01-31 22:14:21 +00009354 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009355 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009356 Py_XDECREF(exc);
9357 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358 return NULL;
9359}
9360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361/* Deprecated. Use PyUnicode_Translate instead. */
9362PyObject *
9363PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9364 Py_ssize_t size,
9365 PyObject *mapping,
9366 const char *errors)
9367{
Christian Heimes5f520f42012-09-11 14:03:25 +02009368 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009369 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 if (!unicode)
9371 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009372 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9373 Py_DECREF(unicode);
9374 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375}
9376
Alexander Belopolsky40018472011-02-26 01:02:56 +00009377PyObject *
9378PyUnicode_Translate(PyObject *str,
9379 PyObject *mapping,
9380 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009382 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009383 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009384 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385}
Tim Petersced69f82003-09-16 20:30:58 +00009386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387PyObject *
9388_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9389{
9390 if (!PyUnicode_Check(unicode)) {
9391 PyErr_BadInternalCall();
9392 return NULL;
9393 }
9394 if (PyUnicode_READY(unicode) == -1)
9395 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009396 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009397 /* If the string is already ASCII, just return the same string */
9398 Py_INCREF(unicode);
9399 return unicode;
9400 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009401
9402 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9403 PyObject *result = PyUnicode_New(len, 127);
9404 if (result == NULL) {
9405 return NULL;
9406 }
9407
9408 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9409 int kind = PyUnicode_KIND(unicode);
9410 const void *data = PyUnicode_DATA(unicode);
9411 Py_ssize_t i;
9412 for (i = 0; i < len; ++i) {
9413 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9414 if (ch < 127) {
9415 out[i] = ch;
9416 }
9417 else if (Py_UNICODE_ISSPACE(ch)) {
9418 out[i] = ' ';
9419 }
9420 else {
9421 int decimal = Py_UNICODE_TODECIMAL(ch);
9422 if (decimal < 0) {
9423 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009424 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009425 _PyUnicode_LENGTH(result) = i + 1;
9426 break;
9427 }
9428 out[i] = '0' + decimal;
9429 }
9430 }
9431
INADA Naoki16dfca42018-07-14 12:06:43 +09009432 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009433 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434}
9435
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009436PyObject *
9437PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9438 Py_ssize_t length)
9439{
Victor Stinnerf0124502011-11-21 23:12:56 +01009440 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009441 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009442 Py_UCS4 maxchar;
9443 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009444 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009445
Victor Stinner99d7ad02012-02-22 13:37:39 +01009446 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009447 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009448 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009449 if (ch > 127) {
9450 int decimal = Py_UNICODE_TODECIMAL(ch);
9451 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009452 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009453 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009454 }
9455 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009456
9457 /* Copy to a new string */
9458 decimal = PyUnicode_New(length, maxchar);
9459 if (decimal == NULL)
9460 return decimal;
9461 kind = PyUnicode_KIND(decimal);
9462 data = PyUnicode_DATA(decimal);
9463 /* Iterate over code points */
9464 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009465 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009466 if (ch > 127) {
9467 int decimal = Py_UNICODE_TODECIMAL(ch);
9468 if (decimal >= 0)
9469 ch = '0' + decimal;
9470 }
9471 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009473 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009474}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009475/* --- Decimal Encoder ---------------------------------------------------- */
9476
Alexander Belopolsky40018472011-02-26 01:02:56 +00009477int
9478PyUnicode_EncodeDecimal(Py_UNICODE *s,
9479 Py_ssize_t length,
9480 char *output,
9481 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009482{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009483 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009484 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009485 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009486 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009487
9488 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009489 PyErr_BadArgument();
9490 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009491 }
9492
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009493 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009494 if (unicode == NULL)
9495 return -1;
9496
Victor Stinner42bf7752011-11-21 22:52:58 +01009497 kind = PyUnicode_KIND(unicode);
9498 data = PyUnicode_DATA(unicode);
9499
Victor Stinnerb84d7232011-11-22 01:50:07 +01009500 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009501 PyObject *exc;
9502 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009504 Py_ssize_t startpos;
9505
9506 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009507
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009509 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009510 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009511 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009512 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 decimal = Py_UNICODE_TODECIMAL(ch);
9514 if (decimal >= 0) {
9515 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009516 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 continue;
9518 }
9519 if (0 < ch && ch < 256) {
9520 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009521 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 continue;
9523 }
Victor Stinner6345be92011-11-25 20:09:01 +01009524
Victor Stinner42bf7752011-11-21 22:52:58 +01009525 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009526 exc = NULL;
9527 raise_encode_exception(&exc, "decimal", unicode,
9528 startpos, startpos+1,
9529 "invalid decimal Unicode string");
9530 Py_XDECREF(exc);
9531 Py_DECREF(unicode);
9532 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009533 }
9534 /* 0-terminate the output string */
9535 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009536 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009537 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009538}
9539
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540/* --- Helpers ------------------------------------------------------------ */
9541
/* helper macro to fixup start/end slice values

   Clamps `end` to [0, len] and `start` to >= 0, interpreting negative
   values as offsets from `len` (Python slice semantics).  `start` and `end`
   must be modifiable lvalues; all arguments may be evaluated several times.

   Wrapped in do { } while (0) so that the expansion is a single statement
   and is safe inside unbraced if/else; invoke it with a trailing ';'. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009558any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009560 Py_ssize_t end,
9561 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009563 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009564 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 Py_ssize_t len1, len2, result;
9566
9567 kind1 = PyUnicode_KIND(s1);
9568 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009569 if (kind1 < kind2)
9570 return -1;
9571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 len1 = PyUnicode_GET_LENGTH(s1);
9573 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009574 ADJUST_INDICES(start, end, len1);
9575 if (end - start < len2)
9576 return -1;
9577
9578 buf1 = PyUnicode_DATA(s1);
9579 buf2 = PyUnicode_DATA(s2);
9580 if (len2 == 1) {
9581 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9582 result = findchar((const char *)buf1 + kind1*start,
9583 kind1, end - start, ch, direction);
9584 if (result == -1)
9585 return -1;
9586 else
9587 return start + result;
9588 }
9589
9590 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009591 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009592 if (!buf2)
9593 return -2;
9594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595
Victor Stinner794d5672011-10-10 03:21:36 +02009596 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009597 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009598 case PyUnicode_1BYTE_KIND:
9599 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9600 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9601 else
9602 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9603 break;
9604 case PyUnicode_2BYTE_KIND:
9605 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9606 break;
9607 case PyUnicode_4BYTE_KIND:
9608 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9609 break;
9610 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009611 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009612 }
9613 }
9614 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009615 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009616 case PyUnicode_1BYTE_KIND:
9617 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9618 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9619 else
9620 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9621 break;
9622 case PyUnicode_2BYTE_KIND:
9623 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9624 break;
9625 case PyUnicode_4BYTE_KIND:
9626 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9627 break;
9628 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009629 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 }
9632
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009633 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009634 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009635 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636
9637 return result;
9638}
9639
Victor Stinner59423e32018-11-26 13:40:01 +01009640/* _PyUnicode_InsertThousandsGrouping() helper functions */
9641#include "stringlib/localeutil.h"
9642
9643/**
9644 * InsertThousandsGrouping:
9645 * @writer: Unicode writer.
9646 * @n_buffer: Number of characters in @buffer.
9647 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9648 * @d_pos: Start of digits string.
9649 * @n_digits: The number of digits in the string, in which we want
9650 * to put the grouping chars.
9651 * @min_width: The minimum width of the digits in the output string.
9652 * Output will be zero-padded on the left to fill.
9653 * @grouping: see definition in localeconv().
9654 * @thousands_sep: see definition in localeconv().
9655 *
9656 * There are 2 modes: counting and filling. If @writer is NULL,
9657 * we are in counting mode, else filling mode.
9658 * If counting, the required buffer size is returned.
9659 * If filling, we know the buffer will be large enough, so we don't
9660 * need to pass in the buffer size.
9661 * Inserts thousand grouping characters (as defined by grouping and
9662 * thousands_sep) into @writer.
9663 *
9664 * Return value: -1 on error, number of characters otherwise.
9665 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009667_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009668 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009669 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009670 PyObject *digits,
9671 Py_ssize_t d_pos,
9672 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009673 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009674 const char *grouping,
9675 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009676 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677{
Xtreak3f7983a2019-01-07 20:39:14 +05309678 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009679 if (writer) {
9680 assert(digits != NULL);
9681 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009682 }
9683 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009684 assert(digits == NULL);
9685 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009686 }
Victor Stinner59423e32018-11-26 13:40:01 +01009687 assert(0 <= d_pos);
9688 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009689 assert(grouping != NULL);
9690
9691 if (digits != NULL) {
9692 if (PyUnicode_READY(digits) == -1) {
9693 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009694 }
Victor Stinner59423e32018-11-26 13:40:01 +01009695 }
9696 if (PyUnicode_READY(thousands_sep) == -1) {
9697 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009698 }
9699
Victor Stinner59423e32018-11-26 13:40:01 +01009700 Py_ssize_t count = 0;
9701 Py_ssize_t n_zeros;
9702 int loop_broken = 0;
9703 int use_separator = 0; /* First time through, don't append the
9704 separator. They only go between
9705 groups. */
9706 Py_ssize_t buffer_pos;
9707 Py_ssize_t digits_pos;
9708 Py_ssize_t len;
9709 Py_ssize_t n_chars;
9710 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9711 be looked at */
9712 /* A generator that returns all of the grouping widths, until it
9713 returns 0. */
9714 GroupGenerator groupgen;
9715 GroupGenerator_init(&groupgen, grouping);
9716 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9717
9718 /* if digits are not grouped, thousands separator
9719 should be an empty string */
9720 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9721
9722 digits_pos = d_pos + n_digits;
9723 if (writer) {
9724 buffer_pos = writer->pos + n_buffer;
9725 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9726 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 }
Victor Stinner59423e32018-11-26 13:40:01 +01009728 else {
9729 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009730 }
Victor Stinner59423e32018-11-26 13:40:01 +01009731
9732 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009733 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009734 }
Victor Stinner59423e32018-11-26 13:40:01 +01009735
9736 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9737 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9738 n_zeros = Py_MAX(0, len - remaining);
9739 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9740
9741 /* Use n_zero zero's and n_chars chars */
9742
9743 /* Count only, don't do anything. */
9744 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9745
9746 /* Copy into the writer. */
9747 InsertThousandsGrouping_fill(writer, &buffer_pos,
9748 digits, &digits_pos,
9749 n_chars, n_zeros,
9750 use_separator ? thousands_sep : NULL,
9751 thousands_sep_len, maxchar);
9752
9753 /* Use a separator next time. */
9754 use_separator = 1;
9755
9756 remaining -= n_chars;
9757 min_width -= len;
9758
9759 if (remaining <= 0 && min_width <= 0) {
9760 loop_broken = 1;
9761 break;
9762 }
9763 min_width -= thousands_sep_len;
9764 }
9765 if (!loop_broken) {
9766 /* We left the loop without using a break statement. */
9767
9768 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9769 n_zeros = Py_MAX(0, len - remaining);
9770 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9771
9772 /* Use n_zero zero's and n_chars chars */
9773 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9774
9775 /* Copy into the writer. */
9776 InsertThousandsGrouping_fill(writer, &buffer_pos,
9777 digits, &digits_pos,
9778 n_chars, n_zeros,
9779 use_separator ? thousands_sep : NULL,
9780 thousands_sep_len, maxchar);
9781 }
9782 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783}
9784
9785
Alexander Belopolsky40018472011-02-26 01:02:56 +00009786Py_ssize_t
9787PyUnicode_Count(PyObject *str,
9788 PyObject *substr,
9789 Py_ssize_t start,
9790 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009791{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009792 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009793 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009794 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009796
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009797 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009798 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009799
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009800 kind1 = PyUnicode_KIND(str);
9801 kind2 = PyUnicode_KIND(substr);
9802 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009803 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009804
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009805 len1 = PyUnicode_GET_LENGTH(str);
9806 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009808 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009809 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009810
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009811 buf1 = PyUnicode_DATA(str);
9812 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009813 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009814 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009815 if (!buf2)
9816 goto onError;
9817 }
9818
9819 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009821 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009822 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009823 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009824 buf2, len2, PY_SSIZE_T_MAX
9825 );
9826 else
9827 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009828 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009829 buf2, len2, PY_SSIZE_T_MAX
9830 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 break;
9832 case PyUnicode_2BYTE_KIND:
9833 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009834 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 buf2, len2, PY_SSIZE_T_MAX
9836 );
9837 break;
9838 case PyUnicode_4BYTE_KIND:
9839 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009840 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 buf2, len2, PY_SSIZE_T_MAX
9842 );
9843 break;
9844 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009845 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009847
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009848 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009849 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009850 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009854 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9855 if (kind2 != kind1)
9856 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858}
9859
Alexander Belopolsky40018472011-02-26 01:02:56 +00009860Py_ssize_t
9861PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009862 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009863 Py_ssize_t start,
9864 Py_ssize_t end,
9865 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009867 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009868 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009869
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009870 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871}
9872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873Py_ssize_t
9874PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9875 Py_ssize_t start, Py_ssize_t end,
9876 int direction)
9877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009879 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 if (PyUnicode_READY(str) == -1)
9881 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009882 len = PyUnicode_GET_LENGTH(str);
9883 ADJUST_INDICES(start, end, len);
9884 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009885 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009887 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9888 kind, end-start, ch, direction);
9889 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009890 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009891 else
9892 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893}
9894
Alexander Belopolsky40018472011-02-26 01:02:56 +00009895static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009896tailmatch(PyObject *self,
9897 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009898 Py_ssize_t start,
9899 Py_ssize_t end,
9900 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 int kind_self;
9903 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009904 const void *data_self;
9905 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906 Py_ssize_t offset;
9907 Py_ssize_t i;
9908 Py_ssize_t end_sub;
9909
9910 if (PyUnicode_READY(self) == -1 ||
9911 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009912 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9915 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009917 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009919 if (PyUnicode_GET_LENGTH(substring) == 0)
9920 return 1;
9921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922 kind_self = PyUnicode_KIND(self);
9923 data_self = PyUnicode_DATA(self);
9924 kind_sub = PyUnicode_KIND(substring);
9925 data_sub = PyUnicode_DATA(substring);
9926 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9927
9928 if (direction > 0)
9929 offset = end;
9930 else
9931 offset = start;
9932
9933 if (PyUnicode_READ(kind_self, data_self, offset) ==
9934 PyUnicode_READ(kind_sub, data_sub, 0) &&
9935 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9936 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9937 /* If both are of the same kind, memcmp is sufficient */
9938 if (kind_self == kind_sub) {
9939 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009940 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 data_sub,
9942 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009943 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009945 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 else {
9947 /* We do not need to compare 0 and len(substring)-1 because
9948 the if statement above ensured already that they are equal
9949 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 for (i = 1; i < end_sub; ++i) {
9951 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9952 PyUnicode_READ(kind_sub, data_sub, i))
9953 return 0;
9954 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009955 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009957 }
9958
9959 return 0;
9960}
9961
Alexander Belopolsky40018472011-02-26 01:02:56 +00009962Py_ssize_t
9963PyUnicode_Tailmatch(PyObject *str,
9964 PyObject *substr,
9965 Py_ssize_t start,
9966 Py_ssize_t end,
9967 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009969 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009970 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009971
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009972 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973}
9974
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009975static PyObject *
9976ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009978 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009979 const char *data = PyUnicode_DATA(self);
9980 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009981 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009982
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009983 res = PyUnicode_New(len, 127);
9984 if (res == NULL)
9985 return NULL;
9986 resdata = PyUnicode_DATA(res);
9987 if (lower)
9988 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009990 _Py_bytes_upper(resdata, data, len);
9991 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992}
9993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009995handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009997 Py_ssize_t j;
9998 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009999 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010000 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +000010001
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010002 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10003
10004 where ! is a negation and \p{xxx} is a character with property xxx.
10005 */
10006 for (j = i - 1; j >= 0; j--) {
10007 c = PyUnicode_READ(kind, data, j);
10008 if (!_PyUnicode_IsCaseIgnorable(c))
10009 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010011 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10012 if (final_sigma) {
10013 for (j = i + 1; j < length; j++) {
10014 c = PyUnicode_READ(kind, data, j);
10015 if (!_PyUnicode_IsCaseIgnorable(c))
10016 break;
10017 }
10018 final_sigma = j == length || !_PyUnicode_IsCased(c);
10019 }
10020 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021}
10022
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010023static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010024lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010025 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010027 /* Obscure special case. */
10028 if (c == 0x3A3) {
10029 mapped[0] = handle_capital_sigma(kind, data, length, i);
10030 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010032 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033}
10034
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010035static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010036do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010038 Py_ssize_t i, k = 0;
10039 int n_res, j;
10040 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010041
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010042 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010043 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010044 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010045 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010046 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010047 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010048 for (i = 1; i < length; i++) {
10049 c = PyUnicode_READ(kind, data, i);
10050 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10051 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010052 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010053 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010054 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010055 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010056 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010057}
10058
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010059static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010060do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010061 Py_ssize_t i, k = 0;
10062
10063 for (i = 0; i < length; i++) {
10064 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10065 int n_res, j;
10066 if (Py_UNICODE_ISUPPER(c)) {
10067 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10068 }
10069 else if (Py_UNICODE_ISLOWER(c)) {
10070 n_res = _PyUnicode_ToUpperFull(c, mapped);
10071 }
10072 else {
10073 n_res = 1;
10074 mapped[0] = c;
10075 }
10076 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010077 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010078 res[k++] = mapped[j];
10079 }
10080 }
10081 return k;
10082}
10083
10084static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010085do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010086 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010088 Py_ssize_t i, k = 0;
10089
10090 for (i = 0; i < length; i++) {
10091 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10092 int n_res, j;
10093 if (lower)
10094 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10095 else
10096 n_res = _PyUnicode_ToUpperFull(c, mapped);
10097 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010098 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010099 res[k++] = mapped[j];
10100 }
10101 }
10102 return k;
10103}
10104
10105static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010106do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010107{
10108 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10109}
10110
10111static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010112do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010113{
10114 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10115}
10116
Benjamin Petersone51757f2012-01-12 21:10:29 -050010117static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010118do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010119{
10120 Py_ssize_t i, k = 0;
10121
10122 for (i = 0; i < length; i++) {
10123 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10124 Py_UCS4 mapped[3];
10125 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10126 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010127 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010128 res[k++] = mapped[j];
10129 }
10130 }
10131 return k;
10132}
10133
10134static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010135do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010136{
10137 Py_ssize_t i, k = 0;
10138 int previous_is_cased;
10139
10140 previous_is_cased = 0;
10141 for (i = 0; i < length; i++) {
10142 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10143 Py_UCS4 mapped[3];
10144 int n_res, j;
10145
10146 if (previous_is_cased)
10147 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10148 else
10149 n_res = _PyUnicode_ToTitleFull(c, mapped);
10150
10151 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010152 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010153 res[k++] = mapped[j];
10154 }
10155
10156 previous_is_cased = _PyUnicode_IsCased(c);
10157 }
10158 return k;
10159}
10160
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010161static PyObject *
10162case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010163 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010164{
10165 PyObject *res = NULL;
10166 Py_ssize_t length, newlength = 0;
10167 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010168 const void *data;
10169 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010170 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10171
Benjamin Petersoneea48462012-01-16 14:28:50 -050010172 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010173
10174 kind = PyUnicode_KIND(self);
10175 data = PyUnicode_DATA(self);
10176 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010177 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010178 PyErr_SetString(PyExc_OverflowError, "string is too long");
10179 return NULL;
10180 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010181 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010182 if (tmp == NULL)
10183 return PyErr_NoMemory();
10184 newlength = perform(kind, data, length, tmp, &maxchar);
10185 res = PyUnicode_New(newlength, maxchar);
10186 if (res == NULL)
10187 goto leave;
10188 tmpend = tmp + newlength;
10189 outdata = PyUnicode_DATA(res);
10190 outkind = PyUnicode_KIND(res);
10191 switch (outkind) {
10192 case PyUnicode_1BYTE_KIND:
10193 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10194 break;
10195 case PyUnicode_2BYTE_KIND:
10196 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10197 break;
10198 case PyUnicode_4BYTE_KIND:
10199 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10200 break;
10201 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010202 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010203 }
10204 leave:
10205 PyMem_FREE(tmp);
10206 return res;
10207}
10208
Tim Peters8ce9f162004-08-27 01:49:32 +000010209PyObject *
10210PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010212 PyObject *res;
10213 PyObject *fseq;
10214 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010215 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010217 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010218 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010219 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010220 }
10221
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010222 /* NOTE: the following code can't call back into Python code,
10223 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010224 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010225
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010226 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010227 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010228 res = _PyUnicode_JoinArray(separator, items, seqlen);
10229 Py_DECREF(fseq);
10230 return res;
10231}
10232
10233PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010234_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010235{
10236 PyObject *res = NULL; /* the result */
10237 PyObject *sep = NULL;
10238 Py_ssize_t seplen;
10239 PyObject *item;
10240 Py_ssize_t sz, i, res_offset;
10241 Py_UCS4 maxchar;
10242 Py_UCS4 item_maxchar;
10243 int use_memcpy;
10244 unsigned char *res_data = NULL, *sep_data = NULL;
10245 PyObject *last_obj;
10246 unsigned int kind = 0;
10247
Tim Peters05eba1f2004-08-27 21:32:02 +000010248 /* If empty sequence, return u"". */
10249 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010250 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010251 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010252
Tim Peters05eba1f2004-08-27 21:32:02 +000010253 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010254 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010255 if (seqlen == 1) {
10256 if (PyUnicode_CheckExact(items[0])) {
10257 res = items[0];
10258 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010259 return res;
10260 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010261 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010262 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010263 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010264 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010265 /* Set up sep and seplen */
10266 if (separator == NULL) {
10267 /* fall back to a blank space separator */
10268 sep = PyUnicode_FromOrdinal(' ');
10269 if (!sep)
10270 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010271 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010272 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010273 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010274 else {
10275 if (!PyUnicode_Check(separator)) {
10276 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010277 "separator: expected str instance,"
10278 " %.80s found",
10279 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010280 goto onError;
10281 }
10282 if (PyUnicode_READY(separator))
10283 goto onError;
10284 sep = separator;
10285 seplen = PyUnicode_GET_LENGTH(separator);
10286 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10287 /* inc refcount to keep this code path symmetric with the
10288 above case of a blank separator */
10289 Py_INCREF(sep);
10290 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010291 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010292 }
10293
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010294 /* There are at least two things to join, or else we have a subclass
10295 * of str in the sequence.
10296 * Do a pre-pass to figure out the total amount of space we'll
10297 * need (sz), and see whether all argument are strings.
10298 */
10299 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010300#ifdef Py_DEBUG
10301 use_memcpy = 0;
10302#else
10303 use_memcpy = 1;
10304#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010305 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010306 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010307 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010308 if (!PyUnicode_Check(item)) {
10309 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010310 "sequence item %zd: expected str instance,"
10311 " %.80s found",
10312 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010313 goto onError;
10314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 if (PyUnicode_READY(item) == -1)
10316 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010317 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010319 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010320 if (i != 0) {
10321 add_sz += seplen;
10322 }
10323 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010324 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010325 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010326 goto onError;
10327 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010328 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010329 if (use_memcpy && last_obj != NULL) {
10330 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10331 use_memcpy = 0;
10332 }
10333 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010334 }
Tim Petersced69f82003-09-16 20:30:58 +000010335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010337 if (res == NULL)
10338 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010339
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010340 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010341#ifdef Py_DEBUG
10342 use_memcpy = 0;
10343#else
10344 if (use_memcpy) {
10345 res_data = PyUnicode_1BYTE_DATA(res);
10346 kind = PyUnicode_KIND(res);
10347 if (seplen != 0)
10348 sep_data = PyUnicode_1BYTE_DATA(sep);
10349 }
10350#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010351 if (use_memcpy) {
10352 for (i = 0; i < seqlen; ++i) {
10353 Py_ssize_t itemlen;
10354 item = items[i];
10355
10356 /* Copy item, and maybe the separator. */
10357 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010358 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010359 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010360 kind * seplen);
10361 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010362 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010363
10364 itemlen = PyUnicode_GET_LENGTH(item);
10365 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010366 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010367 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010368 kind * itemlen);
10369 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010370 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010371 }
10372 assert(res_data == PyUnicode_1BYTE_DATA(res)
10373 + kind * PyUnicode_GET_LENGTH(res));
10374 }
10375 else {
10376 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10377 Py_ssize_t itemlen;
10378 item = items[i];
10379
10380 /* Copy item, and maybe the separator. */
10381 if (i && seplen != 0) {
10382 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10383 res_offset += seplen;
10384 }
10385
10386 itemlen = PyUnicode_GET_LENGTH(item);
10387 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010388 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010389 res_offset += itemlen;
10390 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010391 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010392 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010393 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010396 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398
Benjamin Peterson29060642009-01-31 22:14:21 +000010399 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010401 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402 return NULL;
10403}
10404
Victor Stinnerd3f08822012-05-29 12:57:52 +020010405void
10406_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10407 Py_UCS4 fill_char)
10408{
10409 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010410 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010411 assert(PyUnicode_IS_READY(unicode));
10412 assert(unicode_modifiable(unicode));
10413 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10414 assert(start >= 0);
10415 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010416 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010417}
10418
Victor Stinner3fe55312012-01-04 00:33:50 +010010419Py_ssize_t
10420PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10421 Py_UCS4 fill_char)
10422{
10423 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010424
10425 if (!PyUnicode_Check(unicode)) {
10426 PyErr_BadInternalCall();
10427 return -1;
10428 }
10429 if (PyUnicode_READY(unicode) == -1)
10430 return -1;
10431 if (unicode_check_modifiable(unicode))
10432 return -1;
10433
Victor Stinnerd3f08822012-05-29 12:57:52 +020010434 if (start < 0) {
10435 PyErr_SetString(PyExc_IndexError, "string index out of range");
10436 return -1;
10437 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010438 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10439 PyErr_SetString(PyExc_ValueError,
10440 "fill character is bigger than "
10441 "the string maximum character");
10442 return -1;
10443 }
10444
10445 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10446 length = Py_MIN(maxlen, length);
10447 if (length <= 0)
10448 return 0;
10449
Victor Stinnerd3f08822012-05-29 12:57:52 +020010450 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010451 return length;
10452}
10453
Victor Stinner9310abb2011-10-05 00:59:23 +020010454static PyObject *
10455pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010456 Py_ssize_t left,
10457 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010459{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 PyObject *u;
10461 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010462 int kind;
10463 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010464
10465 if (left < 0)
10466 left = 0;
10467 if (right < 0)
10468 right = 0;
10469
Victor Stinnerc4b49542011-12-11 22:44:26 +010010470 if (left == 0 && right == 0)
10471 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10474 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010475 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10476 return NULL;
10477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010479 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010481 if (!u)
10482 return NULL;
10483
10484 kind = PyUnicode_KIND(u);
10485 data = PyUnicode_DATA(u);
10486 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010487 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010488 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010489 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010490 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010491 assert(_PyUnicode_CheckConsistency(u, 1));
10492 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493}
10494
Alexander Belopolsky40018472011-02-26 01:02:56 +000010495PyObject *
10496PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010500 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502
Benjamin Petersonead6b532011-12-20 17:23:42 -060010503 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010505 if (PyUnicode_IS_ASCII(string))
10506 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010507 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010508 PyUnicode_GET_LENGTH(string), keepends);
10509 else
10510 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010511 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010512 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 break;
10514 case PyUnicode_2BYTE_KIND:
10515 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010516 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 PyUnicode_GET_LENGTH(string), keepends);
10518 break;
10519 case PyUnicode_4BYTE_KIND:
10520 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010521 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 PyUnicode_GET_LENGTH(string), keepends);
10523 break;
10524 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010525 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528}
10529
Alexander Belopolsky40018472011-02-26 01:02:56 +000010530static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010531split(PyObject *self,
10532 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010533 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010535 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010536 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 Py_ssize_t len1, len2;
10538 PyObject* out;
10539
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010541 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 if (PyUnicode_READY(self) == -1)
10544 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010547 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010549 if (PyUnicode_IS_ASCII(self))
10550 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010551 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010552 PyUnicode_GET_LENGTH(self), maxcount
10553 );
10554 else
10555 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010556 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010557 PyUnicode_GET_LENGTH(self), maxcount
10558 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 case PyUnicode_2BYTE_KIND:
10560 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010561 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 PyUnicode_GET_LENGTH(self), maxcount
10563 );
10564 case PyUnicode_4BYTE_KIND:
10565 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010566 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 PyUnicode_GET_LENGTH(self), maxcount
10568 );
10569 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010570 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 }
10572
10573 if (PyUnicode_READY(substring) == -1)
10574 return NULL;
10575
10576 kind1 = PyUnicode_KIND(self);
10577 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 len1 = PyUnicode_GET_LENGTH(self);
10579 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010580 if (kind1 < kind2 || len1 < len2) {
10581 out = PyList_New(1);
10582 if (out == NULL)
10583 return NULL;
10584 Py_INCREF(self);
10585 PyList_SET_ITEM(out, 0, self);
10586 return out;
10587 }
10588 buf1 = PyUnicode_DATA(self);
10589 buf2 = PyUnicode_DATA(substring);
10590 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010591 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010592 if (!buf2)
10593 return NULL;
10594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010596 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010598 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10599 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010600 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010601 else
10602 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010603 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 break;
10605 case PyUnicode_2BYTE_KIND:
10606 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010607 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 break;
10609 case PyUnicode_4BYTE_KIND:
10610 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010611 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 break;
10613 default:
10614 out = NULL;
10615 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010616 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010617 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010618 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620}
10621
Alexander Belopolsky40018472011-02-26 01:02:56 +000010622static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010623rsplit(PyObject *self,
10624 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010625 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010626{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010627 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010628 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 Py_ssize_t len1, len2;
10630 PyObject* out;
10631
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010632 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010633 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 if (PyUnicode_READY(self) == -1)
10636 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010639 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010641 if (PyUnicode_IS_ASCII(self))
10642 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010643 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010644 PyUnicode_GET_LENGTH(self), maxcount
10645 );
10646 else
10647 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010648 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010649 PyUnicode_GET_LENGTH(self), maxcount
10650 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 case PyUnicode_2BYTE_KIND:
10652 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010653 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 PyUnicode_GET_LENGTH(self), maxcount
10655 );
10656 case PyUnicode_4BYTE_KIND:
10657 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010658 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 PyUnicode_GET_LENGTH(self), maxcount
10660 );
10661 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010662 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 }
10664
10665 if (PyUnicode_READY(substring) == -1)
10666 return NULL;
10667
10668 kind1 = PyUnicode_KIND(self);
10669 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 len1 = PyUnicode_GET_LENGTH(self);
10671 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010672 if (kind1 < kind2 || len1 < len2) {
10673 out = PyList_New(1);
10674 if (out == NULL)
10675 return NULL;
10676 Py_INCREF(self);
10677 PyList_SET_ITEM(out, 0, self);
10678 return out;
10679 }
10680 buf1 = PyUnicode_DATA(self);
10681 buf2 = PyUnicode_DATA(substring);
10682 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010683 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010684 if (!buf2)
10685 return NULL;
10686 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010688 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010690 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10691 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010692 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010693 else
10694 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010695 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 break;
10697 case PyUnicode_2BYTE_KIND:
10698 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010699 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 break;
10701 case PyUnicode_4BYTE_KIND:
10702 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010703 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 break;
10705 default:
10706 out = NULL;
10707 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010708 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010709 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010710 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010711 return out;
10712}
10713
10714static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010715anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10716 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010718 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010720 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10721 return asciilib_find(buf1, len1, buf2, len2, offset);
10722 else
10723 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 case PyUnicode_2BYTE_KIND:
10725 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10726 case PyUnicode_4BYTE_KIND:
10727 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10728 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010729 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730}
10731
10732static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010733anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10734 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010736 switch (kind) {
10737 case PyUnicode_1BYTE_KIND:
10738 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10739 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10740 else
10741 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10742 case PyUnicode_2BYTE_KIND:
10743 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10744 case PyUnicode_4BYTE_KIND:
10745 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10746 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010747 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010748}
10749
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010750static void
10751replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10752 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10753{
10754 int kind = PyUnicode_KIND(u);
10755 void *data = PyUnicode_DATA(u);
10756 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10757 if (kind == PyUnicode_1BYTE_KIND) {
10758 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10759 (Py_UCS1 *)data + len,
10760 u1, u2, maxcount);
10761 }
10762 else if (kind == PyUnicode_2BYTE_KIND) {
10763 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10764 (Py_UCS2 *)data + len,
10765 u1, u2, maxcount);
10766 }
10767 else {
10768 assert(kind == PyUnicode_4BYTE_KIND);
10769 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10770 (Py_UCS4 *)data + len,
10771 u1, u2, maxcount);
10772 }
10773}
10774
Alexander Belopolsky40018472011-02-26 01:02:56 +000010775static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776replace(PyObject *self, PyObject *str1,
10777 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010780 const char *sbuf = PyUnicode_DATA(self);
10781 const void *buf1 = PyUnicode_DATA(str1);
10782 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 int srelease = 0, release1 = 0, release2 = 0;
10784 int skind = PyUnicode_KIND(self);
10785 int kind1 = PyUnicode_KIND(str1);
10786 int kind2 = PyUnicode_KIND(str2);
10787 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10788 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10789 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010790 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010791 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010792
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010793 if (slen < len1)
10794 goto nothing;
10795
Guido van Rossumd57fd912000-03-10 22:53:23 +000010796 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010797 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010798 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010799 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800
Victor Stinner59de0ee2011-10-07 10:01:28 +020010801 if (str1 == str2)
10802 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803
Victor Stinner49a0a212011-10-12 23:46:10 +020010804 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010805 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10806 if (maxchar < maxchar_str1)
10807 /* substring too wide to be present */
10808 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010809 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10810 /* Replacing str1 with str2 may cause a maxchar reduction in the
10811 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010812 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010813 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010816 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010818 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010820 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010821 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010822 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010823
Victor Stinner69ed0f42013-04-09 21:48:24 +020010824 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010825 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010826 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010827 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010828 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010830 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010832
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010833 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10834 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010835 }
10836 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837 int rkind = skind;
10838 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010839 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (kind1 < rkind) {
10842 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010843 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 if (!buf1) goto error;
10845 release1 = 1;
10846 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010847 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010848 if (i < 0)
10849 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 if (rkind > kind2) {
10851 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010852 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 if (!buf2) goto error;
10854 release2 = 1;
10855 }
10856 else if (rkind < kind2) {
10857 /* widen self and buf1 */
10858 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010859 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010860 assert(buf1 != PyUnicode_DATA(str1));
10861 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010862 buf1 = PyUnicode_DATA(str1);
10863 release1 = 0;
10864 }
10865 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 if (!sbuf) goto error;
10867 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010868 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 if (!buf1) goto error;
10870 release1 = 1;
10871 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010872 u = PyUnicode_New(slen, maxchar);
10873 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010875 assert(PyUnicode_KIND(u) == rkind);
10876 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010877
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010878 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010879 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010880 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010882 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010884
10885 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010886 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010887 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010888 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010889 if (i == -1)
10890 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010891 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010893 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010895 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010897 }
10898 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010899 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010900 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010901 int rkind = skind;
10902 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010905 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010906 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907 if (!buf1) goto error;
10908 release1 = 1;
10909 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010910 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010911 if (n == 0)
10912 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010913 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010914 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010915 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 if (!buf2) goto error;
10917 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010920 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010922 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010923 if (!sbuf) goto error;
10924 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010925 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010926 assert(buf1 != PyUnicode_DATA(str1));
10927 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010928 buf1 = PyUnicode_DATA(str1);
10929 release1 = 0;
10930 }
10931 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 if (!buf1) goto error;
10933 release1 = 1;
10934 }
10935 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10936 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010937 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 PyErr_SetString(PyExc_OverflowError,
10939 "replace string is too long");
10940 goto error;
10941 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010942 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010943 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020010944 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020010945 goto done;
10946 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010947 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948 PyErr_SetString(PyExc_OverflowError,
10949 "replace string is too long");
10950 goto error;
10951 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010952 u = PyUnicode_New(new_size, maxchar);
10953 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010955 assert(PyUnicode_KIND(u) == rkind);
10956 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 ires = i = 0;
10958 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010959 while (n-- > 0) {
10960 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010961 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010962 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010963 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010964 if (j == -1)
10965 break;
10966 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010967 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010968 memcpy(res + rkind * ires,
10969 sbuf + rkind * i,
10970 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010972 }
10973 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010975 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010977 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010979 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010981 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010983 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010984 memcpy(res + rkind * ires,
10985 sbuf + rkind * i,
10986 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010987 }
10988 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010989 /* interleave */
10990 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010991 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010993 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010995 if (--n <= 0)
10996 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010997 memcpy(res + rkind * ires,
10998 sbuf + rkind * i,
10999 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 ires++;
11001 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011002 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011003 memcpy(res + rkind * ires,
11004 sbuf + rkind * i,
11005 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011006 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011007 }
11008
11009 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011010 unicode_adjust_maxchar(&u);
11011 if (u == NULL)
11012 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011014
11015 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011016 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11017 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11018 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011020 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011022 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011024 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011025 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011027
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011029 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011030 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11031 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11032 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011034 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011036 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011038 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011039 return unicode_result_unchanged(self);
11040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011042 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11043 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11044 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11045 if (srelease)
11046 PyMem_FREE((void *)sbuf);
11047 if (release1)
11048 PyMem_FREE((void *)buf1);
11049 if (release2)
11050 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052}
11053
11054/* --- Unicode Object Methods --------------------------------------------- */
11055
INADA Naoki3ae20562017-01-16 20:41:20 +090011056/*[clinic input]
11057str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058
INADA Naoki3ae20562017-01-16 20:41:20 +090011059Return a version of the string where each word is titlecased.
11060
11061More specifically, words start with uppercased characters and all remaining
11062cased characters have lower case.
11063[clinic start generated code]*/
11064
11065static PyObject *
11066unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011067/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011069 if (PyUnicode_READY(self) == -1)
11070 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011071 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072}
11073
INADA Naoki3ae20562017-01-16 20:41:20 +090011074/*[clinic input]
11075str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076
INADA Naoki3ae20562017-01-16 20:41:20 +090011077Return a capitalized version of the string.
11078
11079More specifically, make the first character have upper case and the rest lower
11080case.
11081[clinic start generated code]*/
11082
11083static PyObject *
11084unicode_capitalize_impl(PyObject *self)
11085/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011086{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011087 if (PyUnicode_READY(self) == -1)
11088 return NULL;
11089 if (PyUnicode_GET_LENGTH(self) == 0)
11090 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011091 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092}
11093
INADA Naoki3ae20562017-01-16 20:41:20 +090011094/*[clinic input]
11095str.casefold as unicode_casefold
11096
11097Return a version of the string suitable for caseless comparisons.
11098[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011099
11100static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011101unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011102/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011103{
11104 if (PyUnicode_READY(self) == -1)
11105 return NULL;
11106 if (PyUnicode_IS_ASCII(self))
11107 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011108 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011109}
11110
11111
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011112/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011113
11114static int
11115convert_uc(PyObject *obj, void *addr)
11116{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011117 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011118
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011119 if (!PyUnicode_Check(obj)) {
11120 PyErr_Format(PyExc_TypeError,
11121 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011122 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011123 return 0;
11124 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011125 if (PyUnicode_READY(obj) < 0)
11126 return 0;
11127 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011128 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011129 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011130 return 0;
11131 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011132 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011133 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011134}
11135
INADA Naoki3ae20562017-01-16 20:41:20 +090011136/*[clinic input]
11137str.center as unicode_center
11138
11139 width: Py_ssize_t
11140 fillchar: Py_UCS4 = ' '
11141 /
11142
11143Return a centered string of length width.
11144
11145Padding is done using the specified fill character (default is a space).
11146[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147
11148static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011149unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11150/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011152 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153
Benjamin Petersonbac79492012-01-14 13:34:47 -050011154 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155 return NULL;
11156
Victor Stinnerc4b49542011-12-11 22:44:26 +010011157 if (PyUnicode_GET_LENGTH(self) >= width)
11158 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159
Victor Stinnerc4b49542011-12-11 22:44:26 +010011160 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161 left = marg / 2 + (marg & width & 1);
11162
Victor Stinner9310abb2011-10-05 00:59:23 +020011163 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164}
11165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166/* This function assumes that str1 and str2 are readied by the caller. */
11167
Marc-André Lemburge5034372000-08-08 08:04:29 +000011168static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011169unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011170{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011171#define COMPARE(TYPE1, TYPE2) \
11172 do { \
11173 TYPE1* p1 = (TYPE1 *)data1; \
11174 TYPE2* p2 = (TYPE2 *)data2; \
11175 TYPE1* end = p1 + len; \
11176 Py_UCS4 c1, c2; \
11177 for (; p1 != end; p1++, p2++) { \
11178 c1 = *p1; \
11179 c2 = *p2; \
11180 if (c1 != c2) \
11181 return (c1 < c2) ? -1 : 1; \
11182 } \
11183 } \
11184 while (0)
11185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011187 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011188 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 kind1 = PyUnicode_KIND(str1);
11191 kind2 = PyUnicode_KIND(str2);
11192 data1 = PyUnicode_DATA(str1);
11193 data2 = PyUnicode_DATA(str2);
11194 len1 = PyUnicode_GET_LENGTH(str1);
11195 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011196 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011197
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011198 switch(kind1) {
11199 case PyUnicode_1BYTE_KIND:
11200 {
11201 switch(kind2) {
11202 case PyUnicode_1BYTE_KIND:
11203 {
11204 int cmp = memcmp(data1, data2, len);
11205 /* normalize result of memcmp() into the range [-1; 1] */
11206 if (cmp < 0)
11207 return -1;
11208 if (cmp > 0)
11209 return 1;
11210 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011211 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011212 case PyUnicode_2BYTE_KIND:
11213 COMPARE(Py_UCS1, Py_UCS2);
11214 break;
11215 case PyUnicode_4BYTE_KIND:
11216 COMPARE(Py_UCS1, Py_UCS4);
11217 break;
11218 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011219 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011220 }
11221 break;
11222 }
11223 case PyUnicode_2BYTE_KIND:
11224 {
11225 switch(kind2) {
11226 case PyUnicode_1BYTE_KIND:
11227 COMPARE(Py_UCS2, Py_UCS1);
11228 break;
11229 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011230 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011231 COMPARE(Py_UCS2, Py_UCS2);
11232 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011233 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011234 case PyUnicode_4BYTE_KIND:
11235 COMPARE(Py_UCS2, Py_UCS4);
11236 break;
11237 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011238 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011239 }
11240 break;
11241 }
11242 case PyUnicode_4BYTE_KIND:
11243 {
11244 switch(kind2) {
11245 case PyUnicode_1BYTE_KIND:
11246 COMPARE(Py_UCS4, Py_UCS1);
11247 break;
11248 case PyUnicode_2BYTE_KIND:
11249 COMPARE(Py_UCS4, Py_UCS2);
11250 break;
11251 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011252 {
11253#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11254 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11255 /* normalize result of wmemcmp() into the range [-1; 1] */
11256 if (cmp < 0)
11257 return -1;
11258 if (cmp > 0)
11259 return 1;
11260#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011261 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011262#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011263 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011264 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011265 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011266 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011267 }
11268 break;
11269 }
11270 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011271 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011272 }
11273
Victor Stinner770e19e2012-10-04 22:59:45 +020011274 if (len1 == len2)
11275 return 0;
11276 if (len1 < len2)
11277 return -1;
11278 else
11279 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011280
11281#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011282}
11283
Benjamin Peterson621b4302016-09-09 13:54:34 -070011284static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011285unicode_compare_eq(PyObject *str1, PyObject *str2)
11286{
11287 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011288 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011289 Py_ssize_t len;
11290 int cmp;
11291
Victor Stinnere5567ad2012-10-23 02:48:49 +020011292 len = PyUnicode_GET_LENGTH(str1);
11293 if (PyUnicode_GET_LENGTH(str2) != len)
11294 return 0;
11295 kind = PyUnicode_KIND(str1);
11296 if (PyUnicode_KIND(str2) != kind)
11297 return 0;
11298 data1 = PyUnicode_DATA(str1);
11299 data2 = PyUnicode_DATA(str2);
11300
11301 cmp = memcmp(data1, data2, len * kind);
11302 return (cmp == 0);
11303}
11304
11305
Alexander Belopolsky40018472011-02-26 01:02:56 +000011306int
11307PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11310 if (PyUnicode_READY(left) == -1 ||
11311 PyUnicode_READY(right) == -1)
11312 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011313
11314 /* a string is equal to itself */
11315 if (left == right)
11316 return 0;
11317
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011318 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011320 PyErr_Format(PyExc_TypeError,
11321 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011322 Py_TYPE(left)->tp_name,
11323 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324 return -1;
11325}
11326
Martin v. Löwis5b222132007-06-10 09:51:05 +000011327int
11328PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11329{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 Py_ssize_t i;
11331 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011333 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334
Victor Stinner910337b2011-10-03 03:20:16 +020011335 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011336 if (!PyUnicode_IS_READY(uni)) {
11337 const wchar_t *ws = _PyUnicode_WSTR(uni);
11338 /* Compare Unicode string and source character set string */
11339 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11340 if (chr != ustr[i])
11341 return (chr < ustr[i]) ? -1 : 1;
11342 }
11343 /* This check keeps Python strings that end in '\0' from comparing equal
11344 to C strings identical up to that point. */
11345 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11346 return 1; /* uni is longer */
11347 if (ustr[i])
11348 return -1; /* str is longer */
11349 return 0;
11350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011352 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011353 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011354 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011355 size_t len, len2 = strlen(str);
11356 int cmp;
11357
11358 len = Py_MIN(len1, len2);
11359 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011360 if (cmp != 0) {
11361 if (cmp < 0)
11362 return -1;
11363 else
11364 return 1;
11365 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011366 if (len1 > len2)
11367 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011368 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011369 return -1; /* str is longer */
11370 return 0;
11371 }
11372 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011373 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011374 /* Compare Unicode string and source character set string */
11375 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011376 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011377 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11378 /* This check keeps Python strings that end in '\0' from comparing equal
11379 to C strings identical up to that point. */
11380 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11381 return 1; /* uni is longer */
11382 if (str[i])
11383 return -1; /* str is longer */
11384 return 0;
11385 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011386}
11387
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011388static int
11389non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11390{
11391 size_t i, len;
11392 const wchar_t *p;
11393 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11394 if (strlen(str) != len)
11395 return 0;
11396 p = _PyUnicode_WSTR(unicode);
11397 assert(p);
11398 for (i = 0; i < len; i++) {
11399 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011400 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011401 return 0;
11402 }
11403 return 1;
11404}
11405
11406int
11407_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11408{
11409 size_t len;
11410 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011411 assert(str);
11412#ifndef NDEBUG
11413 for (const char *p = str; *p; p++) {
11414 assert((unsigned char)*p < 128);
11415 }
11416#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011417 if (PyUnicode_READY(unicode) == -1) {
11418 /* Memory error or bad data */
11419 PyErr_Clear();
11420 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11421 }
11422 if (!PyUnicode_IS_ASCII(unicode))
11423 return 0;
11424 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11425 return strlen(str) == len &&
11426 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11427}
11428
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011429int
11430_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11431{
11432 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011433
11434 assert(_PyUnicode_CHECK(left));
11435 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011436#ifndef NDEBUG
11437 for (const char *p = right->string; *p; p++) {
11438 assert((unsigned char)*p < 128);
11439 }
11440#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011441
11442 if (PyUnicode_READY(left) == -1) {
11443 /* memory error or bad data */
11444 PyErr_Clear();
11445 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11446 }
11447
11448 if (!PyUnicode_IS_ASCII(left))
11449 return 0;
11450
11451 right_uni = _PyUnicode_FromId(right); /* borrowed */
11452 if (right_uni == NULL) {
11453 /* memory error or bad data */
11454 PyErr_Clear();
11455 return _PyUnicode_EqualToASCIIString(left, right->string);
11456 }
11457
11458 if (left == right_uni)
11459 return 1;
11460
11461 if (PyUnicode_CHECK_INTERNED(left))
11462 return 0;
11463
Victor Stinner607b1022020-05-05 18:50:30 +020011464#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011465 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011466 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011467 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11468 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011469#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011470
11471 return unicode_compare_eq(left, right_uni);
11472}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011473
Alexander Belopolsky40018472011-02-26 01:02:56 +000011474PyObject *
11475PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011476{
11477 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011478
Victor Stinnere5567ad2012-10-23 02:48:49 +020011479 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11480 Py_RETURN_NOTIMPLEMENTED;
11481
11482 if (PyUnicode_READY(left) == -1 ||
11483 PyUnicode_READY(right) == -1)
11484 return NULL;
11485
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011486 if (left == right) {
11487 switch (op) {
11488 case Py_EQ:
11489 case Py_LE:
11490 case Py_GE:
11491 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011492 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011493 case Py_NE:
11494 case Py_LT:
11495 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011496 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011497 default:
11498 PyErr_BadArgument();
11499 return NULL;
11500 }
11501 }
11502 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011503 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011504 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011505 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011506 }
11507 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011508 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011509 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011510 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011511}
11512
Alexander Belopolsky40018472011-02-26 01:02:56 +000011513int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011514_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11515{
11516 return unicode_eq(aa, bb);
11517}
11518
11519int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011520PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011521{
Victor Stinner77282cb2013-04-14 19:22:47 +020011522 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011523 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011525 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011526
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011527 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011528 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011529 "'in <string>' requires string as left operand, not %.100s",
11530 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011531 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011532 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011533 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011534 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011535 if (ensure_unicode(str) < 0)
11536 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011539 kind2 = PyUnicode_KIND(substr);
11540 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011541 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011543 len2 = PyUnicode_GET_LENGTH(substr);
11544 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011545 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011546 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011547 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011548 if (len2 == 1) {
11549 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11550 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011551 return result;
11552 }
11553 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011554 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011555 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011556 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558
Victor Stinner77282cb2013-04-14 19:22:47 +020011559 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 case PyUnicode_1BYTE_KIND:
11561 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11562 break;
11563 case PyUnicode_2BYTE_KIND:
11564 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11565 break;
11566 case PyUnicode_4BYTE_KIND:
11567 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11568 break;
11569 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011570 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011572
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011573 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011574 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011575 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576
Guido van Rossum403d68b2000-03-13 15:55:09 +000011577 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011578}
11579
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580/* Concat to string or Unicode object giving a new Unicode object. */
11581
Alexander Belopolsky40018472011-02-26 01:02:56 +000011582PyObject *
11583PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011585 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011586 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011587 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011589 if (ensure_unicode(left) < 0)
11590 return NULL;
11591
11592 if (!PyUnicode_Check(right)) {
11593 PyErr_Format(PyExc_TypeError,
11594 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011595 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011596 return NULL;
11597 }
11598 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011599 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600
11601 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011602 PyObject *empty = unicode_get_empty(); // Borrowed reference
11603 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011604 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011605 }
11606 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011607 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011610 left_len = PyUnicode_GET_LENGTH(left);
11611 right_len = PyUnicode_GET_LENGTH(right);
11612 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011613 PyErr_SetString(PyExc_OverflowError,
11614 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011615 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011616 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011617 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011618
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011619 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11620 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011621 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011624 result = PyUnicode_New(new_len, maxchar);
11625 if (result == NULL)
11626 return NULL;
11627 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11628 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11629 assert(_PyUnicode_CheckConsistency(result, 1));
11630 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631}
11632
Walter Dörwald1ab83302007-05-18 17:15:44 +000011633void
Victor Stinner23e56682011-10-03 03:54:37 +020011634PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011635{
Victor Stinner23e56682011-10-03 03:54:37 +020011636 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011637 Py_UCS4 maxchar, maxchar2;
11638 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011639
11640 if (p_left == NULL) {
11641 if (!PyErr_Occurred())
11642 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011643 return;
11644 }
Victor Stinner23e56682011-10-03 03:54:37 +020011645 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011646 if (right == NULL || left == NULL
11647 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011648 if (!PyErr_Occurred())
11649 PyErr_BadInternalCall();
11650 goto error;
11651 }
11652
Benjamin Petersonbac79492012-01-14 13:34:47 -050011653 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011654 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011655 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011656 goto error;
11657
Victor Stinner488fa492011-12-12 00:01:39 +010011658 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011659 PyObject *empty = unicode_get_empty(); // Borrowed reference
11660 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011661 Py_DECREF(left);
11662 Py_INCREF(right);
11663 *p_left = right;
11664 return;
11665 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011666 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011667 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011668 }
Victor Stinner488fa492011-12-12 00:01:39 +010011669
11670 left_len = PyUnicode_GET_LENGTH(left);
11671 right_len = PyUnicode_GET_LENGTH(right);
11672 if (left_len > PY_SSIZE_T_MAX - right_len) {
11673 PyErr_SetString(PyExc_OverflowError,
11674 "strings are too large to concat");
11675 goto error;
11676 }
11677 new_len = left_len + right_len;
11678
11679 if (unicode_modifiable(left)
11680 && PyUnicode_CheckExact(right)
11681 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011682 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11683 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011684 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011685 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011686 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11687 {
11688 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011689 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011690 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011691
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011692 /* copy 'right' into the newly allocated area of 'left' */
11693 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011694 }
Victor Stinner488fa492011-12-12 00:01:39 +010011695 else {
11696 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11697 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011698 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011699
Victor Stinner488fa492011-12-12 00:01:39 +010011700 /* Concat the two Unicode strings */
11701 res = PyUnicode_New(new_len, maxchar);
11702 if (res == NULL)
11703 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011704 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11705 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011706 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011707 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011708 }
11709 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011710 return;
11711
11712error:
Victor Stinner488fa492011-12-12 00:01:39 +010011713 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011714}
11715
11716void
11717PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11718{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011719 PyUnicode_Append(pleft, right);
11720 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011721}
11722
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011723/*
11724Wraps stringlib_parse_args_finds() and additionally ensures that the
11725first argument is a unicode object.
11726*/
11727
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011728static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011729parse_args_finds_unicode(const char * function_name, PyObject *args,
11730 PyObject **substring,
11731 Py_ssize_t *start, Py_ssize_t *end)
11732{
11733 if(stringlib_parse_args_finds(function_name, args, substring,
11734 start, end)) {
11735 if (ensure_unicode(*substring) < 0)
11736 return 0;
11737 return 1;
11738 }
11739 return 0;
11740}
11741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011742PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011743 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011745Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011746string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011747interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748
11749static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011750unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011752 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011753 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011754 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011756 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011757 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011760 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 kind1 = PyUnicode_KIND(self);
11764 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011765 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011766 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 len1 = PyUnicode_GET_LENGTH(self);
11769 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011771 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011772 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011773
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011774 buf1 = PyUnicode_DATA(self);
11775 buf2 = PyUnicode_DATA(substring);
11776 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011777 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011778 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011779 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011780 }
11781 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 case PyUnicode_1BYTE_KIND:
11783 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011784 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 buf2, len2, PY_SSIZE_T_MAX
11786 );
11787 break;
11788 case PyUnicode_2BYTE_KIND:
11789 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011790 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 buf2, len2, PY_SSIZE_T_MAX
11792 );
11793 break;
11794 case PyUnicode_4BYTE_KIND:
11795 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011796 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 buf2, len2, PY_SSIZE_T_MAX
11798 );
11799 break;
11800 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011801 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 }
11803
11804 result = PyLong_FromSsize_t(iresult);
11805
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011806 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011807 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011808 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810 return result;
11811}
11812
INADA Naoki3ae20562017-01-16 20:41:20 +090011813/*[clinic input]
11814str.encode as unicode_encode
11815
11816 encoding: str(c_default="NULL") = 'utf-8'
11817 The encoding in which to encode the string.
11818 errors: str(c_default="NULL") = 'strict'
11819 The error handling scheme to use for encoding errors.
11820 The default is 'strict' meaning that encoding errors raise a
11821 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11822 'xmlcharrefreplace' as well as any other name registered with
11823 codecs.register_error that can handle UnicodeEncodeErrors.
11824
11825Encode the string using the codec registered for encoding.
11826[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827
11828static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011829unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011830/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011832 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011833}
11834
INADA Naoki3ae20562017-01-16 20:41:20 +090011835/*[clinic input]
11836str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837
INADA Naoki3ae20562017-01-16 20:41:20 +090011838 tabsize: int = 8
11839
11840Return a copy where all tab characters are expanded using spaces.
11841
11842If tabsize is not given, a tab size of 8 characters is assumed.
11843[clinic start generated code]*/
11844
11845static PyObject *
11846unicode_expandtabs_impl(PyObject *self, int tabsize)
11847/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011849 Py_ssize_t i, j, line_pos, src_len, incr;
11850 Py_UCS4 ch;
11851 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011852 const void *src_data;
11853 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011854 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011855 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856
Antoine Pitrou22425222011-10-04 19:10:51 +020011857 if (PyUnicode_READY(self) == -1)
11858 return NULL;
11859
Thomas Wouters7e474022000-07-16 12:04:32 +000011860 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011861 src_len = PyUnicode_GET_LENGTH(self);
11862 i = j = line_pos = 0;
11863 kind = PyUnicode_KIND(self);
11864 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011865 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011866 for (; i < src_len; i++) {
11867 ch = PyUnicode_READ(kind, src_data, i);
11868 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011869 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011871 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011872 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011873 goto overflow;
11874 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011875 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011876 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011880 goto overflow;
11881 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011883 if (ch == '\n' || ch == '\r')
11884 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011886 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011887 if (!found)
11888 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011889
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011891 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892 if (!u)
11893 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011894 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895
Antoine Pitroue71d5742011-10-04 15:55:09 +020011896 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897
Antoine Pitroue71d5742011-10-04 15:55:09 +020011898 for (; i < src_len; i++) {
11899 ch = PyUnicode_READ(kind, src_data, i);
11900 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011901 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011902 incr = tabsize - (line_pos % tabsize);
11903 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011904 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011905 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011907 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011909 line_pos++;
11910 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011911 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011912 if (ch == '\n' || ch == '\r')
11913 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011915 }
11916 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011917 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011918
Antoine Pitroue71d5742011-10-04 15:55:09 +020011919 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011920 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11921 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922}
11923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011924PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011925 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926\n\
11927Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011928such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929arguments start and end are interpreted as in slice notation.\n\
11930\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011931Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932
11933static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011936 /* initialize variables to prevent gcc warning */
11937 PyObject *substring = NULL;
11938 Py_ssize_t start = 0;
11939 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011940 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011942 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011945 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011948 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 if (result == -2)
11951 return NULL;
11952
Christian Heimes217cfd12007-12-02 14:31:20 +000011953 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954}
11955
11956static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011957unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011959 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011960 enum PyUnicode_Kind kind;
11961 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011962
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011963 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011964 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011966 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011967 if (PyUnicode_READY(self) == -1) {
11968 return NULL;
11969 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011970 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11971 PyErr_SetString(PyExc_IndexError, "string index out of range");
11972 return NULL;
11973 }
11974 kind = PyUnicode_KIND(self);
11975 data = PyUnicode_DATA(self);
11976 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011977 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978}
11979
Guido van Rossumc2504932007-09-18 19:42:40 +000011980/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011981 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011982static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011983unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011985 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011986
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011987#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011988 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011989#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 if (_PyUnicode_HASH(self) != -1)
11991 return _PyUnicode_HASH(self);
11992 if (PyUnicode_READY(self) == -1)
11993 return -1;
animalizea1d14252019-01-02 20:16:06 +080011994
Christian Heimes985ecdc2013-11-20 11:46:18 +010011995 x = _Py_HashBytes(PyUnicode_DATA(self),
11996 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011998 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999}
12000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012001PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003\n\
oldkaa0735f2018-02-02 16:52:55 +080012004Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012005such that sub is contained within S[start:end]. Optional\n\
12006arguments start and end are interpreted as in slice notation.\n\
12007\n\
12008Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009
12010static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012013 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012014 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012015 PyObject *substring = NULL;
12016 Py_ssize_t start = 0;
12017 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012019 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012022 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012025 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 if (result == -2)
12028 return NULL;
12029
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030 if (result < 0) {
12031 PyErr_SetString(PyExc_ValueError, "substring not found");
12032 return NULL;
12033 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012034
Christian Heimes217cfd12007-12-02 14:31:20 +000012035 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036}
12037
INADA Naoki3ae20562017-01-16 20:41:20 +090012038/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012039str.isascii as unicode_isascii
12040
12041Return True if all characters in the string are ASCII, False otherwise.
12042
12043ASCII characters have code points in the range U+0000-U+007F.
12044Empty string is ASCII too.
12045[clinic start generated code]*/
12046
12047static PyObject *
12048unicode_isascii_impl(PyObject *self)
12049/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12050{
12051 if (PyUnicode_READY(self) == -1) {
12052 return NULL;
12053 }
12054 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12055}
12056
12057/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012058str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059
INADA Naoki3ae20562017-01-16 20:41:20 +090012060Return True if the string is a lowercase string, False otherwise.
12061
12062A string is lowercase if all cased characters in the string are lowercase and
12063there is at least one cased character in the string.
12064[clinic start generated code]*/
12065
12066static PyObject *
12067unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012068/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 Py_ssize_t i, length;
12071 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012072 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073 int cased;
12074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 if (PyUnicode_READY(self) == -1)
12076 return NULL;
12077 length = PyUnicode_GET_LENGTH(self);
12078 kind = PyUnicode_KIND(self);
12079 data = PyUnicode_DATA(self);
12080
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 if (length == 1)
12083 return PyBool_FromLong(
12084 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012086 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012088 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012089
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 for (i = 0; i < length; i++) {
12092 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012093
Benjamin Peterson29060642009-01-31 22:14:21 +000012094 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012095 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012096 else if (!cased && Py_UNICODE_ISLOWER(ch))
12097 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012099 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100}
12101
INADA Naoki3ae20562017-01-16 20:41:20 +090012102/*[clinic input]
12103str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104
INADA Naoki3ae20562017-01-16 20:41:20 +090012105Return True if the string is an uppercase string, False otherwise.
12106
12107A string is uppercase if all cased characters in the string are uppercase and
12108there is at least one cased character in the string.
12109[clinic start generated code]*/
12110
12111static PyObject *
12112unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012113/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 Py_ssize_t i, length;
12116 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012117 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118 int cased;
12119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 if (PyUnicode_READY(self) == -1)
12121 return NULL;
12122 length = PyUnicode_GET_LENGTH(self);
12123 kind = PyUnicode_KIND(self);
12124 data = PyUnicode_DATA(self);
12125
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 if (length == 1)
12128 return PyBool_FromLong(
12129 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012131 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012133 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012134
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 for (i = 0; i < length; i++) {
12137 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012138
Benjamin Peterson29060642009-01-31 22:14:21 +000012139 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012140 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012141 else if (!cased && Py_UNICODE_ISUPPER(ch))
12142 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012144 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145}
12146
INADA Naoki3ae20562017-01-16 20:41:20 +090012147/*[clinic input]
12148str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149
INADA Naoki3ae20562017-01-16 20:41:20 +090012150Return True if the string is a title-cased string, False otherwise.
12151
12152In a title-cased string, upper- and title-case characters may only
12153follow uncased characters and lowercase characters only cased ones.
12154[clinic start generated code]*/
12155
12156static PyObject *
12157unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012158/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 Py_ssize_t i, length;
12161 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012162 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163 int cased, previous_is_cased;
12164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 if (PyUnicode_READY(self) == -1)
12166 return NULL;
12167 length = PyUnicode_GET_LENGTH(self);
12168 kind = PyUnicode_KIND(self);
12169 data = PyUnicode_DATA(self);
12170
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 if (length == 1) {
12173 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12174 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12175 (Py_UNICODE_ISUPPER(ch) != 0));
12176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012178 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012180 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012181
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182 cased = 0;
12183 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 for (i = 0; i < length; i++) {
12185 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012186
Benjamin Peterson29060642009-01-31 22:14:21 +000012187 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12188 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012189 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012190 previous_is_cased = 1;
12191 cased = 1;
12192 }
12193 else if (Py_UNICODE_ISLOWER(ch)) {
12194 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012195 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012196 previous_is_cased = 1;
12197 cased = 1;
12198 }
12199 else
12200 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012202 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203}
12204
INADA Naoki3ae20562017-01-16 20:41:20 +090012205/*[clinic input]
12206str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207
INADA Naoki3ae20562017-01-16 20:41:20 +090012208Return True if the string is a whitespace string, False otherwise.
12209
12210A string is whitespace if all characters in the string are whitespace and there
12211is at least one character in the string.
12212[clinic start generated code]*/
12213
12214static PyObject *
12215unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012216/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 Py_ssize_t i, length;
12219 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012220 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221
12222 if (PyUnicode_READY(self) == -1)
12223 return NULL;
12224 length = PyUnicode_GET_LENGTH(self);
12225 kind = PyUnicode_KIND(self);
12226 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 if (length == 1)
12230 return PyBool_FromLong(
12231 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012233 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012235 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 for (i = 0; i < length; i++) {
12238 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012239 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012240 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012242 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243}
12244
INADA Naoki3ae20562017-01-16 20:41:20 +090012245/*[clinic input]
12246str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012247
INADA Naoki3ae20562017-01-16 20:41:20 +090012248Return True if the string is an alphabetic string, False otherwise.
12249
12250A string is alphabetic if all characters in the string are alphabetic and there
12251is at least one character in the string.
12252[clinic start generated code]*/
12253
12254static PyObject *
12255unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012256/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012257{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258 Py_ssize_t i, length;
12259 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012260 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261
12262 if (PyUnicode_READY(self) == -1)
12263 return NULL;
12264 length = PyUnicode_GET_LENGTH(self);
12265 kind = PyUnicode_KIND(self);
12266 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012267
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012268 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269 if (length == 1)
12270 return PyBool_FromLong(
12271 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012272
12273 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012275 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 for (i = 0; i < length; i++) {
12278 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012279 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012280 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012281 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012282}
12283
INADA Naoki3ae20562017-01-16 20:41:20 +090012284/*[clinic input]
12285str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012286
INADA Naoki3ae20562017-01-16 20:41:20 +090012287Return True if the string is an alpha-numeric string, False otherwise.
12288
12289A string is alpha-numeric if all characters in the string are alpha-numeric and
12290there is at least one character in the string.
12291[clinic start generated code]*/
12292
12293static PyObject *
12294unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012295/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012296{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012298 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 Py_ssize_t len, i;
12300
12301 if (PyUnicode_READY(self) == -1)
12302 return NULL;
12303
12304 kind = PyUnicode_KIND(self);
12305 data = PyUnicode_DATA(self);
12306 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012307
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012308 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 if (len == 1) {
12310 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12311 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12312 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012313
12314 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012316 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 for (i = 0; i < len; i++) {
12319 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012320 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012321 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012322 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012323 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012324}
12325
INADA Naoki3ae20562017-01-16 20:41:20 +090012326/*[clinic input]
12327str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328
INADA Naoki3ae20562017-01-16 20:41:20 +090012329Return True if the string is a decimal string, False otherwise.
12330
12331A string is a decimal string if all characters in the string are decimal and
12332there is at least one character in the string.
12333[clinic start generated code]*/
12334
12335static PyObject *
12336unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012337/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 Py_ssize_t i, length;
12340 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012341 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342
12343 if (PyUnicode_READY(self) == -1)
12344 return NULL;
12345 length = PyUnicode_GET_LENGTH(self);
12346 kind = PyUnicode_KIND(self);
12347 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012348
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 if (length == 1)
12351 return PyBool_FromLong(
12352 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012354 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012356 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 for (i = 0; i < length; i++) {
12359 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012360 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012362 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363}
12364
INADA Naoki3ae20562017-01-16 20:41:20 +090012365/*[clinic input]
12366str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012367
INADA Naoki3ae20562017-01-16 20:41:20 +090012368Return True if the string is a digit string, False otherwise.
12369
12370A string is a digit string if all characters in the string are digits and there
12371is at least one character in the string.
12372[clinic start generated code]*/
12373
12374static PyObject *
12375unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012376/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378 Py_ssize_t i, length;
12379 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012380 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012381
12382 if (PyUnicode_READY(self) == -1)
12383 return NULL;
12384 length = PyUnicode_GET_LENGTH(self);
12385 kind = PyUnicode_KIND(self);
12386 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 if (length == 1) {
12390 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12391 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12392 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012394 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012395 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012396 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012398 for (i = 0; i < length; i++) {
12399 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012400 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012401 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012402 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012403}
12404
INADA Naoki3ae20562017-01-16 20:41:20 +090012405/*[clinic input]
12406str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012407
INADA Naoki3ae20562017-01-16 20:41:20 +090012408Return True if the string is a numeric string, False otherwise.
12409
12410A string is numeric if all characters in the string are numeric and there is at
12411least one character in the string.
12412[clinic start generated code]*/
12413
12414static PyObject *
12415unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012416/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 Py_ssize_t i, length;
12419 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012420 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421
12422 if (PyUnicode_READY(self) == -1)
12423 return NULL;
12424 length = PyUnicode_GET_LENGTH(self);
12425 kind = PyUnicode_KIND(self);
12426 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 if (length == 1)
12430 return PyBool_FromLong(
12431 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012433 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012435 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 for (i = 0; i < length; i++) {
12438 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012439 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012441 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442}
12443
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012444Py_ssize_t
12445_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012446{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012448 if (PyUnicode_READY(self) == -1)
12449 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012450
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012451 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012452 if (len == 0) {
12453 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012454 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 }
12456
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012457 int kind = PyUnicode_KIND(self);
12458 const void *data = PyUnicode_DATA(self);
12459 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012460 /* PEP 3131 says that the first character must be in
12461 XID_Start and subsequent characters in XID_Continue,
12462 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012463 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012464 letters, digits, underscore). However, given the current
12465 definition of XID_Start and XID_Continue, it is sufficient
12466 to check just for these, except that _ must be allowed
12467 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012468 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012469 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012470 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012471
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012472 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012473 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012474 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012475 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012476 }
12477 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012478 return i;
12479}
12480
12481int
12482PyUnicode_IsIdentifier(PyObject *self)
12483{
12484 if (PyUnicode_IS_READY(self)) {
12485 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12486 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12487 /* an empty string is not a valid identifier */
12488 return len && i == len;
12489 }
12490 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012491_Py_COMP_DIAG_PUSH
12492_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012493 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012494 if (len == 0) {
12495 /* an empty string is not a valid identifier */
12496 return 0;
12497 }
12498
12499 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012500 Py_UCS4 ch = wstr[i++];
12501#if SIZEOF_WCHAR_T == 2
12502 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12503 && i < len
12504 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12505 {
12506 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12507 i++;
12508 }
12509#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012510 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12511 return 0;
12512 }
12513
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012514 while (i < len) {
12515 ch = wstr[i++];
12516#if SIZEOF_WCHAR_T == 2
12517 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12518 && i < len
12519 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12520 {
12521 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12522 i++;
12523 }
12524#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012525 if (!_PyUnicode_IsXidContinue(ch)) {
12526 return 0;
12527 }
12528 }
12529 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012530_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012531 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012532}
12533
INADA Naoki3ae20562017-01-16 20:41:20 +090012534/*[clinic input]
12535str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012536
INADA Naoki3ae20562017-01-16 20:41:20 +090012537Return True if the string is a valid Python identifier, False otherwise.
12538
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012539Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012540such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012541[clinic start generated code]*/
12542
12543static PyObject *
12544unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012545/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012546{
12547 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12548}
12549
INADA Naoki3ae20562017-01-16 20:41:20 +090012550/*[clinic input]
12551str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012552
INADA Naoki3ae20562017-01-16 20:41:20 +090012553Return True if the string is printable, False otherwise.
12554
12555A string is printable if all of its characters are considered printable in
12556repr() or if it is empty.
12557[clinic start generated code]*/
12558
12559static PyObject *
12560unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012561/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012562{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 Py_ssize_t i, length;
12564 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012565 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566
12567 if (PyUnicode_READY(self) == -1)
12568 return NULL;
12569 length = PyUnicode_GET_LENGTH(self);
12570 kind = PyUnicode_KIND(self);
12571 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012572
12573 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 if (length == 1)
12575 return PyBool_FromLong(
12576 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 for (i = 0; i < length; i++) {
12579 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012580 Py_RETURN_FALSE;
12581 }
12582 }
12583 Py_RETURN_TRUE;
12584}
12585
INADA Naoki3ae20562017-01-16 20:41:20 +090012586/*[clinic input]
12587str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588
INADA Naoki3ae20562017-01-16 20:41:20 +090012589 iterable: object
12590 /
12591
12592Concatenate any number of strings.
12593
Martin Panter91a88662017-01-24 00:30:06 +000012594The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012595The result is returned as a new string.
12596
12597Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12598[clinic start generated code]*/
12599
12600static PyObject *
12601unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012602/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603{
INADA Naoki3ae20562017-01-16 20:41:20 +090012604 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605}
12606
Martin v. Löwis18e16552006-02-15 17:27:45 +000012607static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012608unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 if (PyUnicode_READY(self) == -1)
12611 return -1;
12612 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613}
12614
INADA Naoki3ae20562017-01-16 20:41:20 +090012615/*[clinic input]
12616str.ljust as unicode_ljust
12617
12618 width: Py_ssize_t
12619 fillchar: Py_UCS4 = ' '
12620 /
12621
12622Return a left-justified string of length width.
12623
12624Padding is done using the specified fill character (default is a space).
12625[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626
12627static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012628unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12629/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012631 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633
Victor Stinnerc4b49542011-12-11 22:44:26 +010012634 if (PyUnicode_GET_LENGTH(self) >= width)
12635 return unicode_result_unchanged(self);
12636
12637 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638}
12639
INADA Naoki3ae20562017-01-16 20:41:20 +090012640/*[clinic input]
12641str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642
INADA Naoki3ae20562017-01-16 20:41:20 +090012643Return a copy of the string converted to lowercase.
12644[clinic start generated code]*/
12645
12646static PyObject *
12647unicode_lower_impl(PyObject *self)
12648/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012650 if (PyUnicode_READY(self) == -1)
12651 return NULL;
12652 if (PyUnicode_IS_ASCII(self))
12653 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012654 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655}
12656
/* Strip modes passed as `striptype` to the strip helpers. The values also
   serve as indices into stripfuncnames below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above. Both the strings and the
   pointer table itself are const, so the table is read-only and cannot be
   reassigned by accident. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method corresponding to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012665
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012666/* externally visible for str.strip(unicode) */
12667PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012668_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012669{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012670 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 int kind;
12672 Py_ssize_t i, j, len;
12673 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012674 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12677 return NULL;
12678
12679 kind = PyUnicode_KIND(self);
12680 data = PyUnicode_DATA(self);
12681 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012682 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12684 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012685 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012686
Benjamin Peterson14339b62009-01-31 16:36:08 +000012687 i = 0;
12688 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012689 while (i < len) {
12690 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12691 if (!BLOOM(sepmask, ch))
12692 break;
12693 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12694 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 i++;
12696 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012697 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012698
Benjamin Peterson14339b62009-01-31 16:36:08 +000012699 j = len;
12700 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012701 j--;
12702 while (j >= i) {
12703 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12704 if (!BLOOM(sepmask, ch))
12705 break;
12706 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12707 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012708 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012709 }
12710
Benjamin Peterson29060642009-01-31 22:14:21 +000012711 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012712 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012713
Victor Stinner7931d9a2011-11-04 00:22:48 +010012714 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715}
12716
12717PyObject*
12718PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12719{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012720 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012722 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723
Victor Stinnerde636f32011-10-01 03:55:54 +020012724 if (PyUnicode_READY(self) == -1)
12725 return NULL;
12726
Victor Stinner684d5fd2012-05-03 02:32:34 +020012727 length = PyUnicode_GET_LENGTH(self);
12728 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012729
Victor Stinner684d5fd2012-05-03 02:32:34 +020012730 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012731 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732
Victor Stinnerde636f32011-10-01 03:55:54 +020012733 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012734 PyErr_SetString(PyExc_IndexError, "string index out of range");
12735 return NULL;
12736 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012737 if (start >= length || end < start)
12738 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012739
Victor Stinner684d5fd2012-05-03 02:32:34 +020012740 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012741 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012742 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012743 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012744 }
12745 else {
12746 kind = PyUnicode_KIND(self);
12747 data = PyUnicode_1BYTE_DATA(self);
12748 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012749 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012750 length);
12751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012753
12754static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012755do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 Py_ssize_t len, i, j;
12758
12759 if (PyUnicode_READY(self) == -1)
12760 return NULL;
12761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012762 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012763
Victor Stinnercc7af722013-04-09 22:39:24 +020012764 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012765 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012766
12767 i = 0;
12768 if (striptype != RIGHTSTRIP) {
12769 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012770 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012771 if (!_Py_ascii_whitespace[ch])
12772 break;
12773 i++;
12774 }
12775 }
12776
12777 j = len;
12778 if (striptype != LEFTSTRIP) {
12779 j--;
12780 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012781 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012782 if (!_Py_ascii_whitespace[ch])
12783 break;
12784 j--;
12785 }
12786 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012787 }
12788 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012789 else {
12790 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012791 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012792
Victor Stinnercc7af722013-04-09 22:39:24 +020012793 i = 0;
12794 if (striptype != RIGHTSTRIP) {
12795 while (i < len) {
12796 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12797 if (!Py_UNICODE_ISSPACE(ch))
12798 break;
12799 i++;
12800 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012801 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012802
12803 j = len;
12804 if (striptype != LEFTSTRIP) {
12805 j--;
12806 while (j >= i) {
12807 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12808 if (!Py_UNICODE_ISSPACE(ch))
12809 break;
12810 j--;
12811 }
12812 j++;
12813 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012814 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012815
Victor Stinner7931d9a2011-11-04 00:22:48 +010012816 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817}
12818
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012819
12820static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012821do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012822{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012823 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012824 if (PyUnicode_Check(sep))
12825 return _PyUnicode_XStrip(self, striptype, sep);
12826 else {
12827 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012828 "%s arg must be None or str",
12829 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012830 return NULL;
12831 }
12832 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012833
Benjamin Peterson14339b62009-01-31 16:36:08 +000012834 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012835}
12836
12837
INADA Naoki3ae20562017-01-16 20:41:20 +090012838/*[clinic input]
12839str.strip as unicode_strip
12840
12841 chars: object = None
12842 /
12843
Zachary Ware09895c22019-10-09 16:09:00 -050012844Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012845
12846If chars is given and not None, remove characters in chars instead.
12847[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012848
12849static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012850unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012851/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012852{
INADA Naoki3ae20562017-01-16 20:41:20 +090012853 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012854}
12855
12856
INADA Naoki3ae20562017-01-16 20:41:20 +090012857/*[clinic input]
12858str.lstrip as unicode_lstrip
12859
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012860 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012861 /
12862
12863Return a copy of the string with leading whitespace removed.
12864
12865If chars is given and not None, remove characters in chars instead.
12866[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012867
12868static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012869unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012870/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012871{
INADA Naoki3ae20562017-01-16 20:41:20 +090012872 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012873}
12874
12875
INADA Naoki3ae20562017-01-16 20:41:20 +090012876/*[clinic input]
12877str.rstrip as unicode_rstrip
12878
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012879 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012880 /
12881
12882Return a copy of the string with trailing whitespace removed.
12883
12884If chars is given and not None, remove characters in chars instead.
12885[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012886
12887static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012888unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012889/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012890{
INADA Naoki3ae20562017-01-16 20:41:20 +090012891 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012892}
12893
12894
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012896unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012898 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900
Serhiy Storchaka05997252013-01-26 12:14:02 +020012901 if (len < 1)
12902 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903
Victor Stinnerc4b49542011-12-11 22:44:26 +010012904 /* no repeat, return original string */
12905 if (len == 1)
12906 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012907
Benjamin Petersonbac79492012-01-14 13:34:47 -050012908 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012909 return NULL;
12910
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012911 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012912 PyErr_SetString(PyExc_OverflowError,
12913 "repeated string is too long");
12914 return NULL;
12915 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012917
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012918 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012919 if (!u)
12920 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012921 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012924 int kind = PyUnicode_KIND(str);
12925 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012926 if (kind == PyUnicode_1BYTE_KIND) {
12927 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012928 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012929 }
12930 else if (kind == PyUnicode_2BYTE_KIND) {
12931 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012932 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012933 ucs2[n] = fill_char;
12934 } else {
12935 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12936 assert(kind == PyUnicode_4BYTE_KIND);
12937 for (n = 0; n < len; ++n)
12938 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012939 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012940 }
12941 else {
12942 /* number of characters copied this far */
12943 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012944 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012945 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012946 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012947 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012948 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012950 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012951 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012953 }
12954
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012955 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012956 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012957}
12958
Alexander Belopolsky40018472011-02-26 01:02:56 +000012959PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012960PyUnicode_Replace(PyObject *str,
12961 PyObject *substr,
12962 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012963 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012964{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012965 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12966 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012967 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012968 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012969}
12970
INADA Naoki3ae20562017-01-16 20:41:20 +090012971/*[clinic input]
12972str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012973
INADA Naoki3ae20562017-01-16 20:41:20 +090012974 old: unicode
12975 new: unicode
12976 count: Py_ssize_t = -1
12977 Maximum number of occurrences to replace.
12978 -1 (the default value) means replace all occurrences.
12979 /
12980
12981Return a copy with all occurrences of substring old replaced by new.
12982
12983If the optional argument count is given, only the first count occurrences are
12984replaced.
12985[clinic start generated code]*/
12986
12987static PyObject *
12988unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12989 Py_ssize_t count)
12990/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012991{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012992 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012993 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012994 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012995}
12996
sweeneydea81849b2020-04-22 17:05:48 -040012997/*[clinic input]
12998str.removeprefix as unicode_removeprefix
12999
13000 prefix: unicode
13001 /
13002
13003Return a str with the given prefix string removed if present.
13004
13005If the string starts with the prefix string, return string[len(prefix):].
13006Otherwise, return a copy of the original string.
13007[clinic start generated code]*/
13008
13009static PyObject *
13010unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13011/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13012{
13013 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13014 if (match == -1) {
13015 return NULL;
13016 }
13017 if (match) {
13018 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13019 PyUnicode_GET_LENGTH(self));
13020 }
13021 return unicode_result_unchanged(self);
13022}
13023
13024/*[clinic input]
13025str.removesuffix as unicode_removesuffix
13026
13027 suffix: unicode
13028 /
13029
13030Return a str with the given suffix string removed if present.
13031
13032If the string ends with the suffix string and that suffix is not empty,
13033return string[:-len(suffix)]. Otherwise, return a copy of the original
13034string.
13035[clinic start generated code]*/
13036
13037static PyObject *
13038unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13039/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13040{
13041 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13042 if (match == -1) {
13043 return NULL;
13044 }
13045 if (match) {
13046 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13047 - PyUnicode_GET_LENGTH(suffix));
13048 }
13049 return unicode_result_unchanged(self);
13050}
13051
Alexander Belopolsky40018472011-02-26 01:02:56 +000013052static PyObject *
13053unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013054{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013055 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013056 Py_ssize_t isize;
13057 Py_ssize_t osize, squote, dquote, i, o;
13058 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013059 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013060 const void *idata;
13061 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013064 return NULL;
13065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 isize = PyUnicode_GET_LENGTH(unicode);
13067 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013069 /* Compute length of output, quote characters, and
13070 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013071 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013072 max = 127;
13073 squote = dquote = 0;
13074 ikind = PyUnicode_KIND(unicode);
13075 for (i = 0; i < isize; i++) {
13076 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013077 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013078 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013079 case '\'': squote++; break;
13080 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013082 incr = 2;
13083 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013084 default:
13085 /* Fast-path ASCII */
13086 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013087 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013088 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013089 ;
13090 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013091 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013093 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013094 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013095 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013096 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013097 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013098 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013099 if (osize > PY_SSIZE_T_MAX - incr) {
13100 PyErr_SetString(PyExc_OverflowError,
13101 "string is too long to generate repr");
13102 return NULL;
13103 }
13104 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013105 }
13106
13107 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013108 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013109 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013110 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013111 if (dquote)
13112 /* Both squote and dquote present. Use squote,
13113 and escape them */
13114 osize += squote;
13115 else
13116 quote = '"';
13117 }
Victor Stinner55c08782013-04-14 18:45:39 +020013118 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013119
13120 repr = PyUnicode_New(osize, max);
13121 if (repr == NULL)
13122 return NULL;
13123 okind = PyUnicode_KIND(repr);
13124 odata = PyUnicode_DATA(repr);
13125
13126 PyUnicode_WRITE(okind, odata, 0, quote);
13127 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013128 if (unchanged) {
13129 _PyUnicode_FastCopyCharacters(repr, 1,
13130 unicode, 0,
13131 isize);
13132 }
13133 else {
13134 for (i = 0, o = 1; i < isize; i++) {
13135 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136
Victor Stinner55c08782013-04-14 18:45:39 +020013137 /* Escape quotes and backslashes */
13138 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013139 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013141 continue;
13142 }
13143
13144 /* Map special whitespace to '\t', \n', '\r' */
13145 if (ch == '\t') {
13146 PyUnicode_WRITE(okind, odata, o++, '\\');
13147 PyUnicode_WRITE(okind, odata, o++, 't');
13148 }
13149 else if (ch == '\n') {
13150 PyUnicode_WRITE(okind, odata, o++, '\\');
13151 PyUnicode_WRITE(okind, odata, o++, 'n');
13152 }
13153 else if (ch == '\r') {
13154 PyUnicode_WRITE(okind, odata, o++, '\\');
13155 PyUnicode_WRITE(okind, odata, o++, 'r');
13156 }
13157
13158 /* Map non-printable US ASCII to '\xhh' */
13159 else if (ch < ' ' || ch == 0x7F) {
13160 PyUnicode_WRITE(okind, odata, o++, '\\');
13161 PyUnicode_WRITE(okind, odata, o++, 'x');
13162 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13163 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13164 }
13165
13166 /* Copy ASCII characters as-is */
13167 else if (ch < 0x7F) {
13168 PyUnicode_WRITE(okind, odata, o++, ch);
13169 }
13170
13171 /* Non-ASCII characters */
13172 else {
13173 /* Map Unicode whitespace and control characters
13174 (categories Z* and C* except ASCII space)
13175 */
13176 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13177 PyUnicode_WRITE(okind, odata, o++, '\\');
13178 /* Map 8-bit characters to '\xhh' */
13179 if (ch <= 0xff) {
13180 PyUnicode_WRITE(okind, odata, o++, 'x');
13181 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13182 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13183 }
13184 /* Map 16-bit characters to '\uxxxx' */
13185 else if (ch <= 0xffff) {
13186 PyUnicode_WRITE(okind, odata, o++, 'u');
13187 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13188 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13189 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13190 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13191 }
13192 /* Map 21-bit characters to '\U00xxxxxx' */
13193 else {
13194 PyUnicode_WRITE(okind, odata, o++, 'U');
13195 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13196 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13197 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13198 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13199 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13200 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13201 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13202 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13203 }
13204 }
13205 /* Copy characters as-is */
13206 else {
13207 PyUnicode_WRITE(okind, odata, o++, ch);
13208 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013209 }
13210 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013212 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013213 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013214 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215}
13216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013217PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013218 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219\n\
13220Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013221such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013222arguments start and end are interpreted as in slice notation.\n\
13223\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013224Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013225
13226static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013229 /* initialize variables to prevent gcc warning */
13230 PyObject *substring = NULL;
13231 Py_ssize_t start = 0;
13232 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013233 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013235 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013237
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013238 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013239 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013240
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013241 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013243 if (result == -2)
13244 return NULL;
13245
Christian Heimes217cfd12007-12-02 14:31:20 +000013246 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013247}
13248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013249PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013250 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013252Return the highest index in S where substring sub is found,\n\
13253such that sub is contained within S[start:end]. Optional\n\
13254arguments start and end are interpreted as in slice notation.\n\
13255\n\
13256Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257
13258static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013259unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013261 /* initialize variables to prevent gcc warning */
13262 PyObject *substring = NULL;
13263 Py_ssize_t start = 0;
13264 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013265 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013266
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013267 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013269
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013270 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013271 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013272
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013273 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013275 if (result == -2)
13276 return NULL;
13277
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278 if (result < 0) {
13279 PyErr_SetString(PyExc_ValueError, "substring not found");
13280 return NULL;
13281 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013282
Christian Heimes217cfd12007-12-02 14:31:20 +000013283 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284}
13285
INADA Naoki3ae20562017-01-16 20:41:20 +090013286/*[clinic input]
13287str.rjust as unicode_rjust
13288
13289 width: Py_ssize_t
13290 fillchar: Py_UCS4 = ' '
13291 /
13292
13293Return a right-justified string of length width.
13294
13295Padding is done using the specified fill character (default is a space).
13296[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297
13298static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013299unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13300/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013301{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013302 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303 return NULL;
13304
Victor Stinnerc4b49542011-12-11 22:44:26 +010013305 if (PyUnicode_GET_LENGTH(self) >= width)
13306 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013307
Victor Stinnerc4b49542011-12-11 22:44:26 +010013308 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309}
13310
Alexander Belopolsky40018472011-02-26 01:02:56 +000013311PyObject *
13312PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013314 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013315 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013316
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013317 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318}
13319
INADA Naoki3ae20562017-01-16 20:41:20 +090013320/*[clinic input]
13321str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013322
INADA Naoki3ae20562017-01-16 20:41:20 +090013323 sep: object = None
13324 The delimiter according which to split the string.
13325 None (the default value) means split according to any whitespace,
13326 and discard empty strings from the result.
13327 maxsplit: Py_ssize_t = -1
13328 Maximum number of splits to do.
13329 -1 (the default value) means no limit.
13330
13331Return a list of the words in the string, using sep as the delimiter string.
13332[clinic start generated code]*/
13333
13334static PyObject *
13335unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13336/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013337{
INADA Naoki3ae20562017-01-16 20:41:20 +090013338 if (sep == Py_None)
13339 return split(self, NULL, maxsplit);
13340 if (PyUnicode_Check(sep))
13341 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013342
Victor Stinner998b8062018-09-12 00:23:25 +020013343 PyErr_Format(PyExc_TypeError,
13344 "must be str or None, not %.100s",
13345 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013347}
13348
Thomas Wouters477c8d52006-05-27 19:21:47 +000013349PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013350PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013351{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013352 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013353 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013354 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013356
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013357 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013358 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013359
Victor Stinner14f8f022011-10-05 20:58:25 +020013360 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013361 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013362 len1 = PyUnicode_GET_LENGTH(str_obj);
13363 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013364 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013365 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013366 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013367 }
13368 buf1 = PyUnicode_DATA(str_obj);
13369 buf2 = PyUnicode_DATA(sep_obj);
13370 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013371 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013372 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013373 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013375
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013376 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013377 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013378 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13379 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13380 else
13381 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013382 break;
13383 case PyUnicode_2BYTE_KIND:
13384 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13385 break;
13386 case PyUnicode_4BYTE_KIND:
13387 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13388 break;
13389 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013390 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013391 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013392
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013393 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013394 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013395 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013396
13397 return out;
13398}
13399
13400
13401PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013402PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013403{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013404 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013405 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013406 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013407 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013408
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013409 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013410 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013411
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013412 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013413 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013414 len1 = PyUnicode_GET_LENGTH(str_obj);
13415 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013416 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013417 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013418 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013419 }
13420 buf1 = PyUnicode_DATA(str_obj);
13421 buf2 = PyUnicode_DATA(sep_obj);
13422 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013423 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013424 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013425 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013427
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013428 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013429 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013430 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13431 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13432 else
13433 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013434 break;
13435 case PyUnicode_2BYTE_KIND:
13436 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13437 break;
13438 case PyUnicode_4BYTE_KIND:
13439 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13440 break;
13441 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013442 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013443 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013444
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013445 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013446 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013447 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013448
13449 return out;
13450}
13451
INADA Naoki3ae20562017-01-16 20:41:20 +090013452/*[clinic input]
13453str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013454
INADA Naoki3ae20562017-01-16 20:41:20 +090013455 sep: object
13456 /
13457
13458Partition the string into three parts using the given separator.
13459
13460This will search for the separator in the string. If the separator is found,
13461returns a 3-tuple containing the part before the separator, the separator
13462itself, and the part after it.
13463
13464If the separator is not found, returns a 3-tuple containing the original string
13465and two empty strings.
13466[clinic start generated code]*/
13467
13468static PyObject *
13469unicode_partition(PyObject *self, PyObject *sep)
13470/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013471{
INADA Naoki3ae20562017-01-16 20:41:20 +090013472 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013473}
13474
INADA Naoki3ae20562017-01-16 20:41:20 +090013475/*[clinic input]
13476str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013477
INADA Naoki3ae20562017-01-16 20:41:20 +090013478Partition the string into three parts using the given separator.
13479
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013480This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013481the separator is found, returns a 3-tuple containing the part before the
13482separator, the separator itself, and the part after it.
13483
13484If the separator is not found, returns a 3-tuple containing two empty strings
13485and the original string.
13486[clinic start generated code]*/
13487
13488static PyObject *
13489unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013490/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013491{
INADA Naoki3ae20562017-01-16 20:41:20 +090013492 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013493}
13494
Alexander Belopolsky40018472011-02-26 01:02:56 +000013495PyObject *
13496PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013497{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013498 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013499 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013500
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013501 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013502}
13503
INADA Naoki3ae20562017-01-16 20:41:20 +090013504/*[clinic input]
13505str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013506
INADA Naoki3ae20562017-01-16 20:41:20 +090013507Return a list of the words in the string, using sep as the delimiter string.
13508
13509Splits are done starting at the end of the string and working to the front.
13510[clinic start generated code]*/
13511
13512static PyObject *
13513unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13514/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013515{
INADA Naoki3ae20562017-01-16 20:41:20 +090013516 if (sep == Py_None)
13517 return rsplit(self, NULL, maxsplit);
13518 if (PyUnicode_Check(sep))
13519 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013520
Victor Stinner998b8062018-09-12 00:23:25 +020013521 PyErr_Format(PyExc_TypeError,
13522 "must be str or None, not %.100s",
13523 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013524 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013525}
13526
INADA Naoki3ae20562017-01-16 20:41:20 +090013527/*[clinic input]
13528str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013529
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013530 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013531
13532Return a list of the lines in the string, breaking at line boundaries.
13533
13534Line breaks are not included in the resulting list unless keepends is given and
13535true.
13536[clinic start generated code]*/
13537
13538static PyObject *
13539unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013540/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013541{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013542 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013543}
13544
13545static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013546PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013547{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013548 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013549}
13550
INADA Naoki3ae20562017-01-16 20:41:20 +090013551/*[clinic input]
13552str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013553
INADA Naoki3ae20562017-01-16 20:41:20 +090013554Convert uppercase characters to lowercase and lowercase characters to uppercase.
13555[clinic start generated code]*/
13556
13557static PyObject *
13558unicode_swapcase_impl(PyObject *self)
13559/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013560{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013561 if (PyUnicode_READY(self) == -1)
13562 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013563 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013564}
13565
Larry Hastings61272b72014-01-07 12:41:53 -080013566/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013567
Larry Hastings31826802013-10-19 00:09:25 -070013568@staticmethod
13569str.maketrans as unicode_maketrans
13570
13571 x: object
13572
13573 y: unicode=NULL
13574
13575 z: unicode=NULL
13576
13577 /
13578
13579Return a translation table usable for str.translate().
13580
13581If there is only one argument, it must be a dictionary mapping Unicode
13582ordinals (integers) or characters to Unicode ordinals, strings or None.
13583Character keys will be then converted to ordinals.
13584If there are two arguments, they must be strings of equal length, and
13585in the resulting dictionary, each character in x will be mapped to the
13586character at the same position in y. If there is a third argument, it
13587must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013588[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013589
Larry Hastings31826802013-10-19 00:09:25 -070013590static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013591unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013592/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013593{
Georg Brandlceee0772007-11-27 23:48:05 +000013594 PyObject *new = NULL, *key, *value;
13595 Py_ssize_t i = 0;
13596 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013597
Georg Brandlceee0772007-11-27 23:48:05 +000013598 new = PyDict_New();
13599 if (!new)
13600 return NULL;
13601 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013602 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013603 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013604
Georg Brandlceee0772007-11-27 23:48:05 +000013605 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013606 if (!PyUnicode_Check(x)) {
13607 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13608 "be a string if there is a second argument");
13609 goto err;
13610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013611 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013612 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13613 "arguments must have equal length");
13614 goto err;
13615 }
13616 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617 x_kind = PyUnicode_KIND(x);
13618 y_kind = PyUnicode_KIND(y);
13619 x_data = PyUnicode_DATA(x);
13620 y_data = PyUnicode_DATA(y);
13621 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13622 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013623 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013624 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013625 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013626 if (!value) {
13627 Py_DECREF(key);
13628 goto err;
13629 }
Georg Brandlceee0772007-11-27 23:48:05 +000013630 res = PyDict_SetItem(new, key, value);
13631 Py_DECREF(key);
13632 Py_DECREF(value);
13633 if (res < 0)
13634 goto err;
13635 }
13636 /* create entries for deleting chars in z */
13637 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013638 z_kind = PyUnicode_KIND(z);
13639 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013640 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013641 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013642 if (!key)
13643 goto err;
13644 res = PyDict_SetItem(new, key, Py_None);
13645 Py_DECREF(key);
13646 if (res < 0)
13647 goto err;
13648 }
13649 }
13650 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013651 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013652 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013653
Georg Brandlceee0772007-11-27 23:48:05 +000013654 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013655 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013656 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13657 "to maketrans it must be a dict");
13658 goto err;
13659 }
13660 /* copy entries into the new dict, converting string keys to int keys */
13661 while (PyDict_Next(x, &i, &key, &value)) {
13662 if (PyUnicode_Check(key)) {
13663 /* convert string keys to integer keys */
13664 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013665 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013666 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13667 "table must be of length 1");
13668 goto err;
13669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013670 kind = PyUnicode_KIND(key);
13671 data = PyUnicode_DATA(key);
13672 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013673 if (!newkey)
13674 goto err;
13675 res = PyDict_SetItem(new, newkey, value);
13676 Py_DECREF(newkey);
13677 if (res < 0)
13678 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013679 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013680 /* just keep integer keys */
13681 if (PyDict_SetItem(new, key, value) < 0)
13682 goto err;
13683 } else {
13684 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13685 "be strings or integers");
13686 goto err;
13687 }
13688 }
13689 }
13690 return new;
13691 err:
13692 Py_DECREF(new);
13693 return NULL;
13694}
13695
INADA Naoki3ae20562017-01-16 20:41:20 +090013696/*[clinic input]
13697str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013698
INADA Naoki3ae20562017-01-16 20:41:20 +090013699 table: object
13700 Translation table, which must be a mapping of Unicode ordinals to
13701 Unicode ordinals, strings, or None.
13702 /
13703
13704Replace each character in the string using the given translation table.
13705
13706The table must implement lookup/indexing via __getitem__, for instance a
13707dictionary or list. If this operation raises LookupError, the character is
13708left untouched. Characters mapped to None are deleted.
13709[clinic start generated code]*/
13710
13711static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013712unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013713/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013714{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013715 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013716}
13717
INADA Naoki3ae20562017-01-16 20:41:20 +090013718/*[clinic input]
13719str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013720
INADA Naoki3ae20562017-01-16 20:41:20 +090013721Return a copy of the string converted to uppercase.
13722[clinic start generated code]*/
13723
13724static PyObject *
13725unicode_upper_impl(PyObject *self)
13726/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013727{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013728 if (PyUnicode_READY(self) == -1)
13729 return NULL;
13730 if (PyUnicode_IS_ASCII(self))
13731 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013732 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013733}
13734
INADA Naoki3ae20562017-01-16 20:41:20 +090013735/*[clinic input]
13736str.zfill as unicode_zfill
13737
13738 width: Py_ssize_t
13739 /
13740
13741Pad a numeric string with zeros on the left, to fill a field of the given width.
13742
13743The string is never truncated.
13744[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013745
13746static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013747unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013748/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013749{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013750 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013751 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013752 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013753 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013754 Py_UCS4 chr;
13755
Benjamin Petersonbac79492012-01-14 13:34:47 -050013756 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013758
Victor Stinnerc4b49542011-12-11 22:44:26 +010013759 if (PyUnicode_GET_LENGTH(self) >= width)
13760 return unicode_result_unchanged(self);
13761
13762 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013763
13764 u = pad(self, fill, 0, '0');
13765
Walter Dörwald068325e2002-04-15 13:36:47 +000013766 if (u == NULL)
13767 return NULL;
13768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013769 kind = PyUnicode_KIND(u);
13770 data = PyUnicode_DATA(u);
13771 chr = PyUnicode_READ(kind, data, fill);
13772
13773 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013774 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013775 PyUnicode_WRITE(kind, data, 0, chr);
13776 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013777 }
13778
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013779 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013780 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013781}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013782
#if 0
/* Compiled out by the surrounding #if 0.  When enabled, it returns the
   result of PyUnicode_TransformDecimalAndSpaceToASCII() for `self`. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013791PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013792 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013793\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013794Return True if S starts with the specified prefix, False otherwise.\n\
13795With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013796With optional end, stop comparing S at that position.\n\
13797prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013798
13799static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013800unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013801 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013802{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013803 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013804 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013805 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013806 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013807 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013808
Jesus Ceaac451502011-04-20 17:09:23 +020013809 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013810 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013811 if (PyTuple_Check(subobj)) {
13812 Py_ssize_t i;
13813 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013814 substring = PyTuple_GET_ITEM(subobj, i);
13815 if (!PyUnicode_Check(substring)) {
13816 PyErr_Format(PyExc_TypeError,
13817 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013818 "not %.100s",
13819 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013820 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013821 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013822 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013823 if (result == -1)
13824 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013825 if (result) {
13826 Py_RETURN_TRUE;
13827 }
13828 }
13829 /* nothing matched */
13830 Py_RETURN_FALSE;
13831 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013832 if (!PyUnicode_Check(subobj)) {
13833 PyErr_Format(PyExc_TypeError,
13834 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013835 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013836 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013837 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013838 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013839 if (result == -1)
13840 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013841 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013842}
13843
13844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013845PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013846 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013847\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013848Return True if S ends with the specified suffix, False otherwise.\n\
13849With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013850With optional end, stop comparing S at that position.\n\
13851suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013852
13853static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013854unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013855 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013856{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013857 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013858 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013859 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013860 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013861 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013862
Jesus Ceaac451502011-04-20 17:09:23 +020013863 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013864 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013865 if (PyTuple_Check(subobj)) {
13866 Py_ssize_t i;
13867 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013868 substring = PyTuple_GET_ITEM(subobj, i);
13869 if (!PyUnicode_Check(substring)) {
13870 PyErr_Format(PyExc_TypeError,
13871 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013872 "not %.100s",
13873 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013874 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013875 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013876 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013877 if (result == -1)
13878 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013879 if (result) {
13880 Py_RETURN_TRUE;
13881 }
13882 }
13883 Py_RETURN_FALSE;
13884 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013885 if (!PyUnicode_Check(subobj)) {
13886 PyErr_Format(PyExc_TypeError,
13887 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013888 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013889 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013890 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013891 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013892 if (result == -1)
13893 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013894 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013895}
13896
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013897static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013898_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013899{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013900 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13901 writer->data = PyUnicode_DATA(writer->buffer);
13902
13903 if (!writer->readonly) {
13904 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013905 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013906 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013907 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013908 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13909 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13910 writer->kind = PyUnicode_WCHAR_KIND;
13911 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13912
Victor Stinner8f674cc2013-04-17 23:02:17 +020013913 /* Copy-on-write mode: set buffer size to 0 so
13914 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13915 * next write. */
13916 writer->size = 0;
13917 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013918}
13919
Victor Stinnerd3f08822012-05-29 12:57:52 +020013920void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013921_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013922{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013923 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013924
13925 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013926 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013927
13928 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13929 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13930 writer->kind = PyUnicode_WCHAR_KIND;
13931 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013932}
13933
Inada Naoki770847a2019-06-24 12:30:24 +090013934// Initialize _PyUnicodeWriter with initial buffer
13935static inline void
13936_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13937{
13938 memset(writer, 0, sizeof(*writer));
13939 writer->buffer = buffer;
13940 _PyUnicodeWriter_Update(writer);
13941 writer->min_length = writer->size;
13942}
13943
Victor Stinnerd3f08822012-05-29 12:57:52 +020013944int
13945_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13946 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013947{
13948 Py_ssize_t newlen;
13949 PyObject *newbuffer;
13950
Victor Stinner2740e462016-09-06 16:58:36 -070013951 assert(maxchar <= MAX_UNICODE);
13952
Victor Stinnerca9381e2015-09-22 00:58:32 +020013953 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013954 assert((maxchar > writer->maxchar && length >= 0)
13955 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013956
Victor Stinner202fdca2012-05-07 12:47:02 +020013957 if (length > PY_SSIZE_T_MAX - writer->pos) {
13958 PyErr_NoMemory();
13959 return -1;
13960 }
13961 newlen = writer->pos + length;
13962
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013963 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013964
Victor Stinnerd3f08822012-05-29 12:57:52 +020013965 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013966 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013967 if (writer->overallocate
13968 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13969 /* overallocate to limit the number of realloc() */
13970 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013971 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013972 if (newlen < writer->min_length)
13973 newlen = writer->min_length;
13974
Victor Stinnerd3f08822012-05-29 12:57:52 +020013975 writer->buffer = PyUnicode_New(newlen, maxchar);
13976 if (writer->buffer == NULL)
13977 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013978 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013979 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013980 if (writer->overallocate
13981 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13982 /* overallocate to limit the number of realloc() */
13983 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013984 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013985 if (newlen < writer->min_length)
13986 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013987
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013988 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013989 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013990 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013991 newbuffer = PyUnicode_New(newlen, maxchar);
13992 if (newbuffer == NULL)
13993 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013994 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13995 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013996 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013997 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013998 }
13999 else {
14000 newbuffer = resize_compact(writer->buffer, newlen);
14001 if (newbuffer == NULL)
14002 return -1;
14003 }
14004 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020014005 }
14006 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014007 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014008 newbuffer = PyUnicode_New(writer->size, maxchar);
14009 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014010 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014011 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14012 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014013 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014014 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014015 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014016 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014017
14018#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014019}
14020
Victor Stinnerca9381e2015-09-22 00:58:32 +020014021int
14022_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14023 enum PyUnicode_Kind kind)
14024{
14025 Py_UCS4 maxchar;
14026
14027 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14028 assert(writer->kind < kind);
14029
14030 switch (kind)
14031 {
14032 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14033 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14034 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
14035 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014036 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014037 }
14038
14039 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14040}
14041
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014042static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014043_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014044{
Victor Stinner2740e462016-09-06 16:58:36 -070014045 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014046 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14047 return -1;
14048 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14049 writer->pos++;
14050 return 0;
14051}
14052
14053int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014054_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14055{
14056 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14057}
14058
14059int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014060_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14061{
14062 Py_UCS4 maxchar;
14063 Py_ssize_t len;
14064
14065 if (PyUnicode_READY(str) == -1)
14066 return -1;
14067 len = PyUnicode_GET_LENGTH(str);
14068 if (len == 0)
14069 return 0;
14070 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14071 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014072 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014073 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014074 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014075 Py_INCREF(str);
14076 writer->buffer = str;
14077 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014078 writer->pos += len;
14079 return 0;
14080 }
14081 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14082 return -1;
14083 }
14084 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14085 str, 0, len);
14086 writer->pos += len;
14087 return 0;
14088}
14089
Victor Stinnere215d962012-10-06 23:03:36 +020014090int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014091_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14092 Py_ssize_t start, Py_ssize_t end)
14093{
14094 Py_UCS4 maxchar;
14095 Py_ssize_t len;
14096
14097 if (PyUnicode_READY(str) == -1)
14098 return -1;
14099
14100 assert(0 <= start);
14101 assert(end <= PyUnicode_GET_LENGTH(str));
14102 assert(start <= end);
14103
14104 if (end == 0)
14105 return 0;
14106
14107 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14108 return _PyUnicodeWriter_WriteStr(writer, str);
14109
14110 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14111 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14112 else
14113 maxchar = writer->maxchar;
14114 len = end - start;
14115
14116 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14117 return -1;
14118
14119 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14120 str, start, len);
14121 writer->pos += len;
14122 return 0;
14123}
14124
14125int
Victor Stinner4a587072013-11-19 12:54:53 +010014126_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14127 const char *ascii, Py_ssize_t len)
14128{
14129 if (len == -1)
14130 len = strlen(ascii);
14131
Andy Lestere6be9b52020-02-11 20:28:35 -060014132 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014133
14134 if (writer->buffer == NULL && !writer->overallocate) {
14135 PyObject *str;
14136
14137 str = _PyUnicode_FromASCII(ascii, len);
14138 if (str == NULL)
14139 return -1;
14140
14141 writer->readonly = 1;
14142 writer->buffer = str;
14143 _PyUnicodeWriter_Update(writer);
14144 writer->pos += len;
14145 return 0;
14146 }
14147
14148 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14149 return -1;
14150
14151 switch (writer->kind)
14152 {
14153 case PyUnicode_1BYTE_KIND:
14154 {
14155 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14156 Py_UCS1 *data = writer->data;
14157
Christian Heimesf051e432016-09-13 20:22:02 +020014158 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014159 break;
14160 }
14161 case PyUnicode_2BYTE_KIND:
14162 {
14163 _PyUnicode_CONVERT_BYTES(
14164 Py_UCS1, Py_UCS2,
14165 ascii, ascii + len,
14166 (Py_UCS2 *)writer->data + writer->pos);
14167 break;
14168 }
14169 case PyUnicode_4BYTE_KIND:
14170 {
14171 _PyUnicode_CONVERT_BYTES(
14172 Py_UCS1, Py_UCS4,
14173 ascii, ascii + len,
14174 (Py_UCS4 *)writer->data + writer->pos);
14175 break;
14176 }
14177 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014178 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014179 }
14180
14181 writer->pos += len;
14182 return 0;
14183}
14184
14185int
14186_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14187 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014188{
14189 Py_UCS4 maxchar;
14190
Andy Lestere6be9b52020-02-11 20:28:35 -060014191 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014192 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14193 return -1;
14194 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14195 writer->pos += len;
14196 return 0;
14197}
14198
Victor Stinnerd3f08822012-05-29 12:57:52 +020014199PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014200_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014201{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014202 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014203
Victor Stinnerd3f08822012-05-29 12:57:52 +020014204 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014205 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014206 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014207 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014208
14209 str = writer->buffer;
14210 writer->buffer = NULL;
14211
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014212 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014213 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14214 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014215 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014216
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014217 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14218 PyObject *str2;
14219 str2 = resize_compact(str, writer->pos);
14220 if (str2 == NULL) {
14221 Py_DECREF(str);
14222 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014223 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014224 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014225 }
14226
Victor Stinner15a0bd32013-07-08 22:29:55 +020014227 assert(_PyUnicode_CheckConsistency(str, 1));
14228 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014229}
14230
Victor Stinnerd3f08822012-05-29 12:57:52 +020014231void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014232_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014233{
14234 Py_CLEAR(writer->buffer);
14235}
14236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014237#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014238
14239PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014240 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014241\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014242Return a formatted version of S, using substitutions from args and kwargs.\n\
14243The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014244
Eric Smith27bbca62010-11-04 17:06:58 +000014245PyDoc_STRVAR(format_map__doc__,
14246 "S.format_map(mapping) -> str\n\
14247\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014248Return a formatted version of S, using substitutions from mapping.\n\
14249The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014250
INADA Naoki3ae20562017-01-16 20:41:20 +090014251/*[clinic input]
14252str.__format__ as unicode___format__
14253
14254 format_spec: unicode
14255 /
14256
14257Return a formatted version of the string as described by format_spec.
14258[clinic start generated code]*/
14259
Eric Smith4a7d76d2008-05-30 18:10:19 +000014260static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014261unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014262/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014263{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014264 _PyUnicodeWriter writer;
14265 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014266
Victor Stinnerd3f08822012-05-29 12:57:52 +020014267 if (PyUnicode_READY(self) == -1)
14268 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014269 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014270 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14271 self, format_spec, 0,
14272 PyUnicode_GET_LENGTH(format_spec));
14273 if (ret == -1) {
14274 _PyUnicodeWriter_Dealloc(&writer);
14275 return NULL;
14276 }
14277 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014278}
14279
INADA Naoki3ae20562017-01-16 20:41:20 +090014280/*[clinic input]
14281str.__sizeof__ as unicode_sizeof
14282
14283Return the size of the string in memory, in bytes.
14284[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014285
14286static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014287unicode_sizeof_impl(PyObject *self)
14288/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014289{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014290 Py_ssize_t size;
14291
14292 /* If it's a compact object, account for base structure +
14293 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014294 if (PyUnicode_IS_COMPACT_ASCII(self))
14295 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14296 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014297 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014298 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014299 else {
14300 /* If it is a two-block object, account for base object, and
14301 for character block if present. */
14302 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014303 if (_PyUnicode_DATA_ANY(self))
14304 size += (PyUnicode_GET_LENGTH(self) + 1) *
14305 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014306 }
14307 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014308 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014309 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14310 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14311 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14312 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014313
14314 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014315}
14316
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014317static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014318unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014319{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014320 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014321 if (!copy)
14322 return NULL;
14323 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014324}
14325
Guido van Rossumd57fd912000-03-10 22:53:23 +000014326static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014327 UNICODE_ENCODE_METHODDEF
14328 UNICODE_REPLACE_METHODDEF
14329 UNICODE_SPLIT_METHODDEF
14330 UNICODE_RSPLIT_METHODDEF
14331 UNICODE_JOIN_METHODDEF
14332 UNICODE_CAPITALIZE_METHODDEF
14333 UNICODE_CASEFOLD_METHODDEF
14334 UNICODE_TITLE_METHODDEF
14335 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014336 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014337 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014338 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014339 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014340 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014341 UNICODE_LJUST_METHODDEF
14342 UNICODE_LOWER_METHODDEF
14343 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014344 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14345 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014346 UNICODE_RJUST_METHODDEF
14347 UNICODE_RSTRIP_METHODDEF
14348 UNICODE_RPARTITION_METHODDEF
14349 UNICODE_SPLITLINES_METHODDEF
14350 UNICODE_STRIP_METHODDEF
14351 UNICODE_SWAPCASE_METHODDEF
14352 UNICODE_TRANSLATE_METHODDEF
14353 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014354 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14355 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014356 UNICODE_REMOVEPREFIX_METHODDEF
14357 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014358 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014359 UNICODE_ISLOWER_METHODDEF
14360 UNICODE_ISUPPER_METHODDEF
14361 UNICODE_ISTITLE_METHODDEF
14362 UNICODE_ISSPACE_METHODDEF
14363 UNICODE_ISDECIMAL_METHODDEF
14364 UNICODE_ISDIGIT_METHODDEF
14365 UNICODE_ISNUMERIC_METHODDEF
14366 UNICODE_ISALPHA_METHODDEF
14367 UNICODE_ISALNUM_METHODDEF
14368 UNICODE_ISIDENTIFIER_METHODDEF
14369 UNICODE_ISPRINTABLE_METHODDEF
14370 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014371 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014372 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014373 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014374 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014375 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014376#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014377 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014378 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014379#endif
14380
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014381 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014382 {NULL, NULL}
14383};
14384
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014385static PyObject *
14386unicode_mod(PyObject *v, PyObject *w)
14387{
Brian Curtindfc80e32011-08-10 20:28:54 -050014388 if (!PyUnicode_Check(v))
14389 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014390 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014391}
14392
14393static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014394 0, /*nb_add*/
14395 0, /*nb_subtract*/
14396 0, /*nb_multiply*/
14397 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014398};
14399
Guido van Rossumd57fd912000-03-10 22:53:23 +000014400static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014401 (lenfunc) unicode_length, /* sq_length */
14402 PyUnicode_Concat, /* sq_concat */
14403 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14404 (ssizeargfunc) unicode_getitem, /* sq_item */
14405 0, /* sq_slice */
14406 0, /* sq_ass_item */
14407 0, /* sq_ass_slice */
14408 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014409};
14410
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014411static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014412unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014413{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014414 if (PyUnicode_READY(self) == -1)
14415 return NULL;
14416
Victor Stinnera15e2602020-04-08 02:01:56 +020014417 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014418 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014419 if (i == -1 && PyErr_Occurred())
14420 return NULL;
14421 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014422 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014423 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014424 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014425 Py_ssize_t start, stop, step, slicelength, i;
14426 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014427 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014428 const void *src_data;
14429 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014430 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014431 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014432
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014433 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014434 return NULL;
14435 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014436 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14437 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014438
14439 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014440 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014441 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014442 slicelength == PyUnicode_GET_LENGTH(self)) {
14443 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014444 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014445 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014446 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014447 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014448 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014449 src_kind = PyUnicode_KIND(self);
14450 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014451 if (!PyUnicode_IS_ASCII(self)) {
14452 kind_limit = kind_maxchar_limit(src_kind);
14453 max_char = 0;
14454 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14455 ch = PyUnicode_READ(src_kind, src_data, cur);
14456 if (ch > max_char) {
14457 max_char = ch;
14458 if (max_char >= kind_limit)
14459 break;
14460 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014461 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014462 }
Victor Stinner55c99112011-10-13 01:17:06 +020014463 else
14464 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014465 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014466 if (result == NULL)
14467 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014468 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014469 dest_data = PyUnicode_DATA(result);
14470
14471 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014472 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14473 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014474 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014475 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014476 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014477 } else {
14478 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14479 return NULL;
14480 }
14481}
14482
14483static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014484 (lenfunc)unicode_length, /* mp_length */
14485 (binaryfunc)unicode_subscript, /* mp_subscript */
14486 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014487};
14488
Guido van Rossumd57fd912000-03-10 22:53:23 +000014489
Guido van Rossumd57fd912000-03-10 22:53:23 +000014490/* Helpers for PyUnicode_Format() */
14491
Victor Stinnera47082312012-10-04 02:19:54 +020014492struct unicode_formatter_t {
14493 PyObject *args;
14494 int args_owned;
14495 Py_ssize_t arglen, argidx;
14496 PyObject *dict;
14497
14498 enum PyUnicode_Kind fmtkind;
14499 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014500 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014501 PyObject *fmtstr;
14502
14503 _PyUnicodeWriter writer;
14504};
14505
14506struct unicode_format_arg_t {
14507 Py_UCS4 ch;
14508 int flags;
14509 Py_ssize_t width;
14510 int prec;
14511 int sign;
14512};
14513
Guido van Rossumd57fd912000-03-10 22:53:23 +000014514static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014515unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014516{
Victor Stinnera47082312012-10-04 02:19:54 +020014517 Py_ssize_t argidx = ctx->argidx;
14518
14519 if (argidx < ctx->arglen) {
14520 ctx->argidx++;
14521 if (ctx->arglen < 0)
14522 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014523 else
Victor Stinnera47082312012-10-04 02:19:54 +020014524 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014525 }
14526 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014527 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014528 return NULL;
14529}
14530
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014531/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014532
Victor Stinnera47082312012-10-04 02:19:54 +020014533/* Format a float into the writer if the writer is not NULL, or into *p_output
14534 otherwise.
14535
14536 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014537static int
Victor Stinnera47082312012-10-04 02:19:54 +020014538formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14539 PyObject **p_output,
14540 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014541{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014542 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014543 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014544 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014545 int prec;
14546 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014547
Guido van Rossumd57fd912000-03-10 22:53:23 +000014548 x = PyFloat_AsDouble(v);
14549 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014550 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014551
Victor Stinnera47082312012-10-04 02:19:54 +020014552 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014553 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014554 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014555
Victor Stinnera47082312012-10-04 02:19:54 +020014556 if (arg->flags & F_ALT)
14557 dtoa_flags = Py_DTSF_ALT;
14558 else
14559 dtoa_flags = 0;
14560 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014561 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014562 return -1;
14563 len = strlen(p);
14564 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014565 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014566 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014567 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014568 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014569 }
14570 else
14571 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014572 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014573 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014574}
14575
Victor Stinnerd0880d52012-04-27 23:40:13 +020014576/* formatlong() emulates the format codes d, u, o, x and X, and
14577 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14578 * Python's regular ints.
14579 * Return value: a new PyUnicodeObject*, or NULL if error.
14580 * The output string is of the form
14581 * "-"? ("0x" | "0X")? digit+
14582 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14583 * set in flags. The case of hex digits will be correct,
14584 * There will be at least prec digits, zero-filled on the left if
14585 * necessary to get that many.
14586 * val object to be converted
14587 * flags bitmask of format flags; only F_ALT is looked at
14588 * prec minimum number of digits; 0-fill on left if needed
14589 * type a character in [duoxX]; u acts the same as d
14590 *
14591 * CAUTION: o, x and X conversions on regular ints can never
14592 * produce a '-' sign, but can for Python's unbounded ints.
14593 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014594PyObject *
14595_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014596{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014597 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014598 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014599 Py_ssize_t i;
14600 int sign; /* 1 if '-', else 0 */
14601 int len; /* number of characters */
14602 Py_ssize_t llen;
14603 int numdigits; /* len == numnondigits + numdigits */
14604 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014605
Victor Stinnerd0880d52012-04-27 23:40:13 +020014606 /* Avoid exceeding SSIZE_T_MAX */
14607 if (prec > INT_MAX-3) {
14608 PyErr_SetString(PyExc_OverflowError,
14609 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014610 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014611 }
14612
14613 assert(PyLong_Check(val));
14614
14615 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014616 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014617 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014618 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014619 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014620 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014621 /* int and int subclasses should print numerically when a numeric */
14622 /* format code is used (see issue18780) */
14623 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014624 break;
14625 case 'o':
14626 numnondigits = 2;
14627 result = PyNumber_ToBase(val, 8);
14628 break;
14629 case 'x':
14630 case 'X':
14631 numnondigits = 2;
14632 result = PyNumber_ToBase(val, 16);
14633 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014634 }
14635 if (!result)
14636 return NULL;
14637
14638 assert(unicode_modifiable(result));
14639 assert(PyUnicode_IS_READY(result));
14640 assert(PyUnicode_IS_ASCII(result));
14641
14642 /* To modify the string in-place, there can only be one reference. */
14643 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014644 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014645 PyErr_BadInternalCall();
14646 return NULL;
14647 }
14648 buf = PyUnicode_DATA(result);
14649 llen = PyUnicode_GET_LENGTH(result);
14650 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014651 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014652 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014653 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014654 return NULL;
14655 }
14656 len = (int)llen;
14657 sign = buf[0] == '-';
14658 numnondigits += sign;
14659 numdigits = len - numnondigits;
14660 assert(numdigits > 0);
14661
14662 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014663 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014664 (type == 'o' || type == 'x' || type == 'X'))) {
14665 assert(buf[sign] == '0');
14666 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14667 buf[sign+1] == 'o');
14668 numnondigits -= 2;
14669 buf += 2;
14670 len -= 2;
14671 if (sign)
14672 buf[0] = '-';
14673 assert(len == numnondigits + numdigits);
14674 assert(numdigits > 0);
14675 }
14676
14677 /* Fill with leading zeroes to meet minimum width. */
14678 if (prec > numdigits) {
14679 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14680 numnondigits + prec);
14681 char *b1;
14682 if (!r1) {
14683 Py_DECREF(result);
14684 return NULL;
14685 }
14686 b1 = PyBytes_AS_STRING(r1);
14687 for (i = 0; i < numnondigits; ++i)
14688 *b1++ = *buf++;
14689 for (i = 0; i < prec - numdigits; i++)
14690 *b1++ = '0';
14691 for (i = 0; i < numdigits; i++)
14692 *b1++ = *buf++;
14693 *b1 = '\0';
14694 Py_DECREF(result);
14695 result = r1;
14696 buf = PyBytes_AS_STRING(result);
14697 len = numnondigits + prec;
14698 }
14699
14700 /* Fix up case for hex conversions. */
14701 if (type == 'X') {
14702 /* Need to convert all lower case letters to upper case.
14703 and need to convert 0x to 0X (and -0x to -0X). */
14704 for (i = 0; i < len; i++)
14705 if (buf[i] >= 'a' && buf[i] <= 'x')
14706 buf[i] -= 'a'-'A';
14707 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014708 if (!PyUnicode_Check(result)
14709 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014710 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014711 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014712 Py_DECREF(result);
14713 result = unicode;
14714 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014715 else if (len != PyUnicode_GET_LENGTH(result)) {
14716 if (PyUnicode_Resize(&result, len) < 0)
14717 Py_CLEAR(result);
14718 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014719 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014720}
14721
Ethan Furmandf3ed242014-01-05 06:50:30 -080014722/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014723 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014724 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014725 * -1 and raise an exception on error */
14726static int
Victor Stinnera47082312012-10-04 02:19:54 +020014727mainformatlong(PyObject *v,
14728 struct unicode_format_arg_t *arg,
14729 PyObject **p_output,
14730 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014731{
14732 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014733 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014734
14735 if (!PyNumber_Check(v))
14736 goto wrongtype;
14737
Ethan Furman9ab74802014-03-21 06:38:46 -070014738 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014739 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014740 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014741 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014742 }
14743 else {
14744 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014745 }
14746 if (iobj == NULL ) {
14747 if (PyErr_ExceptionMatches(PyExc_TypeError))
14748 goto wrongtype;
14749 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014750 }
14751 assert(PyLong_Check(iobj));
14752 }
14753 else {
14754 iobj = v;
14755 Py_INCREF(iobj);
14756 }
14757
14758 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014759 && arg->width == -1 && arg->prec == -1
14760 && !(arg->flags & (F_SIGN | F_BLANK))
14761 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014762 {
14763 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014764 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014765 int base;
14766
Victor Stinnera47082312012-10-04 02:19:54 +020014767 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014768 {
14769 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014770 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014771 case 'd':
14772 case 'i':
14773 case 'u':
14774 base = 10;
14775 break;
14776 case 'o':
14777 base = 8;
14778 break;
14779 case 'x':
14780 case 'X':
14781 base = 16;
14782 break;
14783 }
14784
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014785 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14786 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014787 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014788 }
14789 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014790 return 1;
14791 }
14792
Ethan Furmanb95b5612015-01-23 20:05:18 -080014793 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014794 Py_DECREF(iobj);
14795 if (res == NULL)
14796 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014797 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014798 return 0;
14799
14800wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014801 switch(type)
14802 {
14803 case 'o':
14804 case 'x':
14805 case 'X':
14806 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014807 "%%%c format: an integer is required, "
14808 "not %.200s",
14809 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014810 break;
14811 default:
14812 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014813 "%%%c format: a number is required, "
14814 "not %.200s",
14815 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014816 break;
14817 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014818 return -1;
14819}
14820
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014821static Py_UCS4
14822formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014823{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014824 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014825 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014826 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014827 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014828 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014829 goto onError;
14830 }
14831 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014832 int overflow;
14833 long x = PyLong_AsLongAndOverflow(v, &overflow);
14834 if (x == -1 && PyErr_Occurred()) {
14835 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014836 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014837 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014838 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014839 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014840
Victor Stinner8faf8212011-12-08 22:14:11 +010014841 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014842 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014843 PyErr_SetString(PyExc_OverflowError,
14844 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014845 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014846 }
14847
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014848 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014849 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014850
Benjamin Peterson29060642009-01-31 22:14:21 +000014851 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014852 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014853 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014854 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014855}
14856
Victor Stinnera47082312012-10-04 02:19:54 +020014857/* Parse options of an argument: flags, width, precision.
14858 Handle also "%(name)" syntax.
14859
14860 Return 0 if the argument has been formatted into arg->str.
14861 Return 1 if the argument has been written into ctx->writer,
14862 Raise an exception and return -1 on error. */
14863static int
14864unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14865 struct unicode_format_arg_t *arg)
14866{
14867#define FORMAT_READ(ctx) \
14868 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14869
14870 PyObject *v;
14871
Victor Stinnera47082312012-10-04 02:19:54 +020014872 if (arg->ch == '(') {
14873 /* Get argument value from a dictionary. Example: "%(name)s". */
14874 Py_ssize_t keystart;
14875 Py_ssize_t keylen;
14876 PyObject *key;
14877 int pcount = 1;
14878
14879 if (ctx->dict == NULL) {
14880 PyErr_SetString(PyExc_TypeError,
14881 "format requires a mapping");
14882 return -1;
14883 }
14884 ++ctx->fmtpos;
14885 --ctx->fmtcnt;
14886 keystart = ctx->fmtpos;
14887 /* Skip over balanced parentheses */
14888 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14889 arg->ch = FORMAT_READ(ctx);
14890 if (arg->ch == ')')
14891 --pcount;
14892 else if (arg->ch == '(')
14893 ++pcount;
14894 ctx->fmtpos++;
14895 }
14896 keylen = ctx->fmtpos - keystart - 1;
14897 if (ctx->fmtcnt < 0 || pcount > 0) {
14898 PyErr_SetString(PyExc_ValueError,
14899 "incomplete format key");
14900 return -1;
14901 }
14902 key = PyUnicode_Substring(ctx->fmtstr,
14903 keystart, keystart + keylen);
14904 if (key == NULL)
14905 return -1;
14906 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014907 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014908 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014909 }
14910 ctx->args = PyObject_GetItem(ctx->dict, key);
14911 Py_DECREF(key);
14912 if (ctx->args == NULL)
14913 return -1;
14914 ctx->args_owned = 1;
14915 ctx->arglen = -1;
14916 ctx->argidx = -2;
14917 }
14918
14919 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014920 while (--ctx->fmtcnt >= 0) {
14921 arg->ch = FORMAT_READ(ctx);
14922 ctx->fmtpos++;
14923 switch (arg->ch) {
14924 case '-': arg->flags |= F_LJUST; continue;
14925 case '+': arg->flags |= F_SIGN; continue;
14926 case ' ': arg->flags |= F_BLANK; continue;
14927 case '#': arg->flags |= F_ALT; continue;
14928 case '0': arg->flags |= F_ZERO; continue;
14929 }
14930 break;
14931 }
14932
14933 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014934 if (arg->ch == '*') {
14935 v = unicode_format_getnextarg(ctx);
14936 if (v == NULL)
14937 return -1;
14938 if (!PyLong_Check(v)) {
14939 PyErr_SetString(PyExc_TypeError,
14940 "* wants int");
14941 return -1;
14942 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014943 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014944 if (arg->width == -1 && PyErr_Occurred())
14945 return -1;
14946 if (arg->width < 0) {
14947 arg->flags |= F_LJUST;
14948 arg->width = -arg->width;
14949 }
14950 if (--ctx->fmtcnt >= 0) {
14951 arg->ch = FORMAT_READ(ctx);
14952 ctx->fmtpos++;
14953 }
14954 }
14955 else if (arg->ch >= '0' && arg->ch <= '9') {
14956 arg->width = arg->ch - '0';
14957 while (--ctx->fmtcnt >= 0) {
14958 arg->ch = FORMAT_READ(ctx);
14959 ctx->fmtpos++;
14960 if (arg->ch < '0' || arg->ch > '9')
14961 break;
14962 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14963 mixing signed and unsigned comparison. Since arg->ch is between
14964 '0' and '9', casting to int is safe. */
14965 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14966 PyErr_SetString(PyExc_ValueError,
14967 "width too big");
14968 return -1;
14969 }
14970 arg->width = arg->width*10 + (arg->ch - '0');
14971 }
14972 }
14973
14974 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014975 if (arg->ch == '.') {
14976 arg->prec = 0;
14977 if (--ctx->fmtcnt >= 0) {
14978 arg->ch = FORMAT_READ(ctx);
14979 ctx->fmtpos++;
14980 }
14981 if (arg->ch == '*') {
14982 v = unicode_format_getnextarg(ctx);
14983 if (v == NULL)
14984 return -1;
14985 if (!PyLong_Check(v)) {
14986 PyErr_SetString(PyExc_TypeError,
14987 "* wants int");
14988 return -1;
14989 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014990 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014991 if (arg->prec == -1 && PyErr_Occurred())
14992 return -1;
14993 if (arg->prec < 0)
14994 arg->prec = 0;
14995 if (--ctx->fmtcnt >= 0) {
14996 arg->ch = FORMAT_READ(ctx);
14997 ctx->fmtpos++;
14998 }
14999 }
15000 else if (arg->ch >= '0' && arg->ch <= '9') {
15001 arg->prec = arg->ch - '0';
15002 while (--ctx->fmtcnt >= 0) {
15003 arg->ch = FORMAT_READ(ctx);
15004 ctx->fmtpos++;
15005 if (arg->ch < '0' || arg->ch > '9')
15006 break;
15007 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15008 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015009 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015010 return -1;
15011 }
15012 arg->prec = arg->prec*10 + (arg->ch - '0');
15013 }
15014 }
15015 }
15016
15017 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15018 if (ctx->fmtcnt >= 0) {
15019 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15020 if (--ctx->fmtcnt >= 0) {
15021 arg->ch = FORMAT_READ(ctx);
15022 ctx->fmtpos++;
15023 }
15024 }
15025 }
15026 if (ctx->fmtcnt < 0) {
15027 PyErr_SetString(PyExc_ValueError,
15028 "incomplete format");
15029 return -1;
15030 }
15031 return 0;
15032
15033#undef FORMAT_READ
15034}
15035
15036/* Format one argument. Supported conversion specifiers:
15037
15038 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015039 - "i", "d", "u": int or float
15040 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015041 - "e", "E", "f", "F", "g", "G": float
15042 - "c": int or str (1 character)
15043
Victor Stinner8dbd4212012-12-04 09:30:24 +010015044 When possible, the output is written directly into the Unicode writer
15045 (ctx->writer). A string is created when padding is required.
15046
Victor Stinnera47082312012-10-04 02:19:54 +020015047 Return 0 if the argument has been formatted into *p_str,
15048 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015049 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015050static int
15051unicode_format_arg_format(struct unicode_formatter_t *ctx,
15052 struct unicode_format_arg_t *arg,
15053 PyObject **p_str)
15054{
15055 PyObject *v;
15056 _PyUnicodeWriter *writer = &ctx->writer;
15057
15058 if (ctx->fmtcnt == 0)
15059 ctx->writer.overallocate = 0;
15060
Victor Stinnera47082312012-10-04 02:19:54 +020015061 v = unicode_format_getnextarg(ctx);
15062 if (v == NULL)
15063 return -1;
15064
Victor Stinnera47082312012-10-04 02:19:54 +020015065
15066 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015067 case 's':
15068 case 'r':
15069 case 'a':
15070 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15071 /* Fast path */
15072 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15073 return -1;
15074 return 1;
15075 }
15076
15077 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15078 *p_str = v;
15079 Py_INCREF(*p_str);
15080 }
15081 else {
15082 if (arg->ch == 's')
15083 *p_str = PyObject_Str(v);
15084 else if (arg->ch == 'r')
15085 *p_str = PyObject_Repr(v);
15086 else
15087 *p_str = PyObject_ASCII(v);
15088 }
15089 break;
15090
15091 case 'i':
15092 case 'd':
15093 case 'u':
15094 case 'o':
15095 case 'x':
15096 case 'X':
15097 {
15098 int ret = mainformatlong(v, arg, p_str, writer);
15099 if (ret != 0)
15100 return ret;
15101 arg->sign = 1;
15102 break;
15103 }
15104
15105 case 'e':
15106 case 'E':
15107 case 'f':
15108 case 'F':
15109 case 'g':
15110 case 'G':
15111 if (arg->width == -1 && arg->prec == -1
15112 && !(arg->flags & (F_SIGN | F_BLANK)))
15113 {
15114 /* Fast path */
15115 if (formatfloat(v, arg, NULL, writer) == -1)
15116 return -1;
15117 return 1;
15118 }
15119
15120 arg->sign = 1;
15121 if (formatfloat(v, arg, p_str, NULL) == -1)
15122 return -1;
15123 break;
15124
15125 case 'c':
15126 {
15127 Py_UCS4 ch = formatchar(v);
15128 if (ch == (Py_UCS4) -1)
15129 return -1;
15130 if (arg->width == -1 && arg->prec == -1) {
15131 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015132 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015133 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015134 return 1;
15135 }
15136 *p_str = PyUnicode_FromOrdinal(ch);
15137 break;
15138 }
15139
15140 default:
15141 PyErr_Format(PyExc_ValueError,
15142 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015143 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015144 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15145 (int)arg->ch,
15146 ctx->fmtpos - 1);
15147 return -1;
15148 }
15149 if (*p_str == NULL)
15150 return -1;
15151 assert (PyUnicode_Check(*p_str));
15152 return 0;
15153}
15154
15155static int
15156unicode_format_arg_output(struct unicode_formatter_t *ctx,
15157 struct unicode_format_arg_t *arg,
15158 PyObject *str)
15159{
15160 Py_ssize_t len;
15161 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015162 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015163 Py_ssize_t pindex;
15164 Py_UCS4 signchar;
15165 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015166 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015167 Py_ssize_t sublen;
15168 _PyUnicodeWriter *writer = &ctx->writer;
15169 Py_UCS4 fill;
15170
15171 fill = ' ';
15172 if (arg->sign && arg->flags & F_ZERO)
15173 fill = '0';
15174
15175 if (PyUnicode_READY(str) == -1)
15176 return -1;
15177
15178 len = PyUnicode_GET_LENGTH(str);
15179 if ((arg->width == -1 || arg->width <= len)
15180 && (arg->prec == -1 || arg->prec >= len)
15181 && !(arg->flags & (F_SIGN | F_BLANK)))
15182 {
15183 /* Fast path */
15184 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15185 return -1;
15186 return 0;
15187 }
15188
15189 /* Truncate the string for "s", "r" and "a" formats
15190 if the precision is set */
15191 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15192 if (arg->prec >= 0 && len > arg->prec)
15193 len = arg->prec;
15194 }
15195
15196 /* Adjust sign and width */
15197 kind = PyUnicode_KIND(str);
15198 pbuf = PyUnicode_DATA(str);
15199 pindex = 0;
15200 signchar = '\0';
15201 if (arg->sign) {
15202 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15203 if (ch == '-' || ch == '+') {
15204 signchar = ch;
15205 len--;
15206 pindex++;
15207 }
15208 else if (arg->flags & F_SIGN)
15209 signchar = '+';
15210 else if (arg->flags & F_BLANK)
15211 signchar = ' ';
15212 else
15213 arg->sign = 0;
15214 }
15215 if (arg->width < len)
15216 arg->width = len;
15217
15218 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015219 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015220 if (!(arg->flags & F_LJUST)) {
15221 if (arg->sign) {
15222 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015223 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015224 }
15225 else {
15226 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015227 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015228 }
15229 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015230 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15231 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015232 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015233 }
15234
Victor Stinnera47082312012-10-04 02:19:54 +020015235 buflen = arg->width;
15236 if (arg->sign && len == arg->width)
15237 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015238 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015239 return -1;
15240
15241 /* Write the sign if needed */
15242 if (arg->sign) {
15243 if (fill != ' ') {
15244 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15245 writer->pos += 1;
15246 }
15247 if (arg->width > len)
15248 arg->width--;
15249 }
15250
15251 /* Write the numeric prefix for "x", "X" and "o" formats
15252 if the alternate form is used.
15253 For example, write "0x" for the "%#x" format. */
15254 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15255 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15256 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15257 if (fill != ' ') {
15258 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15259 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15260 writer->pos += 2;
15261 pindex += 2;
15262 }
15263 arg->width -= 2;
15264 if (arg->width < 0)
15265 arg->width = 0;
15266 len -= 2;
15267 }
15268
15269 /* Pad left with the fill character if needed */
15270 if (arg->width > len && !(arg->flags & F_LJUST)) {
15271 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015272 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015273 writer->pos += sublen;
15274 arg->width = len;
15275 }
15276
15277 /* If padding with spaces: write sign if needed and/or numeric prefix if
15278 the alternate form is used */
15279 if (fill == ' ') {
15280 if (arg->sign) {
15281 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15282 writer->pos += 1;
15283 }
15284 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15285 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15286 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15287 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15288 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15289 writer->pos += 2;
15290 pindex += 2;
15291 }
15292 }
15293
15294 /* Write characters */
15295 if (len) {
15296 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15297 str, pindex, len);
15298 writer->pos += len;
15299 }
15300
15301 /* Pad right with the fill character if needed */
15302 if (arg->width > len) {
15303 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015304 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015305 writer->pos += sublen;
15306 }
15307 return 0;
15308}
15309
15310/* Helper of PyUnicode_Format(): format one arg.
15311 Return 0 on success, raise an exception and return -1 on error. */
15312static int
15313unicode_format_arg(struct unicode_formatter_t *ctx)
15314{
15315 struct unicode_format_arg_t arg;
15316 PyObject *str;
15317 int ret;
15318
Victor Stinner8dbd4212012-12-04 09:30:24 +010015319 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015320 if (arg.ch == '%') {
15321 ctx->fmtpos++;
15322 ctx->fmtcnt--;
15323 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15324 return -1;
15325 return 0;
15326 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015327 arg.flags = 0;
15328 arg.width = -1;
15329 arg.prec = -1;
15330 arg.sign = 0;
15331 str = NULL;
15332
Victor Stinnera47082312012-10-04 02:19:54 +020015333 ret = unicode_format_arg_parse(ctx, &arg);
15334 if (ret == -1)
15335 return -1;
15336
15337 ret = unicode_format_arg_format(ctx, &arg, &str);
15338 if (ret == -1)
15339 return -1;
15340
15341 if (ret != 1) {
15342 ret = unicode_format_arg_output(ctx, &arg, str);
15343 Py_DECREF(str);
15344 if (ret == -1)
15345 return -1;
15346 }
15347
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015348 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015349 PyErr_SetString(PyExc_TypeError,
15350 "not all arguments converted during string formatting");
15351 return -1;
15352 }
15353 return 0;
15354}
15355
Alexander Belopolsky40018472011-02-26 01:02:56 +000015356PyObject *
15357PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015358{
Victor Stinnera47082312012-10-04 02:19:54 +020015359 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015360
Guido van Rossumd57fd912000-03-10 22:53:23 +000015361 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015362 PyErr_BadInternalCall();
15363 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015364 }
Victor Stinnera47082312012-10-04 02:19:54 +020015365
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015366 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015367 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015368
15369 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015370 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15371 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15372 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15373 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015374
Victor Stinner8f674cc2013-04-17 23:02:17 +020015375 _PyUnicodeWriter_Init(&ctx.writer);
15376 ctx.writer.min_length = ctx.fmtcnt + 100;
15377 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015378
Guido van Rossumd57fd912000-03-10 22:53:23 +000015379 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015380 ctx.arglen = PyTuple_Size(args);
15381 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015382 }
15383 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015384 ctx.arglen = -1;
15385 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015386 }
Victor Stinnera47082312012-10-04 02:19:54 +020015387 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015388 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015389 ctx.dict = args;
15390 else
15391 ctx.dict = NULL;
15392 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015393
Victor Stinnera47082312012-10-04 02:19:54 +020015394 while (--ctx.fmtcnt >= 0) {
15395 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015396 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015397
15398 nonfmtpos = ctx.fmtpos++;
15399 while (ctx.fmtcnt >= 0 &&
15400 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15401 ctx.fmtpos++;
15402 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015403 }
Victor Stinnera47082312012-10-04 02:19:54 +020015404 if (ctx.fmtcnt < 0) {
15405 ctx.fmtpos--;
15406 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015407 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015408
Victor Stinnercfc4c132013-04-03 01:48:39 +020015409 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15410 nonfmtpos, ctx.fmtpos) < 0)
15411 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015412 }
15413 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015414 ctx.fmtpos++;
15415 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015416 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015417 }
15418 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015419
Victor Stinnera47082312012-10-04 02:19:54 +020015420 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015421 PyErr_SetString(PyExc_TypeError,
15422 "not all arguments converted during string formatting");
15423 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015424 }
15425
Victor Stinnera47082312012-10-04 02:19:54 +020015426 if (ctx.args_owned) {
15427 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015428 }
Victor Stinnera47082312012-10-04 02:19:54 +020015429 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015430
Benjamin Peterson29060642009-01-31 22:14:21 +000015431 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015432 _PyUnicodeWriter_Dealloc(&ctx.writer);
15433 if (ctx.args_owned) {
15434 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015435 }
15436 return NULL;
15437}
15438
Jeremy Hylton938ace62002-07-17 16:30:39 +000015439static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015440unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15441
Tim Peters6d6c1a32001-08-02 04:15:00 +000015442static PyObject *
15443unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15444{
Benjamin Peterson29060642009-01-31 22:14:21 +000015445 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015446 static char *kwlist[] = {"object", "encoding", "errors", 0};
15447 char *encoding = NULL;
15448 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015449
Benjamin Peterson14339b62009-01-31 16:36:08 +000015450 if (type != &PyUnicode_Type)
15451 return unicode_subtype_new(type, args, kwds);
15452 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015453 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015454 return NULL;
15455 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015456 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015457 if (encoding == NULL && errors == NULL)
15458 return PyObject_Str(x);
15459 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015460 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015461}
15462
Guido van Rossume023fe02001-08-30 03:12:59 +000015463static PyObject *
15464unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15465{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015466 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015467 Py_ssize_t length, char_size;
15468 int share_wstr, share_utf8;
15469 unsigned int kind;
15470 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015471
Benjamin Peterson14339b62009-01-31 16:36:08 +000015472 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015473
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015474 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015475 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015476 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015477 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015478 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015479 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015480 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015481 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015482
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015483 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015484 if (self == NULL) {
15485 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015486 return NULL;
15487 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015488 kind = PyUnicode_KIND(unicode);
15489 length = PyUnicode_GET_LENGTH(unicode);
15490
15491 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015492#ifdef Py_DEBUG
15493 _PyUnicode_HASH(self) = -1;
15494#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015495 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015496#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015497 _PyUnicode_STATE(self).interned = 0;
15498 _PyUnicode_STATE(self).kind = kind;
15499 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015500 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015501 _PyUnicode_STATE(self).ready = 1;
15502 _PyUnicode_WSTR(self) = NULL;
15503 _PyUnicode_UTF8_LENGTH(self) = 0;
15504 _PyUnicode_UTF8(self) = NULL;
15505 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015506 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015507
15508 share_utf8 = 0;
15509 share_wstr = 0;
15510 if (kind == PyUnicode_1BYTE_KIND) {
15511 char_size = 1;
15512 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15513 share_utf8 = 1;
15514 }
15515 else if (kind == PyUnicode_2BYTE_KIND) {
15516 char_size = 2;
15517 if (sizeof(wchar_t) == 2)
15518 share_wstr = 1;
15519 }
15520 else {
15521 assert(kind == PyUnicode_4BYTE_KIND);
15522 char_size = 4;
15523 if (sizeof(wchar_t) == 4)
15524 share_wstr = 1;
15525 }
15526
15527 /* Ensure we won't overflow the length. */
15528 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15529 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015530 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015531 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015532 data = PyObject_MALLOC((length + 1) * char_size);
15533 if (data == NULL) {
15534 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015535 goto onError;
15536 }
15537
Victor Stinnerc3c74152011-10-02 20:39:55 +020015538 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015539 if (share_utf8) {
15540 _PyUnicode_UTF8_LENGTH(self) = length;
15541 _PyUnicode_UTF8(self) = data;
15542 }
15543 if (share_wstr) {
15544 _PyUnicode_WSTR_LENGTH(self) = length;
15545 _PyUnicode_WSTR(self) = (wchar_t *)data;
15546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015547
Christian Heimesf051e432016-09-13 20:22:02 +020015548 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015549 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015550 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015551#ifdef Py_DEBUG
15552 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15553#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015554 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015555 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015556
15557onError:
15558 Py_DECREF(unicode);
15559 Py_DECREF(self);
15560 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015561}
15562
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015574
/* Forward declaration: unicode_iter fills the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.  Slots set to 0 are left unset in this table;
   the type can be subclassed (Py_TPFLAGS_BASETYPE) and its base is
   PyBaseObject_Type. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_vectorcall_offset */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_as_async */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15620
15621/* Initialize the Unicode implementation */
15622
Victor Stinner331a6a52019-05-27 16:39:22 +020015623PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015624_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015625{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015626 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015627 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015628 0x000A, /* LINE FEED */
15629 0x000D, /* CARRIAGE RETURN */
15630 0x001C, /* FILE SEPARATOR */
15631 0x001D, /* GROUP SEPARATOR */
15632 0x001E, /* RECORD SEPARATOR */
15633 0x0085, /* NEXT LINE */
15634 0x2028, /* LINE SEPARATOR */
15635 0x2029, /* PARAGRAPH SEPARATOR */
15636 };
15637
Victor Stinner91698d82020-06-25 14:07:40 +020015638 struct _Py_unicode_state *state = &tstate->interp->unicode;
15639 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015640 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015641 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015642
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015643 if (_Py_IsMainInterpreter(tstate)) {
15644 /* initialize the linebreak bloom filter */
15645 bloom_linebreak = make_bloom_mask(
15646 PyUnicode_2BYTE_KIND, linebreak,
15647 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015648
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015649 if (PyType_Ready(&PyUnicode_Type) < 0) {
15650 return _PyStatus_ERR("Can't initialize unicode type");
15651 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015652
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015653 if (PyType_Ready(&EncodingMapType) < 0) {
15654 return _PyStatus_ERR("Can't initialize encoding map type");
15655 }
15656 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15657 return _PyStatus_ERR("Can't initialize field name iterator type");
15658 }
15659 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15660 return _PyStatus_ERR("Can't initialize formatter iter type");
15661 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015662 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015663 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015664}
15665
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015666
Walter Dörwald16807132007-05-25 13:52:07 +000015667void
15668PyUnicode_InternInPlace(PyObject **p)
15669{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015670 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015671#ifdef Py_DEBUG
15672 assert(s != NULL);
15673 assert(_PyUnicode_CHECK(s));
15674#else
Victor Stinner607b1022020-05-05 18:50:30 +020015675 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015676 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015677 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015678#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015679
Benjamin Peterson14339b62009-01-31 16:36:08 +000015680 /* If it's a subclass, we don't really know what putting
15681 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015682 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015683 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015684 }
15685
15686 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015687 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015688 }
15689
15690#ifdef INTERNED_STRINGS
Victor Stinner666ecfb2020-07-02 01:19:57 +020015691 if (PyUnicode_READY(s) == -1) {
15692 PyErr_Clear();
15693 return;
15694 }
15695
Benjamin Peterson14339b62009-01-31 16:36:08 +000015696 if (interned == NULL) {
15697 interned = PyDict_New();
15698 if (interned == NULL) {
15699 PyErr_Clear(); /* Don't leave an exception */
15700 return;
15701 }
15702 }
Victor Stinner607b1022020-05-05 18:50:30 +020015703
15704 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015705 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015706 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015707 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015708
Berker Peksagced8d4c2016-07-25 04:40:39 +030015709 if (t == NULL) {
15710 PyErr_Clear();
15711 return;
15712 }
Victor Stinner607b1022020-05-05 18:50:30 +020015713
Berker Peksagced8d4c2016-07-25 04:40:39 +030015714 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015715 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015716 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015717 return;
15718 }
Victor Stinner607b1022020-05-05 18:50:30 +020015719
Victor Stinner3549ca32020-07-03 16:59:12 +020015720 /* The two references in interned dict (key and value) are not counted by
15721 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15722 this. */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015723 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015724 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015725#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015726}
15727
15728void
15729PyUnicode_InternImmortal(PyObject **p)
15730{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015731 PyUnicode_InternInPlace(p);
15732 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015733 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015734 Py_INCREF(*p);
15735 }
Walter Dörwald16807132007-05-25 13:52:07 +000015736}
15737
15738PyObject *
15739PyUnicode_InternFromString(const char *cp)
15740{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015741 PyObject *s = PyUnicode_FromString(cp);
15742 if (s == NULL)
15743 return NULL;
15744 PyUnicode_InternInPlace(&s);
15745 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015746}
15747
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015748
Victor Stinner666ecfb2020-07-02 01:19:57 +020015749void
15750_PyUnicode_ClearInterned(PyThreadState *tstate)
Walter Dörwald16807132007-05-25 13:52:07 +000015751{
Victor Stinner666ecfb2020-07-02 01:19:57 +020015752 if (!_Py_IsMainInterpreter(tstate)) {
15753 // interned dict is shared by all interpreters
Benjamin Peterson14339b62009-01-31 16:36:08 +000015754 return;
15755 }
Walter Dörwald16807132007-05-25 13:52:07 +000015756
Victor Stinner666ecfb2020-07-02 01:19:57 +020015757 if (interned == NULL) {
15758 return;
15759 }
15760 assert(PyDict_CheckExact(interned));
15761
15762 PyObject *keys = PyDict_Keys(interned);
15763 if (keys == NULL) {
15764 PyErr_Clear();
15765 return;
15766 }
15767 assert(PyList_CheckExact(keys));
15768
15769 /* Interned unicode strings are not forcibly deallocated; rather, we give
15770 them their stolen references back, and then clear and DECREF the
15771 interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015772
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015773 Py_ssize_t n = PyList_GET_SIZE(keys);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015774#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015775 fprintf(stderr, "releasing %zd interned strings\n", n);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015776
15777 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015778#endif
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015779 for (Py_ssize_t i = 0; i < n; i++) {
15780 PyObject *s = PyList_GET_ITEM(keys, i);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015781 assert(PyUnicode_IS_READY(s));
15782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015783 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015784 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015785 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015786#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015787 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015788#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015789 break;
15790 case SSTATE_INTERNED_MORTAL:
Victor Stinner3549ca32020-07-03 16:59:12 +020015791 // Restore the two references (key and value) ignored
15792 // by PyUnicode_InternInPlace().
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015793 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015794#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015795 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015796#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015797 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015798 case SSTATE_NOT_INTERNED:
15799 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015800 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015801 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015802 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015803 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015804 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015805#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015806 fprintf(stderr,
15807 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15808 mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015809#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015810 Py_DECREF(keys);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015811
Benjamin Peterson14339b62009-01-31 16:36:08 +000015812 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015813 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015814}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015815
15816
15817/********************* Unicode Iterator **************************/
15818
15819typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015820 PyObject_HEAD
15821 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015822 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015823} unicodeiterobject;
15824
15825static void
15826unicodeiter_dealloc(unicodeiterobject *it)
15827{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015828 _PyObject_GC_UNTRACK(it);
15829 Py_XDECREF(it->it_seq);
15830 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015831}
15832
15833static int
15834unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15835{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015836 Py_VISIT(it->it_seq);
15837 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015838}
15839
15840static PyObject *
15841unicodeiter_next(unicodeiterobject *it)
15842{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015843 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015844
Benjamin Peterson14339b62009-01-31 16:36:08 +000015845 assert(it != NULL);
15846 seq = it->it_seq;
15847 if (seq == NULL)
15848 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015849 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015851 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15852 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015853 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015854 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15855 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015856 if (item != NULL)
15857 ++it->it_index;
15858 return item;
15859 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015860
Benjamin Peterson14339b62009-01-31 16:36:08 +000015861 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015862 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015863 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015864}
15865
15866static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015867unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015868{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015869 Py_ssize_t len = 0;
15870 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015871 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015872 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015873}
15874
15875PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15876
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015877static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015878unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015879{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015880 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015881 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015882 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015883 it->it_seq, it->it_index);
15884 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015885 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015886 if (u == NULL)
15887 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015888 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015889 }
15890}
15891
15892PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15893
15894static PyObject *
15895unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15896{
15897 Py_ssize_t index = PyLong_AsSsize_t(state);
15898 if (index == -1 && PyErr_Occurred())
15899 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015900 if (it->it_seq != NULL) {
15901 if (index < 0)
15902 index = 0;
15903 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15904 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15905 it->it_index = index;
15906 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015907 Py_RETURN_NONE;
15908}
15909
15910PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15911
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015912static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015913 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015914 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015915 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15916 reduce_doc},
15917 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15918 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015919 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015920};
15921
15922PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015923 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15924 "str_iterator", /* tp_name */
15925 sizeof(unicodeiterobject), /* tp_basicsize */
15926 0, /* tp_itemsize */
15927 /* methods */
15928 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015929 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015930 0, /* tp_getattr */
15931 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015932 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015933 0, /* tp_repr */
15934 0, /* tp_as_number */
15935 0, /* tp_as_sequence */
15936 0, /* tp_as_mapping */
15937 0, /* tp_hash */
15938 0, /* tp_call */
15939 0, /* tp_str */
15940 PyObject_GenericGetAttr, /* tp_getattro */
15941 0, /* tp_setattro */
15942 0, /* tp_as_buffer */
15943 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15944 0, /* tp_doc */
15945 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15946 0, /* tp_clear */
15947 0, /* tp_richcompare */
15948 0, /* tp_weaklistoffset */
15949 PyObject_SelfIter, /* tp_iter */
15950 (iternextfunc)unicodeiter_next, /* tp_iternext */
15951 unicodeiter_methods, /* tp_methods */
15952 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015953};
15954
15955static PyObject *
15956unicode_iter(PyObject *seq)
15957{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015958 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015959
Benjamin Peterson14339b62009-01-31 16:36:08 +000015960 if (!PyUnicode_Check(seq)) {
15961 PyErr_BadInternalCall();
15962 return NULL;
15963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015964 if (PyUnicode_READY(seq) == -1)
15965 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015966 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15967 if (it == NULL)
15968 return NULL;
15969 it->it_index = 0;
15970 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015971 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015972 _PyObject_GC_TRACK(it);
15973 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015974}
15975
Victor Stinner709d23d2019-05-02 14:56:30 -040015976static int
15977encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015978{
Victor Stinner709d23d2019-05-02 14:56:30 -040015979 int res;
15980 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15981 if (res == -2) {
15982 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15983 return -1;
15984 }
15985 if (res < 0) {
15986 PyErr_NoMemory();
15987 return -1;
15988 }
15989 return 0;
15990}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015991
Victor Stinner709d23d2019-05-02 14:56:30 -040015992
15993static int
15994config_get_codec_name(wchar_t **config_encoding)
15995{
15996 char *encoding;
15997 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15998 return -1;
15999 }
16000
16001 PyObject *name_obj = NULL;
16002 PyObject *codec = _PyCodec_Lookup(encoding);
16003 PyMem_RawFree(encoding);
16004
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016005 if (!codec)
16006 goto error;
16007
16008 name_obj = PyObject_GetAttrString(codec, "name");
16009 Py_CLEAR(codec);
16010 if (!name_obj) {
16011 goto error;
16012 }
16013
Victor Stinner709d23d2019-05-02 14:56:30 -040016014 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16015 Py_DECREF(name_obj);
16016 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016017 goto error;
16018 }
16019
Victor Stinner709d23d2019-05-02 14:56:30 -040016020 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16021 if (raw_wname == NULL) {
16022 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016023 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016024 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016025 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016026
16027 PyMem_RawFree(*config_encoding);
16028 *config_encoding = raw_wname;
16029
16030 PyMem_Free(wname);
16031 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016032
16033error:
16034 Py_XDECREF(codec);
16035 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016036 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016037}
16038
16039
Victor Stinner331a6a52019-05-27 16:39:22 +020016040static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016041init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016042{
Victor Stinner709d23d2019-05-02 14:56:30 -040016043 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016044 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016045 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016046 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016047 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016048 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016049 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016050}
16051
16052
Victor Stinner709d23d2019-05-02 14:56:30 -040016053static int
16054init_fs_codec(PyInterpreterState *interp)
16055{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016056 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016057
16058 _Py_error_handler error_handler;
16059 error_handler = get_error_handler_wide(config->filesystem_errors);
16060 if (error_handler == _Py_ERROR_UNKNOWN) {
16061 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16062 return -1;
16063 }
16064
16065 char *encoding, *errors;
16066 if (encode_wstr_utf8(config->filesystem_encoding,
16067 &encoding,
16068 "filesystem_encoding") < 0) {
16069 return -1;
16070 }
16071
16072 if (encode_wstr_utf8(config->filesystem_errors,
16073 &errors,
16074 "filesystem_errors") < 0) {
16075 PyMem_RawFree(encoding);
16076 return -1;
16077 }
16078
Victor Stinner3d17c042020-05-14 01:48:38 +020016079 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16080 PyMem_RawFree(fs_codec->encoding);
16081 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016082 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016083 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16084 PyMem_RawFree(fs_codec->errors);
16085 fs_codec->errors = errors;
16086 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016087
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016088#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016089 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016090#endif
16091
Victor Stinner709d23d2019-05-02 14:56:30 -040016092 /* At this point, PyUnicode_EncodeFSDefault() and
16093 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16094 the C implementation of the filesystem encoding. */
16095
16096 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16097 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016098 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16099 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016100 PyErr_NoMemory();
16101 return -1;
16102 }
16103 return 0;
16104}
16105
16106
Victor Stinner331a6a52019-05-27 16:39:22 +020016107static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016108init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016109{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016110 PyInterpreterState *interp = tstate->interp;
16111
Victor Stinner709d23d2019-05-02 14:56:30 -040016112 /* Update the filesystem encoding to the normalized Python codec name.
16113 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16114 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016115 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016116 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016117 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016118 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016119 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016120 }
16121
Victor Stinner709d23d2019-05-02 14:56:30 -040016122 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016123 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016124 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016125 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016126}
16127
16128
Victor Stinner331a6a52019-05-27 16:39:22 +020016129PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016130_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016131{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016132 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016133 if (_PyStatus_EXCEPTION(status)) {
16134 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016135 }
16136
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016137 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016138}
16139
16140
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016141static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016142_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016143{
Victor Stinner3d17c042020-05-14 01:48:38 +020016144 PyMem_RawFree(fs_codec->encoding);
16145 fs_codec->encoding = NULL;
16146 fs_codec->utf8 = 0;
16147 PyMem_RawFree(fs_codec->errors);
16148 fs_codec->errors = NULL;
16149 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016150}
16151
16152
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to mbcs/replace and
   reload the filesystem codec.  Returns 0 on success, -1 with an exception
   set on failure. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        /* Either copy may have succeeded; releasing NULL is harmless. */
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    /* Both copies exist: swap them into the config. */
    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16178
16179
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016180void
Victor Stinner3d483342019-11-22 12:27:50 +010016181_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016182{
Victor Stinner666ecfb2020-07-02 01:19:57 +020016183 // _PyUnicode_ClearInterned() must be called before
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016184
Victor Stinner666ecfb2020-07-02 01:19:57 +020016185 struct _Py_unicode_state *state = &tstate->interp->unicode;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016186
Victor Stinner91698d82020-06-25 14:07:40 +020016187 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016188
Victor Stinner2f9ada92020-06-24 02:22:21 +020016189 for (Py_ssize_t i = 0; i < 256; i++) {
16190 Py_CLEAR(state->latin1[i]);
16191 }
16192
Victor Stinner666ecfb2020-07-02 01:19:57 +020016193 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016194 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016195 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016196
Victor Stinner3d17c042020-05-14 01:48:38 +020016197 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016198}
16199
16200
Georg Brandl66c221e2010-10-14 07:04:07 +000016201/* A _string module, to export formatter_parser and formatter_field_name_split
16202 to the string.Formatter class implemented in Python. */
16203
16204static PyMethodDef _string_methods[] = {
16205 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16206 METH_O, PyDoc_STR("split the argument as a field name")},
16207 {"formatter_parser", (PyCFunction) formatter_parser,
16208 METH_O, PyDoc_STR("parse the argument as a format string")},
16209 {NULL, NULL}
16210};
16211
16212static struct PyModuleDef _string_module = {
16213 PyModuleDef_HEAD_INIT,
16214 "_string",
16215 PyDoc_STR("string helper module"),
16216 0,
16217 _string_methods,
16218 NULL,
16219 NULL,
16220 NULL,
16221 NULL
16222};
16223
16224PyMODINIT_FUNC
16225PyInit__string(void)
16226{
16227 return PyModule_Create(&_string_module);
16228}
16229
16230
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016231#ifdef __cplusplus
16232}
16233#endif