blob: ba48d35aa40b190677e1e18309912845be8b0d1c [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner47e1afd2020-10-26 16:43:47 +010043#include "pycore_abstract.h" // _PyIndex_Check()
44#include "pycore_bytes_methods.h" // _Py_bytes_lower()
45#include "pycore_initconfig.h" // _PyStatus_OK()
46#include "pycore_interp.h" // PyInterpreterState.fs_codec
47#include "pycore_object.h" // _PyObject_GC_TRACK()
48#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
49#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
50#include "pycore_pystate.h" // _PyInterpreterState_GET()
51#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
52#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000054#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000055#include <windows.h>
56#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000057
/* Uncomment to display statistics on interned strings at exit
   in _PyUnicode_ClearInterned(). */
/* #define INTERNED_STATS 1 */
61
62
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080081
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
/* Give the declarations that follow C linkage when this file is compiled
   as C++. */
#ifdef __cplusplus
extern "C" {
#endif

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff
97
/* Object check used by the accessor macros in this file: debug builds
   (Py_DEBUG) call _PyUnicode_CheckConsistency() without the content scan,
   other builds only call PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200103
/* Raw access to the utf8 pointer stored in a PyCompactUnicodeObject.
   No checks are performed. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 buffer of a ready string.  For a compact
   ASCII string this is the memory immediately following the PyASCIIObject
   header; otherwise it is the stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field.  No checks are performed. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string: the `length` field for a compact
   ASCII string, the stored utf8_length otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the wstr pointer of the PyASCIIObject header. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900122
/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* Length of the wstr representation: the `length` field for a compact
   ASCII string, the stored wstr_length otherwise.  The argument is
   parenthesized before casting so that expressions such as `p + n` are
   cast as a whole instead of having the cast bind to `p` only. */
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw field accessors.  Unless noted they perform no checks and expand to
   the named struct member, so they can be used as assignment targets. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Like the raw accessors above, but assert _PyUnicode_CHECK(op) first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200145
/* File-local replacement for PyUnicode_READY(): asserts _PyUnicode_CHECK(op),
   then evaluates to 0 if the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200152
/* True if the utf8 pointer is the same address as the string's character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer is the same address as the string's character
   data.  Uses the macro parameter `op` throughout; it does not depend on a
   variable named `unicode` being in scope at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
160
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): not compact ASCII, utf8 is non-NULL and
   utf8 differs from the character data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the string is
   not ready or wstr differs from the character data pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
174
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is converted with a plain cast to to_type.  The main
   loop handles four elements per iteration (the element count rounded down
   to a multiple of 4); the trailing loop copies the remaining 0-3. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin);\
        const from_type *_end = (const from_type *)(end);\
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200198
/* Platform-specific overallocation factor.
   NOTE(review): the percentages below match the factor being used as a
   divisor of the requested size (size / factor extra) -- confirm at the
   use sites, which are outside this section. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
206
Victor Stinner607b1022020-05-05 18:50:30 +0200207/* bpo-40521: Interned strings are shared by all interpreters. */
208#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
209# define INTERNED_STRINGS
210#endif
211
Walter Dörwald16807132007-05-25 13:52:07 +0000212/* This dictionary holds all interned unicode strings. Note that references
213 to strings in this dictionary are *not* counted in the string's ob_refcnt.
214 When the interned string reaches a refcnt of 0 the string deallocation
215 function will delete the reference from this dictionary.
216
217 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000218 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000219*/
Victor Stinner607b1022020-05-05 18:50:30 +0200220#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200221static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200222#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000223
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200224static struct _Py_unicode_state*
225get_unicode_state(void)
226{
227 PyInterpreterState *interp = _PyInterpreterState_GET();
228 return &interp->unicode;
229}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200230
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200232// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200233static inline PyObject* unicode_get_empty(void)
234{
235 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200236 // unicode_get_empty() must not be called before _PyUnicode_Init()
237 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200238 assert(state->empty_string != NULL);
239 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200240}
241
Victor Stinner91698d82020-06-25 14:07:40 +0200242
243// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200244static inline PyObject* unicode_new_empty(void)
245{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200246 PyObject *empty = unicode_get_empty();
247 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200248 return empty;
249}
250
/* Return from the enclosing function with a strong reference to the empty
   string singleton.  Wrapped in do/while(0) so it behaves as one statement. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        return unicode_new_empty();                     \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000255
Victor Stinner59423e32018-11-26 13:40:01 +0100256static inline void
257unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
258 Py_ssize_t start, Py_ssize_t length)
259{
260 assert(0 <= start);
261 assert(kind != PyUnicode_WCHAR_KIND);
262 switch (kind) {
263 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100264 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100265 Py_UCS1 ch = (unsigned char)value;
266 Py_UCS1 *to = (Py_UCS1 *)data + start;
267 memset(to, ch, length);
268 break;
269 }
270 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100271 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100272 Py_UCS2 ch = (Py_UCS2)value;
273 Py_UCS2 *to = (Py_UCS2 *)data + start;
274 const Py_UCS2 *end = to + length;
275 for (; to < end; ++to) *to = ch;
276 break;
277 }
278 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100279 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100280 Py_UCS4 ch = value;
281 Py_UCS4 * to = (Py_UCS4 *)data + start;
282 const Py_UCS4 *end = to + length;
283 for (; to < end; ++to) *to = ch;
284 break;
285 }
286 default: Py_UNREACHABLE();
287 }
288}
289
290
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200291/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700292static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200293_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900294static inline void
295_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400296static PyObject *
297unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
298 const char *errors);
299static PyObject *
300unicode_decode_utf8(const char *s, Py_ssize_t size,
301 _Py_error_handler error_handler, const char *errors,
302 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200303
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200304/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200305static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200306
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code: 1 if the character is flagged as
   whitespace, 0 otherwise.  Each row covers eight consecutive codes. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0009 CHARACTER TABULATION
       0x000A LINE FEED
       0x000B LINE TABULATION
       0x000C FORM FEED
       0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR
       0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
       0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
337
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200338/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200339static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200340static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100341static int unicode_modifiable(PyObject *unicode);
342
Victor Stinnerfe226c02011-10-03 03:52:20 +0200343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100345_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200346static PyObject *
347_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
348static PyObject *
349_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
350
351static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000353 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100354 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000355 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
356
Alexander Belopolsky40018472011-02-26 01:02:56 +0000357static void
358raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300359 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100360 PyObject *unicode,
361 Py_ssize_t startpos, Py_ssize_t endpos,
362 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000363
/* Same for linebreaks: 128-entry table indexed by ASCII code, 1 if the
   character is flagged as a line break, 0 otherwise.  Each row covers
   eight consecutive codes. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x000A LINE FEED
       0x000B LINE TABULATION
       0x000C FORM FEED
       0x000D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
391
INADA Naoki3ae20562017-01-16 20:41:20 +0900392static int convert_uc(PyObject *obj, void *addr);
393
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300394#include "clinic/unicodeobject.c.h"
395
Victor Stinner3d4226a2018-08-29 22:21:32 +0200396_Py_error_handler
397_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200398{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200400 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200401 }
402 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200403 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200404 }
405 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200406 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200407 }
408 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200409 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200410 }
411 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200412 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200413 }
414 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200415 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200416 }
417 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200418 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200419 }
Victor Stinner50149202015-09-22 00:26:54 +0200420 return _Py_ERROR_OTHER;
421}
422
Victor Stinner709d23d2019-05-02 14:56:30 -0400423
424static _Py_error_handler
425get_error_handler_wide(const wchar_t *errors)
426{
427 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428 return _Py_ERROR_STRICT;
429 }
430 if (wcscmp(errors, L"surrogateescape") == 0) {
431 return _Py_ERROR_SURROGATEESCAPE;
432 }
433 if (wcscmp(errors, L"replace") == 0) {
434 return _Py_ERROR_REPLACE;
435 }
436 if (wcscmp(errors, L"ignore") == 0) {
437 return _Py_ERROR_IGNORE;
438 }
439 if (wcscmp(errors, L"backslashreplace") == 0) {
440 return _Py_ERROR_BACKSLASHREPLACE;
441 }
442 if (wcscmp(errors, L"surrogatepass") == 0) {
443 return _Py_ERROR_SURROGATEPASS;
444 }
445 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446 return _Py_ERROR_XMLCHARREFREPLACE;
447 }
448 return _Py_ERROR_OTHER;
449}
450
451
Victor Stinner22eb6892019-06-26 00:51:05 +0200452static inline int
453unicode_check_encoding_errors(const char *encoding, const char *errors)
454{
455 if (encoding == NULL && errors == NULL) {
456 return 0;
457 }
458
Victor Stinner81a7be32020-04-14 15:14:01 +0200459 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200460#ifndef Py_DEBUG
461 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200462 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200463 return 0;
464 }
465#else
466 /* Always check in debug mode */
467#endif
468
469 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200471 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200472 return 0;
473 }
474
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200475 /* Disable checks during Python finalization. For example, it allows to
476 call _PyObject_Dump() during finalization for debugging purpose. */
477 if (interp->finalizing) {
478 return 0;
479 }
480
Victor Stinner22eb6892019-06-26 00:51:05 +0200481 if (encoding != NULL) {
482 PyObject *handler = _PyCodec_Lookup(encoding);
483 if (handler == NULL) {
484 return -1;
485 }
486 Py_DECREF(handler);
487 }
488
489 if (errors != NULL) {
490 PyObject *handler = PyCodec_LookupError(errors);
491 if (handler == NULL) {
492 return -1;
493 }
494 Py_DECREF(handler);
495 }
496 return 0;
497}
498
499
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200500int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100501_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200502{
Victor Stinner68762572019-10-07 18:42:01 +0200503#define CHECK(expr) \
504 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
505
Victor Stinner910337b2011-10-03 03:20:16 +0200506 PyASCIIObject *ascii;
507 unsigned int kind;
508
Victor Stinner68762572019-10-07 18:42:01 +0200509 assert(op != NULL);
510 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200511
512 ascii = (PyASCIIObject *)op;
513 kind = ascii->state.kind;
514
Victor Stinnera3b334d2011-10-03 13:53:37 +0200515 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200516 CHECK(kind == PyUnicode_1BYTE_KIND);
517 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200518 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200519 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200520 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200521 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200522
Victor Stinnera41463c2011-10-04 01:05:08 +0200523 if (ascii->state.compact == 1) {
524 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200526 || kind == PyUnicode_2BYTE_KIND
527 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200528 CHECK(ascii->state.ascii == 0);
529 CHECK(ascii->state.ready == 1);
530 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100531 }
532 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
534
535 data = unicode->data.any;
536 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200537 CHECK(ascii->length == 0);
538 CHECK(ascii->hash == -1);
539 CHECK(ascii->state.compact == 0);
540 CHECK(ascii->state.ascii == 0);
541 CHECK(ascii->state.ready == 0);
542 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
543 CHECK(ascii->wstr != NULL);
544 CHECK(data == NULL);
545 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200546 }
547 else {
Victor Stinner68762572019-10-07 18:42:01 +0200548 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200549 || kind == PyUnicode_2BYTE_KIND
550 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200551 CHECK(ascii->state.compact == 0);
552 CHECK(ascii->state.ready == 1);
553 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200554 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200555 CHECK(compact->utf8 == data);
556 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 }
558 else
Victor Stinner68762572019-10-07 18:42:01 +0200559 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 }
561 }
562 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200563 if (
564#if SIZEOF_WCHAR_T == 2
565 kind == PyUnicode_2BYTE_KIND
566#else
567 kind == PyUnicode_4BYTE_KIND
568#endif
569 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200570 {
Victor Stinner68762572019-10-07 18:42:01 +0200571 CHECK(ascii->wstr == data);
572 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200573 } else
Victor Stinner68762572019-10-07 18:42:01 +0200574 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200575 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200576
577 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200578 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200579 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200580 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200581 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200582
583 /* check that the best kind is used: O(n) operation */
584 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200585 Py_ssize_t i;
586 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300587 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200588 Py_UCS4 ch;
589
590 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 for (i=0; i < ascii->length; i++)
592 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200593 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200594 if (ch > maxchar)
595 maxchar = ch;
596 }
597 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100598 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200599 CHECK(maxchar >= 128);
600 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100601 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200602 else
Victor Stinner68762572019-10-07 18:42:01 +0200603 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200604 }
Victor Stinner77faf692011-11-20 18:56:05 +0100605 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200606 CHECK(maxchar >= 0x100);
607 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100608 }
609 else {
Victor Stinner68762572019-10-07 18:42:01 +0200610 CHECK(maxchar >= 0x10000);
611 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100612 }
Victor Stinner68762572019-10-07 18:42:01 +0200613 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200614 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400615 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200616
617#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400618}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200619
Victor Stinner910337b2011-10-03 03:20:16 +0200620
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100621static PyObject*
622unicode_result_wchar(PyObject *unicode)
623{
624#ifndef Py_DEBUG
625 Py_ssize_t len;
626
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100627 len = _PyUnicode_WSTR_LENGTH(unicode);
628 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100629 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200630 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 }
632
633 if (len == 1) {
634 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100635 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200637 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 }
639 }
640
641 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200642 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100643 return NULL;
644 }
645#else
Victor Stinneraa771272012-10-04 02:32:58 +0200646 assert(Py_REFCNT(unicode) == 1);
647
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100648 /* don't make the result ready in debug mode to ensure that the caller
649 makes the string ready before using it */
650 assert(_PyUnicode_CheckConsistency(unicode, 1));
651#endif
652 return unicode;
653}
654
655static PyObject*
656unicode_result_ready(PyObject *unicode)
657{
658 Py_ssize_t length;
659
660 length = PyUnicode_GET_LENGTH(unicode);
661 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200662 PyObject *empty = unicode_get_empty();
663 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100664 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200665 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100666 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200667 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100668 }
669
670 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200671 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200672 if (kind == PyUnicode_1BYTE_KIND) {
673 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
674 Py_UCS1 ch = data[0];
675 struct _Py_unicode_state *state = get_unicode_state();
676 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100677 if (latin1_char != NULL) {
678 if (unicode != latin1_char) {
679 Py_INCREF(latin1_char);
680 Py_DECREF(unicode);
681 }
682 return latin1_char;
683 }
684 else {
685 assert(_PyUnicode_CheckConsistency(unicode, 1));
686 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200687 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100688 return unicode;
689 }
690 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200691 else {
692 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
693 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100694 }
695
696 assert(_PyUnicode_CheckConsistency(unicode, 1));
697 return unicode;
698}
699
700static PyObject*
701unicode_result(PyObject *unicode)
702{
703 assert(_PyUnicode_CHECK(unicode));
704 if (PyUnicode_IS_READY(unicode))
705 return unicode_result_ready(unicode);
706 else
707 return unicode_result_wchar(unicode);
708}
709
Victor Stinnerc4b49542011-12-11 22:44:26 +0100710static PyObject*
711unicode_result_unchanged(PyObject *unicode)
712{
713 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500714 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100715 return NULL;
716 Py_INCREF(unicode);
717 return unicode;
718 }
719 else
720 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100721 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100722}
723
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200724/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
725 ASCII, Latin1, UTF-8, etc. */
726static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200727backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200728 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
729{
Victor Stinnerad771582015-10-09 12:38:53 +0200730 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200731 Py_UCS4 ch;
732 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300733 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200734
735 assert(PyUnicode_IS_READY(unicode));
736 kind = PyUnicode_KIND(unicode);
737 data = PyUnicode_DATA(unicode);
738
739 size = 0;
740 /* determine replacement size */
741 for (i = collstart; i < collend; ++i) {
742 Py_ssize_t incr;
743
744 ch = PyUnicode_READ(kind, data, i);
745 if (ch < 0x100)
746 incr = 2+2;
747 else if (ch < 0x10000)
748 incr = 2+4;
749 else {
750 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200751 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200752 }
753 if (size > PY_SSIZE_T_MAX - incr) {
754 PyErr_SetString(PyExc_OverflowError,
755 "encoded result is too long for a Python string");
756 return NULL;
757 }
758 size += incr;
759 }
760
Victor Stinnerad771582015-10-09 12:38:53 +0200761 str = _PyBytesWriter_Prepare(writer, str, size);
762 if (str == NULL)
763 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200764
765 /* generate replacement */
766 for (i = collstart; i < collend; ++i) {
767 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200768 *str++ = '\\';
769 if (ch >= 0x00010000) {
770 *str++ = 'U';
771 *str++ = Py_hexdigits[(ch>>28)&0xf];
772 *str++ = Py_hexdigits[(ch>>24)&0xf];
773 *str++ = Py_hexdigits[(ch>>20)&0xf];
774 *str++ = Py_hexdigits[(ch>>16)&0xf];
775 *str++ = Py_hexdigits[(ch>>12)&0xf];
776 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777 }
Victor Stinner797485e2015-10-09 03:17:30 +0200778 else if (ch >= 0x100) {
779 *str++ = 'u';
780 *str++ = Py_hexdigits[(ch>>12)&0xf];
781 *str++ = Py_hexdigits[(ch>>8)&0xf];
782 }
783 else
784 *str++ = 'x';
785 *str++ = Py_hexdigits[(ch>>4)&0xf];
786 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200787 }
788 return str;
789}
790
791/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
792 ASCII, Latin1, UTF-8, etc. */
793static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200794xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200795 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
796{
Victor Stinnerad771582015-10-09 12:38:53 +0200797 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200798 Py_UCS4 ch;
799 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300800 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200801
802 assert(PyUnicode_IS_READY(unicode));
803 kind = PyUnicode_KIND(unicode);
804 data = PyUnicode_DATA(unicode);
805
806 size = 0;
807 /* determine replacement size */
808 for (i = collstart; i < collend; ++i) {
809 Py_ssize_t incr;
810
811 ch = PyUnicode_READ(kind, data, i);
812 if (ch < 10)
813 incr = 2+1+1;
814 else if (ch < 100)
815 incr = 2+2+1;
816 else if (ch < 1000)
817 incr = 2+3+1;
818 else if (ch < 10000)
819 incr = 2+4+1;
820 else if (ch < 100000)
821 incr = 2+5+1;
822 else if (ch < 1000000)
823 incr = 2+6+1;
824 else {
825 assert(ch <= MAX_UNICODE);
826 incr = 2+7+1;
827 }
828 if (size > PY_SSIZE_T_MAX - incr) {
829 PyErr_SetString(PyExc_OverflowError,
830 "encoded result is too long for a Python string");
831 return NULL;
832 }
833 size += incr;
834 }
835
Victor Stinnerad771582015-10-09 12:38:53 +0200836 str = _PyBytesWriter_Prepare(writer, str, size);
837 if (str == NULL)
838 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200839
840 /* generate replacement */
841 for (i = collstart; i < collend; ++i) {
842 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
843 }
844 return str;
845}
846
Thomas Wouters477c8d52006-05-27 19:21:47 +0000847/* --- Bloom Filters ----------------------------------------------------- */
848
849/* stuff to implement simple "bloom filters" for Unicode characters.
850 to keep things simple, we use a single bitmask, using the least 5
851 bits from each unicode characters as the bit index. */
852
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200853/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000854
Antoine Pitrouf068f942010-01-13 14:19:12 +0000855#if LONG_BIT >= 128
856#define BLOOM_WIDTH 128
857#elif LONG_BIT >= 64
858#define BLOOM_WIDTH 64
859#elif LONG_BIT >= 32
860#define BLOOM_WIDTH 32
861#else
862#error "LONG_BIT is smaller than 32"
863#endif
864
Thomas Wouters477c8d52006-05-27 19:21:47 +0000865#define BLOOM_MASK unsigned long
866
Serhiy Storchaka05997252013-01-26 12:14:02 +0200867static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000868
Antoine Pitrouf068f942010-01-13 14:19:12 +0000869#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000870
Benjamin Peterson29060642009-01-31 22:14:21 +0000871#define BLOOM_LINEBREAK(ch) \
872 ((ch) < 128U ? ascii_linebreak[(ch)] : \
873 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000874
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700875static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300876make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877{
Victor Stinnera85af502013-04-09 21:53:54 +0200878#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
879 do { \
880 TYPE *data = (TYPE *)PTR; \
881 TYPE *end = data + LEN; \
882 Py_UCS4 ch; \
883 for (; data != end; data++) { \
884 ch = *data; \
885 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
886 } \
887 break; \
888 } while (0)
889
Thomas Wouters477c8d52006-05-27 19:21:47 +0000890 /* calculate simple bloom-style bitmask for a given unicode string */
891
Antoine Pitrouf068f942010-01-13 14:19:12 +0000892 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000893
894 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200895 switch (kind) {
896 case PyUnicode_1BYTE_KIND:
897 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
898 break;
899 case PyUnicode_2BYTE_KIND:
900 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
901 break;
902 case PyUnicode_4BYTE_KIND:
903 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
904 break;
905 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700906 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200907 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000908 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200909
910#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000911}
912
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300913static int
914ensure_unicode(PyObject *obj)
915{
916 if (!PyUnicode_Check(obj)) {
917 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200918 "must be str, not %.100s",
919 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300920 return -1;
921 }
922 return PyUnicode_READY(obj);
923}
924
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200925/* Compilation of templated routines */
926
Victor Stinner90ed8a62020-06-24 00:34:07 +0200927#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200928
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200929#include "stringlib/asciilib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
935#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/undef.h"
937
938#include "stringlib/ucs1lib.h"
939#include "stringlib/fastsearch.h"
940#include "stringlib/partition.h"
941#include "stringlib/split.h"
942#include "stringlib/count.h"
943#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300944#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200945#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/undef.h"
947
948#include "stringlib/ucs2lib.h"
949#include "stringlib/fastsearch.h"
950#include "stringlib/partition.h"
951#include "stringlib/split.h"
952#include "stringlib/count.h"
953#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300954#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200955#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200956#include "stringlib/undef.h"
957
958#include "stringlib/ucs4lib.h"
959#include "stringlib/fastsearch.h"
960#include "stringlib/partition.h"
961#include "stringlib/split.h"
962#include "stringlib/count.h"
963#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300964#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200965#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200966#include "stringlib/undef.h"
967
Inada Naoki2c4928d2020-06-17 20:09:44 +0900968_Py_COMP_DIAG_PUSH
969_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970#include "stringlib/unicodedefs.h"
971#include "stringlib/fastsearch.h"
972#include "stringlib/count.h"
973#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100974#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900975_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200976
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200977#undef STRINGLIB_GET_EMPTY
978
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979/* --- Unicode Object ----------------------------------------------------- */
980
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700981static inline Py_ssize_t
982findchar(const void *s, int kind,
983 Py_ssize_t size, Py_UCS4 ch,
984 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 switch (kind) {
987 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200988 if ((Py_UCS1) ch != ch)
989 return -1;
990 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600991 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600993 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200994 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200995 if ((Py_UCS2) ch != ch)
996 return -1;
997 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600998 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001000 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001001 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001004 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001006 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001007 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009}
1010
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters added by a resize with 0xFF bytes
   so that use of uninitialized data is detected earlier.

   Only the range [old_length, current length) is filled; nothing is done
   when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);    /* bytes per character */
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030static PyObject*
1031resize_compact(PyObject *unicode, Py_ssize_t length)
1032{
1033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035 Py_ssize_t new_size;
1036 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001037 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1040#endif
1041
Victor Stinner79891572012-05-03 13:43:07 +02001042 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001044 assert(PyUnicode_IS_COMPACT(unicode));
1045
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001046 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001047 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001048 struct_size = sizeof(PyASCIIObject);
1049 else
1050 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001051 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1054 PyErr_NoMemory();
1055 return NULL;
1056 }
1057 new_size = (struct_size + (length + 1) * char_size);
1058
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001059 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1060 PyObject_DEL(_PyUnicode_UTF8(unicode));
1061 _PyUnicode_UTF8(unicode) = NULL;
1062 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1063 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001064#ifdef Py_REF_DEBUG
1065 _Py_RefTotal--;
1066#endif
1067#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001068 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001069#endif
Victor Stinner84def372011-12-11 20:04:56 +01001070
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001071 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001072 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001073 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 PyErr_NoMemory();
1075 return NULL;
1076 }
Victor Stinner84def372011-12-11 20:04:56 +01001077 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001079
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001081 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001083 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 _PyUnicode_WSTR_LENGTH(unicode) = length;
1085 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001086 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1087 PyObject_DEL(_PyUnicode_WSTR(unicode));
1088 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001089 if (!PyUnicode_IS_ASCII(unicode))
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001091 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001092#ifdef Py_DEBUG
1093 unicode_fill_invalid(unicode, old_length);
1094#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1096 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098 return unicode;
1099}
1100
Alexander Belopolsky40018472011-02-26 01:02:56 +00001101static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103{
Victor Stinner95663112011-10-04 01:03:50 +02001104 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001108
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109 if (PyUnicode_IS_READY(unicode)) {
1110 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001113#ifdef Py_DEBUG
1114 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1115#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116
1117 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001118 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1120 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121
1122 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1123 PyErr_NoMemory();
1124 return -1;
1125 }
1126 new_size = (length + 1) * char_size;
1127
Victor Stinner7a9105a2011-12-12 00:13:42 +01001128 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1129 {
1130 PyObject_DEL(_PyUnicode_UTF8(unicode));
1131 _PyUnicode_UTF8(unicode) = NULL;
1132 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1133 }
1134
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 data = (PyObject *)PyObject_REALLOC(data, new_size);
1136 if (data == NULL) {
1137 PyErr_NoMemory();
1138 return -1;
1139 }
1140 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001141 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001142 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001143 _PyUnicode_WSTR_LENGTH(unicode) = length;
1144 }
1145 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001146 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_UTF8_LENGTH(unicode) = length;
1148 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 _PyUnicode_LENGTH(unicode) = length;
1150 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001151#ifdef Py_DEBUG
1152 unicode_fill_invalid(unicode, old_length);
1153#endif
Victor Stinner95663112011-10-04 01:03:50 +02001154 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001155 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001158 }
Victor Stinner95663112011-10-04 01:03:50 +02001159 assert(_PyUnicode_WSTR(unicode) != NULL);
1160
1161 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001162 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001166 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001167 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001169 if (!wstr) {
1170 PyErr_NoMemory();
1171 return -1;
1172 }
1173 _PyUnicode_WSTR(unicode) = wstr;
1174 _PyUnicode_WSTR(unicode)[length] = 0;
1175 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001176 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return 0;
1178}
1179
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180static PyObject*
1181resize_copy(PyObject *unicode, Py_ssize_t length)
1182{
1183 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001184 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001187 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188
1189 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1190 if (copy == NULL)
1191 return NULL;
1192
1193 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001194 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001196 }
1197 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001198 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001199
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001200 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 if (w == NULL)
1202 return NULL;
1203 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1204 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001205 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001206 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001207 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 }
1209}
1210
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001212 Ux0000 terminated; some code (e.g. new_identifier)
1213 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214
1215 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218*/
1219
Alexander Belopolsky40018472011-02-26 01:02:56 +00001220static PyUnicodeObject *
1221_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001223 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
Thomas Wouters477c8d52006-05-27 19:21:47 +00001226 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001227 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001228 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 }
1230
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001231 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001232 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001233 return (PyUnicodeObject *)PyErr_NoMemory();
1234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 if (length < 0) {
1236 PyErr_SetString(PyExc_SystemError,
1237 "Negative size passed to _PyUnicode_New");
1238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 }
1240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1242 if (unicode == NULL)
1243 return NULL;
1244 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001245
1246 _PyUnicode_WSTR_LENGTH(unicode) = length;
1247 _PyUnicode_HASH(unicode) = -1;
1248 _PyUnicode_STATE(unicode).interned = 0;
1249 _PyUnicode_STATE(unicode).kind = 0;
1250 _PyUnicode_STATE(unicode).compact = 0;
1251 _PyUnicode_STATE(unicode).ready = 0;
1252 _PyUnicode_STATE(unicode).ascii = 0;
1253 _PyUnicode_DATA_ANY(unicode) = NULL;
1254 _PyUnicode_LENGTH(unicode) = 0;
1255 _PyUnicode_UTF8(unicode) = NULL;
1256 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1259 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001260 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001261 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001262 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264
Jeremy Hyltond8082792003-09-16 19:41:39 +00001265 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001266 * the caller fails before initializing str -- unicode_resize()
1267 * reads str[0], and the Keep-Alive optimization can keep memory
1268 * allocated for str alive across a call to unicode_dealloc(unicode).
1269 * We don't want unicode_resize to read uninitialized memory in
1270 * that case.
1271 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 _PyUnicode_WSTR(unicode)[0] = 0;
1273 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001274
Victor Stinner7931d9a2011-11-04 00:22:48 +01001275 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 return unicode;
1277}
1278
Victor Stinnerf42dc442011-10-02 23:33:16 +02001279static const char*
1280unicode_kind_name(PyObject *unicode)
1281{
Victor Stinner42dfd712011-10-03 14:41:45 +02001282 /* don't check consistency: unicode_kind_name() is called from
1283 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 if (!PyUnicode_IS_COMPACT(unicode))
1285 {
1286 if (!PyUnicode_IS_READY(unicode))
1287 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001288 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 {
1290 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001291 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001292 return "legacy ascii";
1293 else
1294 return "legacy latin1";
1295 case PyUnicode_2BYTE_KIND:
1296 return "legacy UCS2";
1297 case PyUnicode_4BYTE_KIND:
1298 return "legacy UCS4";
1299 default:
1300 return "<legacy invalid kind>";
1301 }
1302 }
1303 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001304 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001305 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001306 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001307 return "ascii";
1308 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001309 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001310 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001311 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001312 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001313 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001314 default:
1315 return "<invalid compact kind>";
1316 }
1317}
1318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001321const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001322 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001323 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324}
1325
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001326const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001327 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 return _PyUnicode_COMPACT_DATA(unicode);
1329}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001330const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001331 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001332 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1334 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1335 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1336 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1337 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1338 return PyUnicode_DATA(unicode);
1339}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001340
1341void
1342_PyUnicode_Dump(PyObject *op)
1343{
1344 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001345 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1346 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001347 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001348
Victor Stinnera849a4b2011-10-03 12:12:11 +02001349 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001350 {
1351 if (ascii->state.ascii)
1352 data = (ascii + 1);
1353 else
1354 data = (compact + 1);
1355 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001356 else
1357 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001358 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001359
Victor Stinnera849a4b2011-10-03 12:12:11 +02001360 if (ascii->wstr == data)
1361 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001362 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001363
Victor Stinnera3b334d2011-10-03 13:53:37 +02001364 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001365 printf(" (%zu), ", compact->wstr_length);
1366 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001367 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001368 }
1369 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001370 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001371 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001372}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373#endif
1374
Victor Stinner91698d82020-06-25 14:07:40 +02001375static int
1376unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1377{
1378 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1379 // optimized to always use state->empty_string without having to check if
1380 // it is NULL or not.
1381 PyObject *empty = PyUnicode_New(1, 0);
1382 if (empty == NULL) {
1383 return -1;
1384 }
1385 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1386 _PyUnicode_LENGTH(empty) = 0;
1387 assert(_PyUnicode_CheckConsistency(empty, 1));
1388
1389 assert(state->empty_string == NULL);
1390 state->empty_string = empty;
1391 return 0;
1392}
1393
1394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395PyObject *
1396PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1397{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001398 /* Optimization for empty strings */
1399 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001400 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001401 }
1402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 PyObject *obj;
1404 PyCompactUnicodeObject *unicode;
1405 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001406 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001407 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 Py_ssize_t char_size;
1409 Py_ssize_t struct_size;
1410
Victor Stinner9e9d6892011-10-04 01:02:02 +02001411 is_ascii = 0;
1412 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 struct_size = sizeof(PyCompactUnicodeObject);
1414 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001415 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 char_size = 1;
1417 is_ascii = 1;
1418 struct_size = sizeof(PyASCIIObject);
1419 }
1420 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001421 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 char_size = 1;
1423 }
1424 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001425 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 char_size = 2;
1427 if (sizeof(wchar_t) == 2)
1428 is_sharing = 1;
1429 }
1430 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001431 if (maxchar > MAX_UNICODE) {
1432 PyErr_SetString(PyExc_SystemError,
1433 "invalid maximum character passed to PyUnicode_New");
1434 return NULL;
1435 }
Victor Stinner8f825062012-04-27 13:55:39 +02001436 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 char_size = 4;
1438 if (sizeof(wchar_t) == 4)
1439 is_sharing = 1;
1440 }
1441
1442 /* Ensure we won't overflow the size. */
1443 if (size < 0) {
1444 PyErr_SetString(PyExc_SystemError,
1445 "Negative size passed to PyUnicode_New");
1446 return NULL;
1447 }
1448 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1449 return PyErr_NoMemory();
1450
1451 /* Duplicated allocation code from _PyObject_New() instead of a call to
1452 * PyObject_New() so we are able to allocate space for the object and
1453 * it's data buffer.
1454 */
1455 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001456 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001458 }
1459 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460
1461 unicode = (PyCompactUnicodeObject *)obj;
1462 if (is_ascii)
1463 data = ((PyASCIIObject*)obj) + 1;
1464 else
1465 data = unicode + 1;
1466 _PyUnicode_LENGTH(unicode) = size;
1467 _PyUnicode_HASH(unicode) = -1;
1468 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001469 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 _PyUnicode_STATE(unicode).compact = 1;
1471 _PyUnicode_STATE(unicode).ready = 1;
1472 _PyUnicode_STATE(unicode).ascii = is_ascii;
1473 if (is_ascii) {
1474 ((char*)data)[size] = 0;
1475 _PyUnicode_WSTR(unicode) = NULL;
1476 }
Victor Stinner8f825062012-04-27 13:55:39 +02001477 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 ((char*)data)[size] = 0;
1479 _PyUnicode_WSTR(unicode) = NULL;
1480 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001482 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 else {
1485 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001486 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001487 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001489 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 ((Py_UCS4*)data)[size] = 0;
1491 if (is_sharing) {
1492 _PyUnicode_WSTR_LENGTH(unicode) = size;
1493 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1494 }
1495 else {
1496 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1497 _PyUnicode_WSTR(unicode) = NULL;
1498 }
1499 }
Victor Stinner8f825062012-04-27 13:55:39 +02001500#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001501 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001502#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001503 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 return obj;
1505}
1506
1507#if SIZEOF_WCHAR_T == 2
1508/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1509 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001510 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511
1512 This function assumes that unicode can hold one more code point than wstr
1513 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001514static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001516 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517{
1518 const wchar_t *iter;
1519 Py_UCS4 *ucs4_out;
1520
Victor Stinner910337b2011-10-03 03:20:16 +02001521 assert(unicode != NULL);
1522 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1524 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1525
1526 for (iter = begin; iter < end; ) {
1527 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1528 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001529 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1530 && (iter+1) < end
1531 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532 {
Victor Stinner551ac952011-11-29 22:58:13 +01001533 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 iter += 2;
1535 }
1536 else {
1537 *ucs4_out++ = *iter;
1538 iter++;
1539 }
1540 }
1541 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1542 _PyUnicode_GET_LENGTH(unicode)));
1543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544}
1545#endif
1546
Victor Stinnercd9950f2011-10-02 00:34:53 +02001547static int
Victor Stinner488fa492011-12-12 00:01:39 +01001548unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001549{
Victor Stinner488fa492011-12-12 00:01:39 +01001550 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001551 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001552 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001553 return -1;
1554 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001555 return 0;
1556}
1557
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558static int
1559_copy_characters(PyObject *to, Py_ssize_t to_start,
1560 PyObject *from, Py_ssize_t from_start,
1561 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001563 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001564 const void *from_data;
1565 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566
Victor Stinneree4544c2012-05-09 22:24:08 +02001567 assert(0 <= how_many);
1568 assert(0 <= from_start);
1569 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001571 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001572 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573
Victor Stinnerd3f08822012-05-29 12:57:52 +02001574 assert(PyUnicode_Check(to));
1575 assert(PyUnicode_IS_READY(to));
1576 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1577
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001578 if (how_many == 0)
1579 return 0;
1580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001582 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001584 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585
Victor Stinnerf1852262012-06-16 16:38:26 +02001586#ifdef Py_DEBUG
1587 if (!check_maxchar
1588 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1589 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001590 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001591 Py_UCS4 ch;
1592 Py_ssize_t i;
1593 for (i=0; i < how_many; i++) {
1594 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1595 assert(ch <= to_maxchar);
1596 }
1597 }
1598#endif
1599
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001600 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001601 if (check_maxchar
1602 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1603 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001604 /* Writing Latin-1 characters into an ASCII string requires to
1605 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001606 Py_UCS4 max_char;
1607 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001608 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001609 if (max_char >= 128)
1610 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001611 }
Christian Heimesf051e432016-09-13 20:22:02 +02001612 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001613 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001614 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001615 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001616 else if (from_kind == PyUnicode_1BYTE_KIND
1617 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001618 {
1619 _PyUnicode_CONVERT_BYTES(
1620 Py_UCS1, Py_UCS2,
1621 PyUnicode_1BYTE_DATA(from) + from_start,
1622 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1623 PyUnicode_2BYTE_DATA(to) + to_start
1624 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001625 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001626 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001627 && to_kind == PyUnicode_4BYTE_KIND)
1628 {
1629 _PyUnicode_CONVERT_BYTES(
1630 Py_UCS1, Py_UCS4,
1631 PyUnicode_1BYTE_DATA(from) + from_start,
1632 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1633 PyUnicode_4BYTE_DATA(to) + to_start
1634 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001635 }
1636 else if (from_kind == PyUnicode_2BYTE_KIND
1637 && to_kind == PyUnicode_4BYTE_KIND)
1638 {
1639 _PyUnicode_CONVERT_BYTES(
1640 Py_UCS2, Py_UCS4,
1641 PyUnicode_2BYTE_DATA(from) + from_start,
1642 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1643 PyUnicode_4BYTE_DATA(to) + to_start
1644 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001645 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001646 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001647 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1648
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001649 if (!check_maxchar) {
1650 if (from_kind == PyUnicode_2BYTE_KIND
1651 && to_kind == PyUnicode_1BYTE_KIND)
1652 {
1653 _PyUnicode_CONVERT_BYTES(
1654 Py_UCS2, Py_UCS1,
1655 PyUnicode_2BYTE_DATA(from) + from_start,
1656 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1657 PyUnicode_1BYTE_DATA(to) + to_start
1658 );
1659 }
1660 else if (from_kind == PyUnicode_4BYTE_KIND
1661 && to_kind == PyUnicode_1BYTE_KIND)
1662 {
1663 _PyUnicode_CONVERT_BYTES(
1664 Py_UCS4, Py_UCS1,
1665 PyUnicode_4BYTE_DATA(from) + from_start,
1666 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1667 PyUnicode_1BYTE_DATA(to) + to_start
1668 );
1669 }
1670 else if (from_kind == PyUnicode_4BYTE_KIND
1671 && to_kind == PyUnicode_2BYTE_KIND)
1672 {
1673 _PyUnicode_CONVERT_BYTES(
1674 Py_UCS4, Py_UCS2,
1675 PyUnicode_4BYTE_DATA(from) + from_start,
1676 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1677 PyUnicode_2BYTE_DATA(to) + to_start
1678 );
1679 }
1680 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001681 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001682 }
1683 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001684 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001685 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001686 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001687 Py_ssize_t i;
1688
Victor Stinnera0702ab2011-09-29 14:14:38 +02001689 for (i=0; i < how_many; i++) {
1690 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001691 if (ch > to_maxchar)
1692 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001693 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1694 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001695 }
1696 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 return 0;
1698}
1699
Victor Stinnerd3f08822012-05-29 12:57:52 +02001700void
1701_PyUnicode_FastCopyCharacters(
1702 PyObject *to, Py_ssize_t to_start,
1703 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001704{
1705 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1706}
1707
1708Py_ssize_t
1709PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1710 PyObject *from, Py_ssize_t from_start,
1711 Py_ssize_t how_many)
1712{
1713 int err;
1714
1715 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1716 PyErr_BadInternalCall();
1717 return -1;
1718 }
1719
Benjamin Petersonbac79492012-01-14 13:34:47 -05001720 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001721 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001722 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001723 return -1;
1724
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001725 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001726 PyErr_SetString(PyExc_IndexError, "string index out of range");
1727 return -1;
1728 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001729 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001730 PyErr_SetString(PyExc_IndexError, "string index out of range");
1731 return -1;
1732 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001733 if (how_many < 0) {
1734 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1735 return -1;
1736 }
1737 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001738 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1739 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001740 "Cannot write %zi characters at %zi "
1741 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001742 how_many, to_start, PyUnicode_GET_LENGTH(to));
1743 return -1;
1744 }
1745
1746 if (how_many == 0)
1747 return 0;
1748
Victor Stinner488fa492011-12-12 00:01:39 +01001749 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001750 return -1;
1751
1752 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1753 if (err) {
1754 PyErr_Format(PyExc_SystemError,
1755 "Cannot copy %s characters "
1756 "into a string of %s characters",
1757 unicode_kind_name(from),
1758 unicode_kind_name(to));
1759 return -1;
1760 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001761 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762}
1763
Victor Stinner17222162011-09-28 22:15:37 +02001764/* Find the maximum code point and count the number of surrogate pairs so a
1765 correct string length can be computed before converting a string to UCS4.
1766 This function counts single surrogates as a character and not as a pair.
1767
1768 Return 0 on success, or -1 on error. */
1769static int
1770find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1771 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772{
1773 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001774 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775
Victor Stinnerc53be962011-10-02 21:33:54 +02001776 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 *num_surrogates = 0;
1778 *maxchar = 0;
1779
1780 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001782 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1783 && (iter+1) < end
1784 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1785 {
1786 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1787 ++(*num_surrogates);
1788 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 }
1790 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001792 {
1793 ch = *iter;
1794 iter++;
1795 }
1796 if (ch > *maxchar) {
1797 *maxchar = ch;
1798 if (*maxchar > MAX_UNICODE) {
1799 PyErr_Format(PyExc_ValueError,
1800 "character U+%x is not in range [U+0000; U+10ffff]",
1801 ch);
1802 return -1;
1803 }
1804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 }
1806 return 0;
1807}
1808
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001809int
1810_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811{
1812 wchar_t *end;
1813 Py_UCS4 maxchar = 0;
1814 Py_ssize_t num_surrogates;
1815#if SIZEOF_WCHAR_T == 2
1816 Py_ssize_t length_wo_surrogates;
1817#endif
1818
Georg Brandl7597add2011-10-05 16:36:47 +02001819 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 strings were created using _PyObject_New() and where no canonical
1821 representation (the str field) has been set yet aka strings
1822 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001823 assert(_PyUnicode_CHECK(unicode));
1824 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001826 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001827 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001828 /* Actually, it should neither be interned nor be anything else: */
1829 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001832 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001833 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835
1836 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1838 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 PyErr_NoMemory();
1840 return -1;
1841 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001842 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843 _PyUnicode_WSTR(unicode), end,
1844 PyUnicode_1BYTE_DATA(unicode));
1845 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1846 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1847 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1848 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001849 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001850 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001851 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001852 }
1853 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001854 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001855 _PyUnicode_UTF8(unicode) = NULL;
1856 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 }
1858 PyObject_FREE(_PyUnicode_WSTR(unicode));
1859 _PyUnicode_WSTR(unicode) = NULL;
1860 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1861 }
1862 /* In this case we might have to convert down from 4-byte native
1863 wchar_t to 2-byte unicode. */
1864 else if (maxchar < 65536) {
1865 assert(num_surrogates == 0 &&
1866 "FindMaxCharAndNumSurrogatePairs() messed up");
1867
Victor Stinner506f5922011-09-28 22:34:18 +02001868#if SIZEOF_WCHAR_T == 2
1869 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001870 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001871 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1872 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1873 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001874 _PyUnicode_UTF8(unicode) = NULL;
1875 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001876#else
1877 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001878 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001879 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001880 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001881 PyErr_NoMemory();
1882 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 }
Victor Stinner506f5922011-09-28 22:34:18 +02001884 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1885 _PyUnicode_WSTR(unicode), end,
1886 PyUnicode_2BYTE_DATA(unicode));
1887 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1888 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1889 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001890 _PyUnicode_UTF8(unicode) = NULL;
1891 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001892 PyObject_FREE(_PyUnicode_WSTR(unicode));
1893 _PyUnicode_WSTR(unicode) = NULL;
1894 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1895#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 }
1897 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1898 else {
1899#if SIZEOF_WCHAR_T == 2
1900 /* in case the native representation is 2-bytes, we need to allocate a
1901 new normalized 4-byte version. */
1902 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001903 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1904 PyErr_NoMemory();
1905 return -1;
1906 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001907 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1908 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 PyErr_NoMemory();
1910 return -1;
1911 }
1912 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1913 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001914 _PyUnicode_UTF8(unicode) = NULL;
1915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001916 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1917 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001918 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 PyObject_FREE(_PyUnicode_WSTR(unicode));
1920 _PyUnicode_WSTR(unicode) = NULL;
1921 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1922#else
1923 assert(num_surrogates == 0);
1924
Victor Stinnerc3c74152011-10-02 20:39:55 +02001925 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001927 _PyUnicode_UTF8(unicode) = NULL;
1928 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1930#endif
1931 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1932 }
1933 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001934 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 return 0;
1936}
1937
Alexander Belopolsky40018472011-02-26 01:02:56 +00001938static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001939unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940{
Walter Dörwald16807132007-05-25 13:52:07 +00001941 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 case SSTATE_NOT_INTERNED:
1943 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001944
Benjamin Peterson29060642009-01-31 22:14:21 +00001945 case SSTATE_INTERNED_MORTAL:
Victor Stinner607b1022020-05-05 18:50:30 +02001946#ifdef INTERNED_STRINGS
Victor Stinner3549ca32020-07-03 16:59:12 +02001947 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1948 references (key and value) which were ignored by
1949 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1950 to prevent calling unicode_dealloc() again. Adjust refcnt after
1951 PyDict_DelItem(). */
1952 assert(Py_REFCNT(unicode) == 0);
1953 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001954 if (PyDict_DelItem(interned, unicode) != 0) {
1955 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1956 NULL);
1957 }
Victor Stinner3549ca32020-07-03 16:59:12 +02001958 assert(Py_REFCNT(unicode) == 1);
1959 Py_SET_REFCNT(unicode, 0);
Victor Stinner607b1022020-05-05 18:50:30 +02001960#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001961 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001962
Benjamin Peterson29060642009-01-31 22:14:21 +00001963 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1965 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001966
Benjamin Peterson29060642009-01-31 22:14:21 +00001967 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001968 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001969 }
1970
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001971 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001973 }
1974 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001975 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001976 }
1977 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001978 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001979 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001981 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982}
1983
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects held
   in the unicode state (the empty string, or the entry of the latin1[] table
   for its single character), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *cache = get_unicode_state();
    const PyASCIIObject *base = (const PyASCIIObject *)unicode;

    if (cache->empty_string == unicode) {
        return 1;
    }

    /* Strings still in the wchar_t representation, or whose length is not 1,
       are never looked up in latin1[]. */
    if (base->state.kind == PyUnicode_WCHAR_KIND || base->length != 1) {
        return 0;
    }

    Py_UCS4 first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && cache->latin1[first] == unicode) ? 1 : 0;
}
#endif
2003
Alexander Belopolsky40018472011-02-26 01:02:56 +00002004static int
Victor Stinner488fa492011-12-12 00:01:39 +01002005unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002006{
Victor Stinner488fa492011-12-12 00:01:39 +01002007 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002008 if (Py_REFCNT(unicode) != 1)
2009 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002010 if (_PyUnicode_HASH(unicode) != -1)
2011 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002012 if (PyUnicode_CHECK_INTERNED(unicode))
2013 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002014 if (!PyUnicode_CheckExact(unicode))
2015 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002016#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002017 /* singleton refcount is greater than 1 */
2018 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002019#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002020 return 1;
2021}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002022
Victor Stinnerfe226c02011-10-03 03:52:20 +02002023static int
2024unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2025{
2026 PyObject *unicode;
2027 Py_ssize_t old_length;
2028
2029 assert(p_unicode != NULL);
2030 unicode = *p_unicode;
2031
2032 assert(unicode != NULL);
2033 assert(PyUnicode_Check(unicode));
2034 assert(0 <= length);
2035
Victor Stinner910337b2011-10-03 03:20:16 +02002036 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002037 old_length = PyUnicode_WSTR_LENGTH(unicode);
2038 else
2039 old_length = PyUnicode_GET_LENGTH(unicode);
2040 if (old_length == length)
2041 return 0;
2042
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002043 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002044 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002045 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002046 return 0;
2047 }
2048
Victor Stinner488fa492011-12-12 00:01:39 +01002049 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002050 PyObject *copy = resize_copy(unicode, length);
2051 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002052 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002053 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002054 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002055 }
2056
Victor Stinnerfe226c02011-10-03 03:52:20 +02002057 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002058 PyObject *new_unicode = resize_compact(unicode, length);
2059 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002060 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002061 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002062 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002063 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002064 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002065}
2066
Alexander Belopolsky40018472011-02-26 01:02:56 +00002067int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002068PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002069{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002070 PyObject *unicode;
2071 if (p_unicode == NULL) {
2072 PyErr_BadInternalCall();
2073 return -1;
2074 }
2075 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002076 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002077 {
2078 PyErr_BadInternalCall();
2079 return -1;
2080 }
2081 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002082}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002083
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002084/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002085
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002086 WARNING: The function doesn't copy the terminating null character and
2087 doesn't check the maximum character (may write a latin1 character in an
2088 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002089static void
2090unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2091 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002092{
2093 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002094 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002095 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002096
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002097 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002098 switch (kind) {
2099 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002100#ifdef Py_DEBUG
2101 if (PyUnicode_IS_ASCII(unicode)) {
2102 Py_UCS4 maxchar = ucs1lib_find_max_char(
2103 (const Py_UCS1*)str,
2104 (const Py_UCS1*)str + len);
2105 assert(maxchar < 128);
2106 }
2107#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002108 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002109 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002110 }
2111 case PyUnicode_2BYTE_KIND: {
2112 Py_UCS2 *start = (Py_UCS2 *)data + index;
2113 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002114
Victor Stinner184252a2012-06-16 02:57:41 +02002115 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002116 *ucs2 = (Py_UCS2)*str;
2117
2118 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002119 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002120 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002121 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002122 Py_UCS4 *start = (Py_UCS4 *)data + index;
2123 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002124
Victor Stinner184252a2012-06-16 02:57:41 +02002125 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002126 *ucs4 = (Py_UCS4)*str;
2127
2128 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002129 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002130 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002131 default:
2132 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002133 }
2134}
2135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002137get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002139 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002140
Victor Stinner2f9ada92020-06-24 02:22:21 +02002141 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002142 if (unicode) {
2143 Py_INCREF(unicode);
2144 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 }
Victor Stinner607b1022020-05-05 18:50:30 +02002146
2147 unicode = PyUnicode_New(1, ch);
2148 if (!unicode) {
2149 return NULL;
2150 }
2151
2152 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2153 assert(_PyUnicode_CheckConsistency(unicode, 1));
2154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002156 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002157 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158}
2159
Victor Stinner985a82a2014-01-03 12:53:47 +01002160static PyObject*
2161unicode_char(Py_UCS4 ch)
2162{
2163 PyObject *unicode;
2164
2165 assert(ch <= MAX_UNICODE);
2166
Victor Stinner2f9ada92020-06-24 02:22:21 +02002167 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002168 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002169 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002170
Victor Stinner985a82a2014-01-03 12:53:47 +01002171 unicode = PyUnicode_New(1, ch);
2172 if (unicode == NULL)
2173 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002174
2175 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2176 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002177 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002178 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002179 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2180 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2181 }
2182 assert(_PyUnicode_CheckConsistency(unicode, 1));
2183 return unicode;
2184}
2185
Alexander Belopolsky40018472011-02-26 01:02:56 +00002186PyObject *
2187PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002189 if (u == NULL) {
2190 if (size > 0) {
2191 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2192 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2193 "use PyUnicode_New() instead", 1) < 0) {
2194 return NULL;
2195 }
2196 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002197 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002198 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002199
2200 if (size < 0) {
2201 PyErr_BadInternalCall();
2202 return NULL;
2203 }
2204
2205 return PyUnicode_FromWideChar(u, size);
2206}
2207
2208PyObject *
2209PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2210{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002211 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 Py_UCS4 maxchar = 0;
2213 Py_ssize_t num_surrogates;
2214
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002215 if (u == NULL && size != 0) {
2216 PyErr_BadInternalCall();
2217 return NULL;
2218 }
2219
2220 if (size == -1) {
2221 size = wcslen(u);
2222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002224 /* If the Unicode data is known at construction time, we can apply
2225 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002228 if (size == 0)
2229 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 /* Single character Unicode objects in the Latin-1 range are
2232 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002233 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 return get_latin1_char((unsigned char)*u);
2235
2236 /* If not empty and not single character, copy the Unicode data
2237 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002238 if (find_maxchar_surrogates(u, u + size,
2239 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 return NULL;
2241
Victor Stinner8faf8212011-12-08 22:14:11 +01002242 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243 if (!unicode)
2244 return NULL;
2245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 switch (PyUnicode_KIND(unicode)) {
2247 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002248 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2250 break;
2251 case PyUnicode_2BYTE_KIND:
2252#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002253 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002255 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2257#endif
2258 break;
2259 case PyUnicode_4BYTE_KIND:
2260#if SIZEOF_WCHAR_T == 2
2261 /* This is the only case which has to process surrogates, thus
2262 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002263 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264#else
2265 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002266 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267#endif
2268 break;
2269 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002270 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002273 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274}
2275
Alexander Belopolsky40018472011-02-26 01:02:56 +00002276PyObject *
2277PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002278{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002279 if (size < 0) {
2280 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002281 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 return NULL;
2283 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002284 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002285 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002286 }
2287 else {
2288 if (size > 0) {
2289 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2290 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2291 "use PyUnicode_New() instead", 1) < 0) {
2292 return NULL;
2293 }
2294 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002295 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002296 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002297}
2298
Alexander Belopolsky40018472011-02-26 01:02:56 +00002299PyObject *
2300PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002301{
2302 size_t size = strlen(u);
2303 if (size > PY_SSIZE_T_MAX) {
2304 PyErr_SetString(PyExc_OverflowError, "input too long");
2305 return NULL;
2306 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002307 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002308}
2309
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002310PyObject *
2311_PyUnicode_FromId(_Py_Identifier *id)
2312{
Victor Stinner297257f2020-06-02 14:39:45 +02002313 if (id->object) {
2314 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002315 }
Victor Stinner297257f2020-06-02 14:39:45 +02002316
2317 PyObject *obj;
2318 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2319 strlen(id->string),
2320 NULL, NULL);
2321 if (!obj) {
2322 return NULL;
2323 }
2324 PyUnicode_InternInPlace(&obj);
2325
2326 assert(!id->next);
2327 id->object = obj;
2328 id->next = static_strings;
2329 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002330 return id->object;
2331}
2332
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002333static void
2334unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002335{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002336 _Py_Identifier *tmp, *s = static_strings;
2337 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002338 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002339 tmp = s->next;
2340 s->next = NULL;
2341 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002342 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002343 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002344}
2345
Benjamin Peterson0df54292012-03-26 14:50:32 -04002346/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002347
Victor Stinnerd3f08822012-05-29 12:57:52 +02002348PyObject*
2349_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002350{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002351 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002352 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002353 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002354#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002355 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002356#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002357 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002358 }
Victor Stinner785938e2011-12-11 20:09:03 +01002359 unicode = PyUnicode_New(size, 127);
2360 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002361 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002362 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2363 assert(_PyUnicode_CheckConsistency(unicode, 1));
2364 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002365}
2366
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002367static Py_UCS4
2368kind_maxchar_limit(unsigned int kind)
2369{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002370 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002371 case PyUnicode_1BYTE_KIND:
2372 return 0x80;
2373 case PyUnicode_2BYTE_KIND:
2374 return 0x100;
2375 case PyUnicode_4BYTE_KIND:
2376 return 0x10000;
2377 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002378 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002379 }
2380}
2381
Victor Stinner702c7342011-10-05 13:50:52 +02002382static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002383_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002386 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002387
Victor Stinner2f9ada92020-06-24 02:22:21 +02002388 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002389 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002390 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002391 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002392 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002393 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002394 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002395
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002396 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002397 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 if (!res)
2399 return NULL;
2400 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002401 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002403}
2404
Victor Stinnere57b1c02011-09-28 22:20:48 +02002405static PyObject*
2406_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407{
2408 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002409 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002410
Serhiy Storchaka678db842013-01-26 12:16:36 +02002411 if (size == 0)
2412 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002413 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002414 if (size == 1)
2415 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002416
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002417 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002418 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 if (!res)
2420 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002421 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002423 else {
2424 _PyUnicode_CONVERT_BYTES(
2425 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2426 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002427 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 return res;
2429}
2430
Victor Stinnere57b1c02011-09-28 22:20:48 +02002431static PyObject*
2432_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433{
2434 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002435 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002436
Serhiy Storchaka678db842013-01-26 12:16:36 +02002437 if (size == 0)
2438 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002439 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002440 if (size == 1)
2441 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002442
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002443 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002444 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 if (!res)
2446 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002447 if (max_char < 256)
2448 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2449 PyUnicode_1BYTE_DATA(res));
2450 else if (max_char < 0x10000)
2451 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2452 PyUnicode_2BYTE_DATA(res));
2453 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002455 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 return res;
2457}
2458
2459PyObject*
2460PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2461{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002462 if (size < 0) {
2463 PyErr_SetString(PyExc_ValueError, "size must be positive");
2464 return NULL;
2465 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002466 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002468 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002470 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002472 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002473 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002474 PyErr_SetString(PyExc_SystemError, "invalid kind");
2475 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477}
2478
Victor Stinnerece58de2012-04-23 23:36:38 +02002479Py_UCS4
2480_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2481{
2482 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002483 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002484
2485 assert(PyUnicode_IS_READY(unicode));
2486 assert(0 <= start);
2487 assert(end <= PyUnicode_GET_LENGTH(unicode));
2488 assert(start <= end);
2489
2490 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2491 return PyUnicode_MAX_CHAR_VALUE(unicode);
2492
2493 if (start == end)
2494 return 127;
2495
Victor Stinner94d558b2012-04-27 22:26:58 +02002496 if (PyUnicode_IS_ASCII(unicode))
2497 return 127;
2498
Victor Stinnerece58de2012-04-23 23:36:38 +02002499 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002500 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002501 endptr = (char *)startptr + end * kind;
2502 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002503 switch(kind) {
2504 case PyUnicode_1BYTE_KIND:
2505 return ucs1lib_find_max_char(startptr, endptr);
2506 case PyUnicode_2BYTE_KIND:
2507 return ucs2lib_find_max_char(startptr, endptr);
2508 case PyUnicode_4BYTE_KIND:
2509 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002510 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002511 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002512 }
2513}
2514
Victor Stinner25a4b292011-10-06 12:31:55 +02002515/* Ensure that a string uses the most efficient storage, if it is not the
2516 case: create a new string with of the right kind. Write NULL into *p_unicode
2517 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002518static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002519unicode_adjust_maxchar(PyObject **p_unicode)
2520{
2521 PyObject *unicode, *copy;
2522 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002523 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002524 unsigned int kind;
2525
2526 assert(p_unicode != NULL);
2527 unicode = *p_unicode;
2528 assert(PyUnicode_IS_READY(unicode));
2529 if (PyUnicode_IS_ASCII(unicode))
2530 return;
2531
2532 len = PyUnicode_GET_LENGTH(unicode);
2533 kind = PyUnicode_KIND(unicode);
2534 if (kind == PyUnicode_1BYTE_KIND) {
2535 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002536 max_char = ucs1lib_find_max_char(u, u + len);
2537 if (max_char >= 128)
2538 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002539 }
2540 else if (kind == PyUnicode_2BYTE_KIND) {
2541 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002542 max_char = ucs2lib_find_max_char(u, u + len);
2543 if (max_char >= 256)
2544 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002545 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002546 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002547 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002548 max_char = ucs4lib_find_max_char(u, u + len);
2549 if (max_char >= 0x10000)
2550 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002551 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002552 else
2553 Py_UNREACHABLE();
2554
Victor Stinner25a4b292011-10-06 12:31:55 +02002555 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002556 if (copy != NULL)
2557 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002558 Py_DECREF(unicode);
2559 *p_unicode = copy;
2560}
2561
Victor Stinner034f6cf2011-09-30 02:26:44 +02002562PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002563_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002564{
Victor Stinner87af4f22011-11-21 23:03:47 +01002565 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002566 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002567
Victor Stinner034f6cf2011-09-30 02:26:44 +02002568 if (!PyUnicode_Check(unicode)) {
2569 PyErr_BadInternalCall();
2570 return NULL;
2571 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002572 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002573 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002574
Victor Stinner87af4f22011-11-21 23:03:47 +01002575 length = PyUnicode_GET_LENGTH(unicode);
2576 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002577 if (!copy)
2578 return NULL;
2579 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2580
Christian Heimesf051e432016-09-13 20:22:02 +02002581 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002582 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002583 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002584 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002585}
2586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587
Victor Stinnerbc603d12011-10-02 01:00:40 +02002588/* Widen Unicode objects to larger buffers. Don't write terminating null
2589 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002591static void*
2592unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002594 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002595
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002596 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002597 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002598 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002599 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002600 if (!result)
2601 return PyErr_NoMemory();
2602 assert(skind == PyUnicode_1BYTE_KIND);
2603 _PyUnicode_CONVERT_BYTES(
2604 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002605 (const Py_UCS1 *)data,
2606 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002607 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002609 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002610 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002611 if (!result)
2612 return PyErr_NoMemory();
2613 if (skind == PyUnicode_2BYTE_KIND) {
2614 _PyUnicode_CONVERT_BYTES(
2615 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002616 (const Py_UCS2 *)data,
2617 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002618 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002620 else {
2621 assert(skind == PyUnicode_1BYTE_KIND);
2622 _PyUnicode_CONVERT_BYTES(
2623 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002624 (const Py_UCS1 *)data,
2625 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002626 result);
2627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002629 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002630 Py_UNREACHABLE();
2631 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633}
2634
2635static Py_UCS4*
2636as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2637 int copy_null)
2638{
2639 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002640 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641 Py_ssize_t len, targetlen;
2642 if (PyUnicode_READY(string) == -1)
2643 return NULL;
2644 kind = PyUnicode_KIND(string);
2645 data = PyUnicode_DATA(string);
2646 len = PyUnicode_GET_LENGTH(string);
2647 targetlen = len;
2648 if (copy_null)
2649 targetlen++;
2650 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002651 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 if (!target) {
2653 PyErr_NoMemory();
2654 return NULL;
2655 }
2656 }
2657 else {
2658 if (targetsize < targetlen) {
2659 PyErr_Format(PyExc_SystemError,
2660 "string is longer than the buffer");
2661 if (copy_null && 0 < targetsize)
2662 target[0] = 0;
2663 return NULL;
2664 }
2665 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002666 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002667 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002668 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002670 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002671 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002672 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2673 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002674 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002675 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002676 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002677 else {
2678 Py_UNREACHABLE();
2679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 if (copy_null)
2681 target[len] = 0;
2682 return target;
2683}
2684
2685Py_UCS4*
2686PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2687 int copy_null)
2688{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002689 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 PyErr_BadInternalCall();
2691 return NULL;
2692 }
2693 return as_ucs4(string, target, targetsize, copy_null);
2694}
2695
2696Py_UCS4*
2697PyUnicode_AsUCS4Copy(PyObject *string)
2698{
2699 return as_ucs4(string, NULL, 0, 1);
2700}
2701
/* Size of a char buffer large enough for the output of %lld or %p.

   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits; 53/22 is an upper bound for log10(256), so
   (SIZEOF_LONG_LONG*53-1)/22 + 1 is that ceiling.  One more character is
   added for the sign.  Because the digit bound is slightly pessimistic,
   the result also leaves room for the terminating NUL (e.g. 21 bytes for
   an 8-byte long long, whose longest text form has 20 characters). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002706
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002707static int
2708unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2709 Py_ssize_t width, Py_ssize_t precision)
2710{
2711 Py_ssize_t length, fill, arglen;
2712 Py_UCS4 maxchar;
2713
2714 if (PyUnicode_READY(str) == -1)
2715 return -1;
2716
2717 length = PyUnicode_GET_LENGTH(str);
2718 if ((precision == -1 || precision >= length)
2719 && width <= length)
2720 return _PyUnicodeWriter_WriteStr(writer, str);
2721
2722 if (precision != -1)
2723 length = Py_MIN(precision, length);
2724
2725 arglen = Py_MAX(length, width);
2726 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2727 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2728 else
2729 maxchar = writer->maxchar;
2730
2731 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2732 return -1;
2733
2734 if (width > length) {
2735 fill = width - length;
2736 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2737 return -1;
2738 writer->pos += fill;
2739 }
2740
2741 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2742 str, 0, length);
2743 writer->pos += length;
2744 return 0;
2745}
2746
2747static int
Victor Stinner998b8062018-09-12 00:23:25 +02002748unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002749 Py_ssize_t width, Py_ssize_t precision)
2750{
2751 /* UTF-8 */
2752 Py_ssize_t length;
2753 PyObject *unicode;
2754 int res;
2755
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002756 if (precision == -1) {
2757 length = strlen(str);
2758 }
2759 else {
2760 length = 0;
2761 while (length < precision && str[length]) {
2762 length++;
2763 }
2764 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2766 if (unicode == NULL)
2767 return -1;
2768
2769 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2770 Py_DECREF(unicode);
2771 return res;
2772}
2773
Victor Stinner96865452011-03-01 23:44:09 +00002774static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002775unicode_fromformat_arg(_PyUnicodeWriter *writer,
2776 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002777{
Victor Stinnere215d962012-10-06 23:03:36 +02002778 const char *p;
2779 Py_ssize_t len;
2780 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002781 Py_ssize_t width;
2782 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002783 int longflag;
2784 int longlongflag;
2785 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002786 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002787
2788 p = f;
2789 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002790 zeropad = 0;
2791 if (*f == '0') {
2792 zeropad = 1;
2793 f++;
2794 }
Victor Stinner96865452011-03-01 23:44:09 +00002795
2796 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002797 width = -1;
2798 if (Py_ISDIGIT((unsigned)*f)) {
2799 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002800 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002801 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002802 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002803 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002804 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002805 return NULL;
2806 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002807 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002808 f++;
2809 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002810 }
2811 precision = -1;
2812 if (*f == '.') {
2813 f++;
2814 if (Py_ISDIGIT((unsigned)*f)) {
2815 precision = (*f - '0');
2816 f++;
2817 while (Py_ISDIGIT((unsigned)*f)) {
2818 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2819 PyErr_SetString(PyExc_ValueError,
2820 "precision too big");
2821 return NULL;
2822 }
2823 precision = (precision * 10) + (*f - '0');
2824 f++;
2825 }
2826 }
Victor Stinner96865452011-03-01 23:44:09 +00002827 if (*f == '%') {
2828 /* "%.3%s" => f points to "3" */
2829 f--;
2830 }
2831 }
2832 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002833 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002834 f--;
2835 }
Victor Stinner96865452011-03-01 23:44:09 +00002836
2837 /* Handle %ld, %lu, %lld and %llu. */
2838 longflag = 0;
2839 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002840 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002841 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002842 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002843 longflag = 1;
2844 ++f;
2845 }
Victor Stinner96865452011-03-01 23:44:09 +00002846 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002847 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002848 longlongflag = 1;
2849 f += 2;
2850 }
Victor Stinner96865452011-03-01 23:44:09 +00002851 }
2852 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002853 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002854 size_tflag = 1;
2855 ++f;
2856 }
Victor Stinnere215d962012-10-06 23:03:36 +02002857
2858 if (f[1] == '\0')
2859 writer->overallocate = 0;
2860
2861 switch (*f) {
2862 case 'c':
2863 {
2864 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002865 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002866 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002867 "character argument not in range(0x110000)");
2868 return NULL;
2869 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002870 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002871 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002872 break;
2873 }
2874
2875 case 'i':
2876 case 'd':
2877 case 'u':
2878 case 'x':
2879 {
2880 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002881 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002882 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002883
2884 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002885 if (longflag) {
2886 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2887 }
2888 else if (longlongflag) {
2889 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2890 }
2891 else if (size_tflag) {
2892 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2893 }
2894 else {
2895 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2896 }
Victor Stinnere215d962012-10-06 23:03:36 +02002897 }
2898 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002899 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002900 }
2901 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002902 if (longflag) {
2903 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2904 }
2905 else if (longlongflag) {
2906 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2907 }
2908 else if (size_tflag) {
2909 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2910 }
2911 else {
2912 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2913 }
Victor Stinnere215d962012-10-06 23:03:36 +02002914 }
2915 assert(len >= 0);
2916
Victor Stinnere215d962012-10-06 23:03:36 +02002917 if (precision < len)
2918 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002919
2920 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002921 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2922 return NULL;
2923
Victor Stinnere215d962012-10-06 23:03:36 +02002924 if (width > precision) {
2925 Py_UCS4 fillchar;
2926 fill = width - precision;
2927 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002928 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2929 return NULL;
2930 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002931 }
Victor Stinner15a11362012-10-06 23:48:20 +02002932 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002933 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002934 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2935 return NULL;
2936 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002937 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002938
Victor Stinner4a587072013-11-19 12:54:53 +01002939 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2940 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002941 break;
2942 }
2943
2944 case 'p':
2945 {
2946 char number[MAX_LONG_LONG_CHARS];
2947
2948 len = sprintf(number, "%p", va_arg(*vargs, void*));
2949 assert(len >= 0);
2950
2951 /* %p is ill-defined: ensure leading 0x. */
2952 if (number[1] == 'X')
2953 number[1] = 'x';
2954 else if (number[1] != 'x') {
2955 memmove(number + 2, number,
2956 strlen(number) + 1);
2957 number[0] = '0';
2958 number[1] = 'x';
2959 len += 2;
2960 }
2961
Victor Stinner4a587072013-11-19 12:54:53 +01002962 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002963 return NULL;
2964 break;
2965 }
2966
2967 case 's':
2968 {
2969 /* UTF-8 */
2970 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002971 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002972 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002973 break;
2974 }
2975
2976 case 'U':
2977 {
2978 PyObject *obj = va_arg(*vargs, PyObject *);
2979 assert(obj && _PyUnicode_CHECK(obj));
2980
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002981 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002982 return NULL;
2983 break;
2984 }
2985
2986 case 'V':
2987 {
2988 PyObject *obj = va_arg(*vargs, PyObject *);
2989 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002990 if (obj) {
2991 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002992 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002993 return NULL;
2994 }
2995 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002996 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002997 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002998 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002999 }
3000 break;
3001 }
3002
3003 case 'S':
3004 {
3005 PyObject *obj = va_arg(*vargs, PyObject *);
3006 PyObject *str;
3007 assert(obj);
3008 str = PyObject_Str(obj);
3009 if (!str)
3010 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003011 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003012 Py_DECREF(str);
3013 return NULL;
3014 }
3015 Py_DECREF(str);
3016 break;
3017 }
3018
3019 case 'R':
3020 {
3021 PyObject *obj = va_arg(*vargs, PyObject *);
3022 PyObject *repr;
3023 assert(obj);
3024 repr = PyObject_Repr(obj);
3025 if (!repr)
3026 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003027 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003028 Py_DECREF(repr);
3029 return NULL;
3030 }
3031 Py_DECREF(repr);
3032 break;
3033 }
3034
3035 case 'A':
3036 {
3037 PyObject *obj = va_arg(*vargs, PyObject *);
3038 PyObject *ascii;
3039 assert(obj);
3040 ascii = PyObject_ASCII(obj);
3041 if (!ascii)
3042 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003043 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003044 Py_DECREF(ascii);
3045 return NULL;
3046 }
3047 Py_DECREF(ascii);
3048 break;
3049 }
3050
3051 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003052 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003053 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003054 break;
3055
3056 default:
3057 /* if we stumble upon an unknown formatting code, copy the rest
3058 of the format string to the output string. (we cannot just
3059 skip the code, since there's no way to know what's in the
3060 argument list) */
3061 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003062 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003063 return NULL;
3064 f = p+len;
3065 return f;
3066 }
3067
3068 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003069 return f;
3070}
3071
Walter Dörwaldd2034312007-05-18 16:29:38 +00003072PyObject *
3073PyUnicode_FromFormatV(const char *format, va_list vargs)
3074{
Victor Stinnere215d962012-10-06 23:03:36 +02003075 va_list vargs2;
3076 const char *f;
3077 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003078
Victor Stinner8f674cc2013-04-17 23:02:17 +02003079 _PyUnicodeWriter_Init(&writer);
3080 writer.min_length = strlen(format) + 100;
3081 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003082
Benjamin Peterson0c212142016-09-20 20:39:33 -07003083 // Copy varags to be able to pass a reference to a subfunction.
3084 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003085
3086 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003087 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003088 f = unicode_fromformat_arg(&writer, f, &vargs2);
3089 if (f == NULL)
3090 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003092 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003093 const char *p;
3094 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003095
Victor Stinnere215d962012-10-06 23:03:36 +02003096 p = f;
3097 do
3098 {
3099 if ((unsigned char)*p > 127) {
3100 PyErr_Format(PyExc_ValueError,
3101 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3102 "string, got a non-ASCII byte: 0x%02x",
3103 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003104 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003105 }
3106 p++;
3107 }
3108 while (*p != '\0' && *p != '%');
3109 len = p - f;
3110
3111 if (*p == '\0')
3112 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003113
3114 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003115 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003116
3117 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003118 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003119 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003120 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003121 return _PyUnicodeWriter_Finish(&writer);
3122
3123 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003124 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003125 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003126 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003127}
3128
Walter Dörwaldd2034312007-05-18 16:29:38 +00003129PyObject *
3130PyUnicode_FromFormat(const char *format, ...)
3131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003132 PyObject* ret;
3133 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003134
3135#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003136 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003137#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003138 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003139#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003140 ret = PyUnicode_FromFormatV(format, vargs);
3141 va_end(vargs);
3142 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003143}
3144
Serhiy Storchakac46db922018-10-23 22:58:24 +03003145static Py_ssize_t
3146unicode_get_widechar_size(PyObject *unicode)
3147{
3148 Py_ssize_t res;
3149
3150 assert(unicode != NULL);
3151 assert(_PyUnicode_CHECK(unicode));
3152
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003153#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchakac46db922018-10-23 22:58:24 +03003154 if (_PyUnicode_WSTR(unicode) != NULL) {
3155 return PyUnicode_WSTR_LENGTH(unicode);
3156 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003157#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003158 assert(PyUnicode_IS_READY(unicode));
3159
3160 res = _PyUnicode_LENGTH(unicode);
3161#if SIZEOF_WCHAR_T == 2
3162 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3163 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3164 const Py_UCS4 *end = s + res;
3165 for (; s < end; ++s) {
3166 if (*s > 0xFFFF) {
3167 ++res;
3168 }
3169 }
3170 }
3171#endif
3172 return res;
3173}
3174
3175static void
3176unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3177{
Serhiy Storchakac46db922018-10-23 22:58:24 +03003178 assert(unicode != NULL);
3179 assert(_PyUnicode_CHECK(unicode));
3180
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003181#if USE_UNICODE_WCHAR_CACHE
3182 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003183 if (wstr != NULL) {
3184 memcpy(w, wstr, size * sizeof(wchar_t));
3185 return;
3186 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003187#else /* USE_UNICODE_WCHAR_CACHE */
3188 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3189 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3190 return;
3191 }
3192#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003193 assert(PyUnicode_IS_READY(unicode));
3194
3195 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3196 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3197 for (; size--; ++s, ++w) {
3198 *w = *s;
3199 }
3200 }
3201 else {
3202#if SIZEOF_WCHAR_T == 4
3203 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3204 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3205 for (; size--; ++s, ++w) {
3206 *w = *s;
3207 }
3208#else
3209 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3210 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3211 for (; size--; ++s, ++w) {
3212 Py_UCS4 ch = *s;
3213 if (ch > 0xFFFF) {
3214 assert(ch <= MAX_UNICODE);
3215 /* encode surrogate pair in this case */
3216 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3217 if (!size--)
3218 break;
3219 *w = Py_UNICODE_LOW_SURROGATE(ch);
3220 }
3221 else {
3222 *w = ch;
3223 }
3224 }
3225#endif
3226 }
3227}
3228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003229#ifdef HAVE_WCHAR_H
3230
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003231/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003232
Victor Stinnerd88d9832011-09-06 02:00:05 +02003233 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003234 character) required to convert the unicode object. Ignore size argument.
3235
Victor Stinnerd88d9832011-09-06 02:00:05 +02003236 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003237 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003238 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003239Py_ssize_t
3240PyUnicode_AsWideChar(PyObject *unicode,
3241 wchar_t *w,
3242 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003243{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003244 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003245
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003246 if (unicode == NULL) {
3247 PyErr_BadInternalCall();
3248 return -1;
3249 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003250 if (!PyUnicode_Check(unicode)) {
3251 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003252 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003253 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003254
3255 res = unicode_get_widechar_size(unicode);
3256 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003257 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003258 }
3259
3260 if (size > res) {
3261 size = res + 1;
3262 }
3263 else {
3264 res = size;
3265 }
3266 unicode_copy_as_widechar(unicode, w, size);
3267 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003268}
3269
Victor Stinner137c34c2010-09-29 10:25:54 +00003270wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003271PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003272 Py_ssize_t *size)
3273{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003274 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003275 Py_ssize_t buflen;
3276
3277 if (unicode == NULL) {
3278 PyErr_BadInternalCall();
3279 return NULL;
3280 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003281 if (!PyUnicode_Check(unicode)) {
3282 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003283 return NULL;
3284 }
3285
Serhiy Storchakac46db922018-10-23 22:58:24 +03003286 buflen = unicode_get_widechar_size(unicode);
3287 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003288 if (buffer == NULL) {
3289 PyErr_NoMemory();
3290 return NULL;
3291 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003292 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3293 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003294 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003295 }
3296 else if (wcslen(buffer) != (size_t)buflen) {
3297 PyMem_FREE(buffer);
3298 PyErr_SetString(PyExc_ValueError,
3299 "embedded null character");
3300 return NULL;
3301 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003302 return buffer;
3303}
3304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003305#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003307int
3308_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3309{
3310 wchar_t **p = (wchar_t **)ptr;
3311 if (obj == NULL) {
3312#if !USE_UNICODE_WCHAR_CACHE
3313 PyMem_Free(*p);
3314#endif /* USE_UNICODE_WCHAR_CACHE */
3315 *p = NULL;
3316 return 1;
3317 }
3318 if (PyUnicode_Check(obj)) {
3319#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003320 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3321 if (*p == NULL) {
3322 return 0;
3323 }
3324 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003325#else /* USE_UNICODE_WCHAR_CACHE */
3326 *p = PyUnicode_AsWideCharString(obj, NULL);
3327 if (*p == NULL) {
3328 return 0;
3329 }
3330 return Py_CLEANUP_SUPPORTED;
3331#endif /* USE_UNICODE_WCHAR_CACHE */
3332 }
3333 PyErr_Format(PyExc_TypeError,
3334 "argument must be str, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003335 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003336 return 0;
3337}
3338
3339int
3340_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3341{
3342 wchar_t **p = (wchar_t **)ptr;
3343 if (obj == NULL) {
3344#if !USE_UNICODE_WCHAR_CACHE
3345 PyMem_Free(*p);
3346#endif /* USE_UNICODE_WCHAR_CACHE */
3347 *p = NULL;
3348 return 1;
3349 }
3350 if (obj == Py_None) {
3351 *p = NULL;
3352 return 1;
3353 }
3354 if (PyUnicode_Check(obj)) {
3355#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003356 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3357 if (*p == NULL) {
3358 return 0;
3359 }
3360 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003361#else /* USE_UNICODE_WCHAR_CACHE */
3362 *p = PyUnicode_AsWideCharString(obj, NULL);
3363 if (*p == NULL) {
3364 return 0;
3365 }
3366 return Py_CLEANUP_SUPPORTED;
3367#endif /* USE_UNICODE_WCHAR_CACHE */
3368 }
3369 PyErr_Format(PyExc_TypeError,
3370 "argument must be str or None, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003371 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003372 return 0;
3373}
3374
Alexander Belopolsky40018472011-02-26 01:02:56 +00003375PyObject *
3376PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003377{
Victor Stinner8faf8212011-12-08 22:14:11 +01003378 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003379 PyErr_SetString(PyExc_ValueError,
3380 "chr() arg not in range(0x110000)");
3381 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003382 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003383
Victor Stinner985a82a2014-01-03 12:53:47 +01003384 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003385}
3386
Alexander Belopolsky40018472011-02-26 01:02:56 +00003387PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003388PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003390 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003392 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003393 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003394 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 Py_INCREF(obj);
3396 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003397 }
3398 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003399 /* For a Unicode subtype that's not a Unicode object,
3400 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003401 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003402 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003403 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003404 "Can't convert '%.100s' object to str implicitly",
3405 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003406 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003407}
3408
Alexander Belopolsky40018472011-02-26 01:02:56 +00003409PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003410PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003411 const char *encoding,
3412 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003413{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003414 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003415 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003416
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003418 PyErr_BadInternalCall();
3419 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003421
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003422 /* Decoding bytes objects is the most common case and should be fast */
3423 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003424 if (PyBytes_GET_SIZE(obj) == 0) {
3425 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3426 return NULL;
3427 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003428 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003429 }
3430 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003431 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3432 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003433 }
3434
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003435 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003436 PyErr_SetString(PyExc_TypeError,
3437 "decoding str is not supported");
3438 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003440
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003441 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3442 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3443 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003444 "decoding to str: need a bytes-like object, %.80s found",
3445 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003446 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003447 }
Tim Petersced69f82003-09-16 20:30:58 +00003448
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003449 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003450 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003451 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3452 return NULL;
3453 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003454 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003456
Serhiy Storchaka05997252013-01-26 12:14:02 +02003457 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003458 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003459 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460}
3461
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase. Any run
   of other characters that sits between two copied characters becomes a
   single '_'; such runs at the very start or end of the name are dropped.

   `lower_len` is the total size of the `lower` buffer, including the room
   needed for the terminating NUL.

   Return 1 on success, or 0 on error (the normalized name is longer than
   lower_len-1, or lower_len is 0). On error `lower` may hold a partial result
   that is not NUL-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out;
    char *out_end;
    int pending_sep = 0;

    assert(encoding != NULL);

    /* lower_len is unsigned: with an empty buffer, lower_len - 1 would wrap
       around and the bounds checks below would never trigger. */
    if (lower_len == 0) {
        return 0;
    }

    out = lower;
    /* Last slot of the buffer: reserved for the terminating NUL. */
    out_end = lower + (lower_len - 1);

    for (in = encoding; *in != '\0'; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if another
               name character follows. */
            pending_sep = 1;
            continue;
        }

        /* Emit one '_' for the preceding separator run, except when nothing
           has been written yet (leading separators are dropped). */
        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3510
Alexander Belopolsky40018472011-02-26 01:02:56 +00003511PyObject *
3512PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003513 Py_ssize_t size,
3514 const char *encoding,
3515 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003516{
3517 PyObject *buffer = NULL, *unicode;
3518 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003519 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3520
Victor Stinner22eb6892019-06-26 00:51:05 +02003521 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3522 return NULL;
3523 }
3524
Victor Stinnered076ed2019-06-26 01:49:32 +02003525 if (size == 0) {
3526 _Py_RETURN_UNICODE_EMPTY();
3527 }
3528
Victor Stinner942889a2016-09-05 15:40:10 -07003529 if (encoding == NULL) {
3530 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3531 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003532
Fred Drakee4315f52000-05-09 19:53:39 +00003533 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003534 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3535 char *lower = buflower;
3536
3537 /* Fast paths */
3538 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3539 lower += 3;
3540 if (*lower == '_') {
3541 /* Match "utf8" and "utf_8" */
3542 lower++;
3543 }
3544
3545 if (lower[0] == '8' && lower[1] == 0) {
3546 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3547 }
3548 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3549 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3550 }
3551 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3552 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3553 }
3554 }
3555 else {
3556 if (strcmp(lower, "ascii") == 0
3557 || strcmp(lower, "us_ascii") == 0) {
3558 return PyUnicode_DecodeASCII(s, size, errors);
3559 }
Steve Dowercc16be82016-09-08 10:35:16 -07003560 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003561 else if (strcmp(lower, "mbcs") == 0) {
3562 return PyUnicode_DecodeMBCS(s, size, errors);
3563 }
3564 #endif
3565 else if (strcmp(lower, "latin1") == 0
3566 || strcmp(lower, "latin_1") == 0
3567 || strcmp(lower, "iso_8859_1") == 0
3568 || strcmp(lower, "iso8859_1") == 0) {
3569 return PyUnicode_DecodeLatin1(s, size, errors);
3570 }
3571 }
Victor Stinner37296e82010-06-10 13:36:23 +00003572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573
3574 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003575 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003576 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003577 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003578 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579 if (buffer == NULL)
3580 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003581 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582 if (unicode == NULL)
3583 goto onError;
3584 if (!PyUnicode_Check(unicode)) {
3585 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003586 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003587 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003588 encoding,
3589 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 Py_DECREF(unicode);
3591 goto onError;
3592 }
3593 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003594 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003595
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597 Py_XDECREF(buffer);
3598 return NULL;
3599}
3600
Alexander Belopolsky40018472011-02-26 01:02:56 +00003601PyObject *
3602PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003603 const char *encoding,
3604 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003605{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003606 if (!PyUnicode_Check(unicode)) {
3607 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003608 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003609 }
3610
Serhiy Storchaka00939072016-10-27 21:05:49 +03003611 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3612 "PyUnicode_AsDecodedObject() is deprecated; "
3613 "use PyCodec_Decode() to decode from str", 1) < 0)
3614 return NULL;
3615
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003616 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003618
3619 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003620 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003621}
3622
Alexander Belopolsky40018472011-02-26 01:02:56 +00003623PyObject *
3624PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003625 const char *encoding,
3626 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003627{
3628 PyObject *v;
3629
3630 if (!PyUnicode_Check(unicode)) {
3631 PyErr_BadArgument();
3632 goto onError;
3633 }
3634
Serhiy Storchaka00939072016-10-27 21:05:49 +03003635 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3636 "PyUnicode_AsDecodedUnicode() is deprecated; "
3637 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3638 return NULL;
3639
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003640 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003642
3643 /* Decode via the codec registry */
3644 v = PyCodec_Decode(unicode, encoding, errors);
3645 if (v == NULL)
3646 goto onError;
3647 if (!PyUnicode_Check(v)) {
3648 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003649 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003650 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003651 encoding,
3652 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003653 Py_DECREF(v);
3654 goto onError;
3655 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003656 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003657
Benjamin Peterson29060642009-01-31 22:14:21 +00003658 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003659 return NULL;
3660}
3661
Alexander Belopolsky40018472011-02-26 01:02:56 +00003662PyObject *
3663PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003664 Py_ssize_t size,
3665 const char *encoding,
3666 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667{
3668 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003669
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003670 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3674 Py_DECREF(unicode);
3675 return v;
3676}
3677
Alexander Belopolsky40018472011-02-26 01:02:56 +00003678PyObject *
3679PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003680 const char *encoding,
3681 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003682{
3683 PyObject *v;
3684
3685 if (!PyUnicode_Check(unicode)) {
3686 PyErr_BadArgument();
3687 goto onError;
3688 }
3689
Serhiy Storchaka00939072016-10-27 21:05:49 +03003690 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3691 "PyUnicode_AsEncodedObject() is deprecated; "
3692 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3693 "or PyCodec_Encode() for generic encoding", 1) < 0)
3694 return NULL;
3695
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003696 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003698
3699 /* Encode via the codec registry */
3700 v = PyCodec_Encode(unicode, encoding, errors);
3701 if (v == NULL)
3702 goto onError;
3703 return v;
3704
Benjamin Peterson29060642009-01-31 22:14:21 +00003705 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003706 return NULL;
3707}
3708
Victor Stinner1b579672011-12-17 05:47:23 +01003709
Victor Stinner2cba6b82018-01-10 22:46:15 +01003710static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003711unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003712 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003713{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003714 Py_ssize_t wlen;
3715 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3716 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003717 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003718 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003719
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003720 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003721 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003722 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003723 return NULL;
3724 }
3725
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003726 char *str;
3727 size_t error_pos;
3728 const char *reason;
3729 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003730 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003731 PyMem_Free(wstr);
3732
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003733 if (res != 0) {
3734 if (res == -2) {
3735 PyObject *exc;
3736 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3737 "locale", unicode,
3738 (Py_ssize_t)error_pos,
3739 (Py_ssize_t)(error_pos+1),
3740 reason);
3741 if (exc != NULL) {
3742 PyCodec_StrictErrors(exc);
3743 Py_DECREF(exc);
3744 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003745 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003746 else if (res == -3) {
3747 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3748 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003749 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003750 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003751 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003752 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003753 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003754
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003755 PyObject *bytes = PyBytes_FromString(str);
3756 PyMem_RawFree(str);
3757 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003758}
3759
Victor Stinnerad158722010-10-27 00:25:46 +00003760PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003761PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3762{
Victor Stinner709d23d2019-05-02 14:56:30 -04003763 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3764 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003765}
3766
3767PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003768PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003769{
Victor Stinner81a7be32020-04-14 15:14:01 +02003770 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003771 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3772 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003773 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003774 fs_codec->error_handler,
3775 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003776 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003777#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003778 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003779 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003780 fs_codec->encoding,
3781 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003782 }
Victor Stinnerad158722010-10-27 00:25:46 +00003783#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003784 else {
3785 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3786 machinery is not ready and so cannot be used:
3787 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003788 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3789 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003790 assert(filesystem_errors != NULL);
3791 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3792 assert(errors != _Py_ERROR_UNKNOWN);
3793#ifdef _Py_FORCE_UTF8_FS_ENCODING
3794 return unicode_encode_utf8(unicode, errors, NULL);
3795#else
3796 return unicode_encode_locale(unicode, errors, 0);
3797#endif
3798 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003799}
3800
Alexander Belopolsky40018472011-02-26 01:02:56 +00003801PyObject *
3802PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003803 const char *encoding,
3804 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805{
3806 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003807 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003808
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 if (!PyUnicode_Check(unicode)) {
3810 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003811 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812 }
Fred Drakee4315f52000-05-09 19:53:39 +00003813
Victor Stinner22eb6892019-06-26 00:51:05 +02003814 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3815 return NULL;
3816 }
3817
Victor Stinner942889a2016-09-05 15:40:10 -07003818 if (encoding == NULL) {
3819 return _PyUnicode_AsUTF8String(unicode, errors);
3820 }
3821
Fred Drakee4315f52000-05-09 19:53:39 +00003822 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003823 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3824 char *lower = buflower;
3825
3826 /* Fast paths */
3827 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3828 lower += 3;
3829 if (*lower == '_') {
3830 /* Match "utf8" and "utf_8" */
3831 lower++;
3832 }
3833
3834 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003836 }
3837 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3838 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3839 }
3840 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3841 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3842 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003843 }
Victor Stinner942889a2016-09-05 15:40:10 -07003844 else {
3845 if (strcmp(lower, "ascii") == 0
3846 || strcmp(lower, "us_ascii") == 0) {
3847 return _PyUnicode_AsASCIIString(unicode, errors);
3848 }
Steve Dowercc16be82016-09-08 10:35:16 -07003849#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003850 else if (strcmp(lower, "mbcs") == 0) {
3851 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3852 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003853#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003854 else if (strcmp(lower, "latin1") == 0 ||
3855 strcmp(lower, "latin_1") == 0 ||
3856 strcmp(lower, "iso_8859_1") == 0 ||
3857 strcmp(lower, "iso8859_1") == 0) {
3858 return _PyUnicode_AsLatin1String(unicode, errors);
3859 }
3860 }
Victor Stinner37296e82010-06-10 13:36:23 +00003861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003862
3863 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003864 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003866 return NULL;
3867
3868 /* The normal path */
3869 if (PyBytes_Check(v))
3870 return v;
3871
3872 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003873 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003874 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003875 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003876
3877 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003878 "encoder %s returned bytearray instead of bytes; "
3879 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003880 encoding);
3881 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003882 Py_DECREF(v);
3883 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003884 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003885
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003886 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3887 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003888 Py_DECREF(v);
3889 return b;
3890 }
3891
3892 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003893 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003894 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003895 encoding,
3896 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003897 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003898 return NULL;
3899}
3900
Alexander Belopolsky40018472011-02-26 01:02:56 +00003901PyObject *
3902PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003903 const char *encoding,
3904 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003905{
3906 PyObject *v;
3907
3908 if (!PyUnicode_Check(unicode)) {
3909 PyErr_BadArgument();
3910 goto onError;
3911 }
3912
Serhiy Storchaka00939072016-10-27 21:05:49 +03003913 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3914 "PyUnicode_AsEncodedUnicode() is deprecated; "
3915 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3916 return NULL;
3917
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003918 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003919 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003920
3921 /* Encode via the codec registry */
3922 v = PyCodec_Encode(unicode, encoding, errors);
3923 if (v == NULL)
3924 goto onError;
3925 if (!PyUnicode_Check(v)) {
3926 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003927 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003928 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003929 encoding,
3930 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003931 Py_DECREF(v);
3932 goto onError;
3933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003935
Benjamin Peterson29060642009-01-31 22:14:21 +00003936 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937 return NULL;
3938}
3939
Victor Stinner2cba6b82018-01-10 22:46:15 +01003940static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003941unicode_decode_locale(const char *str, Py_ssize_t len,
3942 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003943{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003944 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3945 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003946 return NULL;
3947 }
3948
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003949 wchar_t *wstr;
3950 size_t wlen;
3951 const char *reason;
3952 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003953 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003954 if (res != 0) {
3955 if (res == -2) {
3956 PyObject *exc;
3957 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3958 "locale", str, len,
3959 (Py_ssize_t)wlen,
3960 (Py_ssize_t)(wlen + 1),
3961 reason);
3962 if (exc != NULL) {
3963 PyCodec_StrictErrors(exc);
3964 Py_DECREF(exc);
3965 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003966 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003967 else if (res == -3) {
3968 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3969 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003970 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003971 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003972 }
Victor Stinner2f197072011-12-17 07:08:30 +01003973 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003974 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003975
3976 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3977 PyMem_RawFree(wstr);
3978 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003979}
3980
3981PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003982PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3983 const char *errors)
3984{
Victor Stinner709d23d2019-05-02 14:56:30 -04003985 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3986 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003987}
3988
3989PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003990PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003991{
3992 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003993 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3994 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003995}
3996
3997
3998PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003999PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004000 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00004001 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4002}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004003
Christian Heimes5894ba72007-11-04 11:43:14 +00004004PyObject*
4005PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4006{
Victor Stinner81a7be32020-04-14 15:14:01 +02004007 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004008 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4009 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004010 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004011 fs_codec->error_handler,
4012 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004013 NULL);
4014 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004015#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004016 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004017 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004018 fs_codec->encoding,
4019 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004020 }
Victor Stinnerad158722010-10-27 00:25:46 +00004021#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004022 else {
4023 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4024 machinery is not ready and so cannot be used:
4025 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004026 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4027 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004028 assert(filesystem_errors != NULL);
4029 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4030 assert(errors != _Py_ERROR_UNKNOWN);
4031#ifdef _Py_FORCE_UTF8_FS_ENCODING
4032 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4033#else
4034 return unicode_decode_locale(s, size, errors, 0);
4035#endif
4036 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004037}
4038
Martin v. Löwis011e8422009-05-05 04:43:17 +00004039
4040int
4041PyUnicode_FSConverter(PyObject* arg, void* addr)
4042{
Brett Cannonec6ce872016-09-06 15:50:29 -07004043 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004044 PyObject *output = NULL;
4045 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004046 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004047 if (arg == NULL) {
4048 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004049 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004050 return 1;
4051 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004052 path = PyOS_FSPath(arg);
4053 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004054 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004055 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004056 if (PyBytes_Check(path)) {
4057 output = path;
4058 }
4059 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4060 output = PyUnicode_EncodeFSDefault(path);
4061 Py_DECREF(path);
4062 if (!output) {
4063 return 0;
4064 }
4065 assert(PyBytes_Check(output));
4066 }
4067
Victor Stinner0ea2a462010-04-30 00:22:08 +00004068 size = PyBytes_GET_SIZE(output);
4069 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004070 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004071 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004072 Py_DECREF(output);
4073 return 0;
4074 }
4075 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004076 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004077}
4078
4079
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004080int
4081PyUnicode_FSDecoder(PyObject* arg, void* addr)
4082{
Brett Cannona5711202016-09-06 19:36:01 -07004083 int is_buffer = 0;
4084 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004085 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004086 if (arg == NULL) {
4087 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004088 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004089 return 1;
4090 }
Brett Cannona5711202016-09-06 19:36:01 -07004091
4092 is_buffer = PyObject_CheckBuffer(arg);
4093 if (!is_buffer) {
4094 path = PyOS_FSPath(arg);
4095 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004096 return 0;
4097 }
Brett Cannona5711202016-09-06 19:36:01 -07004098 }
4099 else {
4100 path = arg;
4101 Py_INCREF(arg);
4102 }
4103
4104 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004105 output = path;
4106 }
4107 else if (PyBytes_Check(path) || is_buffer) {
4108 PyObject *path_bytes = NULL;
4109
4110 if (!PyBytes_Check(path) &&
4111 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004112 "path should be string, bytes, or os.PathLike, not %.200s",
4113 Py_TYPE(arg)->tp_name)) {
4114 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004115 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004116 }
4117 path_bytes = PyBytes_FromObject(path);
4118 Py_DECREF(path);
4119 if (!path_bytes) {
4120 return 0;
4121 }
4122 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4123 PyBytes_GET_SIZE(path_bytes));
4124 Py_DECREF(path_bytes);
4125 if (!output) {
4126 return 0;
4127 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004128 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004129 else {
4130 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004131 "path should be string, bytes, or os.PathLike, not %.200s",
4132 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004133 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004134 return 0;
4135 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004136 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004137 Py_DECREF(output);
4138 return 0;
4139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004140 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004141 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004142 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004143 Py_DECREF(output);
4144 return 0;
4145 }
4146 *(PyObject**)addr = output;
4147 return Py_CLEANUP_SUPPORTED;
4148}
4149
4150
Inada Naoki02a4d572020-02-27 13:48:59 +09004151static int unicode_fill_utf8(PyObject *unicode);
4152
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004153const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004154PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004155{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004156 if (!PyUnicode_Check(unicode)) {
4157 PyErr_BadArgument();
4158 return NULL;
4159 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004160 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004161 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004162
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004163 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004164 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004165 return NULL;
4166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167 }
4168
4169 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004170 *psize = PyUnicode_UTF8_LENGTH(unicode);
4171 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004172}
4173
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004174const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004175PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004176{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004177 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4178}
4179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004180Py_UNICODE *
4181PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4182{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004183 if (!PyUnicode_Check(unicode)) {
4184 PyErr_BadArgument();
4185 return NULL;
4186 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004187 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4188 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004189 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004190 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004191 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004192
Serhiy Storchakac46db922018-10-23 22:58:24 +03004193 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4194 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4195 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004196 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004197 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004198 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4199 if (w == NULL) {
4200 PyErr_NoMemory();
4201 return NULL;
4202 }
4203 unicode_copy_as_widechar(unicode, w, wlen + 1);
4204 _PyUnicode_WSTR(unicode) = w;
4205 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4206 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004207 }
4208 }
4209 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004210 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004211 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004212}
4213
Inada Naoki2c4928d2020-06-17 20:09:44 +09004214/* Deprecated APIs */
4215
4216_Py_COMP_DIAG_PUSH
4217_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4218
Alexander Belopolsky40018472011-02-26 01:02:56 +00004219Py_UNICODE *
4220PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004221{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004222 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004223}
4224
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004225const Py_UNICODE *
4226_PyUnicode_AsUnicode(PyObject *unicode)
4227{
4228 Py_ssize_t size;
4229 const Py_UNICODE *wstr;
4230
4231 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4232 if (wstr && wcslen(wstr) != (size_t)size) {
4233 PyErr_SetString(PyExc_ValueError, "embedded null character");
4234 return NULL;
4235 }
4236 return wstr;
4237}
4238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004239
Alexander Belopolsky40018472011-02-26 01:02:56 +00004240Py_ssize_t
4241PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004242{
4243 if (!PyUnicode_Check(unicode)) {
4244 PyErr_BadArgument();
4245 goto onError;
4246 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004247 if (_PyUnicode_WSTR(unicode) == NULL) {
4248 if (PyUnicode_AsUnicode(unicode) == NULL)
4249 goto onError;
4250 }
4251 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004252
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 return -1;
4255}
4256
Inada Naoki2c4928d2020-06-17 20:09:44 +09004257_Py_COMP_DIAG_POP
4258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004259Py_ssize_t
4260PyUnicode_GetLength(PyObject *unicode)
4261{
Victor Stinner07621332012-06-16 04:53:46 +02004262 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263 PyErr_BadArgument();
4264 return -1;
4265 }
Victor Stinner07621332012-06-16 04:53:46 +02004266 if (PyUnicode_READY(unicode) == -1)
4267 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 return PyUnicode_GET_LENGTH(unicode);
4269}
4270
4271Py_UCS4
4272PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4273{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004274 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004275 int kind;
4276
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004277 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004278 PyErr_BadArgument();
4279 return (Py_UCS4)-1;
4280 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004281 if (PyUnicode_READY(unicode) == -1) {
4282 return (Py_UCS4)-1;
4283 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004284 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004285 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004286 return (Py_UCS4)-1;
4287 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004288 data = PyUnicode_DATA(unicode);
4289 kind = PyUnicode_KIND(unicode);
4290 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004291}
4292
4293int
4294PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4295{
4296 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004297 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004298 return -1;
4299 }
Victor Stinner488fa492011-12-12 00:01:39 +01004300 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004301 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004302 PyErr_SetString(PyExc_IndexError, "string index out of range");
4303 return -1;
4304 }
Victor Stinner488fa492011-12-12 00:01:39 +01004305 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004306 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004307 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4308 PyErr_SetString(PyExc_ValueError, "character out of range");
4309 return -1;
4310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004311 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4312 index, ch);
4313 return 0;
4314}
4315
/* Return the name of the default encoding; this is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4321
Victor Stinner554f3f02010-06-16 23:33:54 +00004322/* create or adjust a UnicodeDecodeError */
4323static void
4324make_decode_exception(PyObject **exceptionObject,
4325 const char *encoding,
4326 const char *input, Py_ssize_t length,
4327 Py_ssize_t startpos, Py_ssize_t endpos,
4328 const char *reason)
4329{
4330 if (*exceptionObject == NULL) {
4331 *exceptionObject = PyUnicodeDecodeError_Create(
4332 encoding, input, length, startpos, endpos, reason);
4333 }
4334 else {
4335 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4336 goto onError;
4337 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4338 goto onError;
4339 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4340 goto onError;
4341 }
4342 return;
4343
4344onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004345 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004346}
4347
Steve Dowercc16be82016-09-08 10:35:16 -07004348#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004349static int
4350widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4351{
4352 if (newsize > *size) {
4353 wchar_t *newbuf = *buf;
4354 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4355 PyErr_NoMemory();
4356 return -1;
4357 }
4358 *buf = newbuf;
4359 }
4360 *size = newsize;
4361 return 0;
4362}
4363
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004364/* error handling callback helper:
4365 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004366 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004367 and adjust various state variables.
4368 return 0 on success, -1 on error
4369*/
4370
Alexander Belopolsky40018472011-02-26 01:02:56 +00004371static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004372unicode_decode_call_errorhandler_wchar(
4373 const char *errors, PyObject **errorHandler,
4374 const char *encoding, const char *reason,
4375 const char **input, const char **inend, Py_ssize_t *startinpos,
4376 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004377 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004379 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380
4381 PyObject *restuple = NULL;
4382 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004383 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004384 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004385 Py_ssize_t requiredsize;
4386 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004387 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004388 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389
4390 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004391 *errorHandler = PyCodec_LookupError(errors);
4392 if (*errorHandler == NULL)
4393 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 }
4395
Victor Stinner554f3f02010-06-16 23:33:54 +00004396 make_decode_exception(exceptionObject,
4397 encoding,
4398 *input, *inend - *input,
4399 *startinpos, *endinpos,
4400 reason);
4401 if (*exceptionObject == NULL)
4402 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403
Petr Viktorinffd97532020-02-11 17:46:57 +01004404 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004408 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004409 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004411 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004413
4414 /* Copy back the bytes variables, which might have been modified by the
4415 callback */
4416 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4417 if (!inputobj)
4418 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419 *input = PyBytes_AS_STRING(inputobj);
4420 insize = PyBytes_GET_SIZE(inputobj);
4421 *inend = *input + insize;
4422 /* we can DECREF safely, as the exception has another reference,
4423 so the object won't go away. */
4424 Py_DECREF(inputobj);
4425
4426 if (newpos<0)
4427 newpos = insize+newpos;
4428 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004429 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004430 goto onError;
4431 }
4432
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004433#if USE_UNICODE_WCHAR_CACHE
4434_Py_COMP_DIAG_PUSH
4435_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4436 repwlen = PyUnicode_GetSize(repunicode);
4437 if (repwlen < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004438 goto onError;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004439_Py_COMP_DIAG_POP
4440#else /* USE_UNICODE_WCHAR_CACHE */
4441 repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4442 if (repwlen < 0)
4443 goto onError;
4444 repwlen--;
4445#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004446 /* need more space? (at least enough for what we
4447 have+the replacement+the rest of the string (starting
4448 at the new input position), so we won't have to check space
4449 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004450 requiredsize = *outpos;
4451 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4452 goto overflow;
4453 requiredsize += repwlen;
4454 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4455 goto overflow;
4456 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004457 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004458 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004459 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004460 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004461 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004462 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004463 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004464 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004465 PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004466 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004467 *endinpos = newpos;
4468 *inptr = *input + newpos;
4469
4470 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004471 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004472 return 0;
4473
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004474 overflow:
4475 PyErr_SetString(PyExc_OverflowError,
4476 "decoded result is too long for a Python string");
4477
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004478 onError:
4479 Py_XDECREF(restuple);
4480 return -1;
4481}
Steve Dowercc16be82016-09-08 10:35:16 -07004482#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004483
4484static int
4485unicode_decode_call_errorhandler_writer(
4486 const char *errors, PyObject **errorHandler,
4487 const char *encoding, const char *reason,
4488 const char **input, const char **inend, Py_ssize_t *startinpos,
4489 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4490 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4491{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004492 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004493
4494 PyObject *restuple = NULL;
4495 PyObject *repunicode = NULL;
4496 Py_ssize_t insize;
4497 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004498 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004499 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004500 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004501 int need_to_grow = 0;
4502 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004503
4504 if (*errorHandler == NULL) {
4505 *errorHandler = PyCodec_LookupError(errors);
4506 if (*errorHandler == NULL)
4507 goto onError;
4508 }
4509
4510 make_decode_exception(exceptionObject,
4511 encoding,
4512 *input, *inend - *input,
4513 *startinpos, *endinpos,
4514 reason);
4515 if (*exceptionObject == NULL)
4516 goto onError;
4517
Petr Viktorinffd97532020-02-11 17:46:57 +01004518 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004519 if (restuple == NULL)
4520 goto onError;
4521 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004522 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004523 goto onError;
4524 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004525 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004526 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004527
4528 /* Copy back the bytes variables, which might have been modified by the
4529 callback */
4530 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4531 if (!inputobj)
4532 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004533 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004534 *input = PyBytes_AS_STRING(inputobj);
4535 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004536 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004537 /* we can DECREF safely, as the exception has another reference,
4538 so the object won't go away. */
4539 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004543 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004544 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547
Victor Stinner170ca6f2013-04-18 00:25:28 +02004548 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004549 if (replen > 1) {
4550 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004551 need_to_grow = 1;
4552 }
4553 new_inptr = *input + newpos;
4554 if (*inend - new_inptr > remain) {
4555 /* We don't know the decoding algorithm here so we make the worst
4556 assumption that one byte decodes to one unicode character.
4557 If unfortunately one byte could decode to more unicode characters,
4558 the decoder may write out-of-bound then. Is it possible for the
4559 algorithms using this function? */
4560 writer->min_length += *inend - new_inptr - remain;
4561 need_to_grow = 1;
4562 }
4563 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004564 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004565 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004566 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4567 goto onError;
4568 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004569 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004570 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004573 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004574
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004575 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004576 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004577 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004581 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582}
4583
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584/* --- UTF-7 Codec -------------------------------------------------------- */
4585
Antoine Pitrou244651a2009-05-04 18:56:13 +00004586/* See RFC2152 for details. We encode conservatively and decode liberally. */
4587
4588/* Three simple macros defining base-64. */
4589
/* True if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ('0' <= (c) && (c) <= '9') || \
     ('A' <= (c) && (c) <= 'Z') || \
     ('a' <= (c) && (c) <= 'z'))

/* Base-64 value (0..63) of c.  Only meaningful when IS_BASE64(c) holds;
   any character not otherwise matched maps to 63. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 : \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 : \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character encoding the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, met outside a shift sequence in a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which starts a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4618
4619/* The UTF-7 encoder treats ASCII characters differently according to
4620 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4621 * the above). See RFC2152. This array identifies these different
4622 * sets:
4623 * 0 : "Set D"
4624 * alphanumeric and '(),-./:?
4625 * 1 : "Set O"
4626 * !"#$%&*;<=>@[]^_`{|}
4627 * 2 : "whitespace"
4628 * ht nl cr sp
4629 * 3 : special (must be base64 encoded)
4630 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4631 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632
/* Category of each ASCII code point for the UTF-7 encoder:
 * 0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be base-64
 * encoded).  The table is only ever read, so it is declared const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - code point to classify; values outside 1..127 never encode
 *              directly (the range test short-circuits the table lookup).
 *   directO  - non-zero to emit Set O characters as themselves.
 *   directWS - non-zero to emit whitespace characters as themselves.
 *
 * Every macro argument is parenthesized so that compound flag expressions
 * (for example `a || b`) are evaluated as a unit rather than being split
 * by the `&&` inside the expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004664
Alexander Belopolsky40018472011-02-26 01:02:56 +00004665PyObject *
4666PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004667 Py_ssize_t size,
4668 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004669{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004670 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4671}
4672
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673/* The decoder. The only state we preserve is our read position,
4674 * i.e. how many characters we have consumed. So if we end in the
4675 * middle of a shift sequence we have to back off the read position
4676 * and the output to the beginning of the sequence, otherwise we lose
4677 * all the shift state (seen bits, number of bits seen, high
4678 * surrogate). */
4679
Alexander Belopolsky40018472011-02-26 01:02:56 +00004680PyObject *
4681PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004682 Py_ssize_t size,
4683 const char *errors,
4684 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004685{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004687 Py_ssize_t startinpos;
4688 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004689 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004690 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004691 const char *errmsg = "";
4692 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004693 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004694 unsigned int base64bits = 0;
4695 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004696 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 PyObject *errorHandler = NULL;
4698 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004699
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004700 if (size == 0) {
4701 if (consumed)
4702 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004703 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004704 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004705
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004706 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004707 _PyUnicodeWriter_Init(&writer);
4708 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004709
4710 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711 e = s + size;
4712
4713 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004714 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004715 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004716 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004717
Antoine Pitrou244651a2009-05-04 18:56:13 +00004718 if (inShift) { /* in a base-64 section */
4719 if (IS_BASE64(ch)) { /* consume a base-64 character */
4720 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4721 base64bits += 6;
4722 s++;
4723 if (base64bits >= 16) {
4724 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004725 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 base64bits -= 16;
4727 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004728 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004729 if (surrogate) {
4730 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004731 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4732 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004733 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004734 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004735 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004736 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004737 }
4738 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004739 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004740 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004741 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004742 }
4743 }
Victor Stinner551ac952011-11-29 22:58:13 +01004744 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004745 /* first surrogate */
4746 surrogate = outCh;
4747 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004748 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004749 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004750 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004751 }
4752 }
4753 }
4754 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004755 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004756 if (base64bits > 0) { /* left-over bits */
4757 if (base64bits >= 6) {
4758 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004759 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004760 errmsg = "partial character in shift sequence";
4761 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004762 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004763 else {
4764 /* Some bits remain; they should be zero */
4765 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004766 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004767 errmsg = "non-zero padding bits in shift sequence";
4768 goto utf7Error;
4769 }
4770 }
4771 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004772 if (surrogate && DECODE_DIRECT(ch)) {
4773 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4774 goto onError;
4775 }
4776 surrogate = 0;
4777 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004778 /* '-' is absorbed; other terminating
4779 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004780 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004781 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004782 }
4783 }
4784 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004785 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004786 s++; /* consume '+' */
4787 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004788 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004789 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004790 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004792 else if (s < e && !IS_BASE64(*s)) {
4793 s++;
4794 errmsg = "ill-formed sequence";
4795 goto utf7Error;
4796 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004797 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004798 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004799 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004800 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004802 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004803 }
4804 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004805 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004806 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004807 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004808 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004809 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004810 else {
4811 startinpos = s-starts;
4812 s++;
4813 errmsg = "unexpected special character";
4814 goto utf7Error;
4815 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004816 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004817utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004819 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004820 errors, &errorHandler,
4821 "utf7", errmsg,
4822 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004823 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004824 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004825 }
4826
Antoine Pitrou244651a2009-05-04 18:56:13 +00004827 /* end of string */
4828
4829 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4830 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004831 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004832 if (surrogate ||
4833 (base64bits >= 6) ||
4834 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004835 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004836 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004837 errors, &errorHandler,
4838 "utf7", "unterminated shift sequence",
4839 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004840 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004841 goto onError;
4842 if (s < e)
4843 goto restart;
4844 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004845 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004846
4847 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004848 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004849 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004850 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004851 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004852 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004853 writer.kind, writer.data, shiftOutStart);
4854 Py_XDECREF(errorHandler);
4855 Py_XDECREF(exc);
4856 _PyUnicodeWriter_Dealloc(&writer);
4857 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004858 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004859 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004860 }
4861 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004862 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004863 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004864 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 Py_XDECREF(errorHandler);
4867 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004868 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004869
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 Py_XDECREF(errorHandler);
4872 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004873 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004874 return NULL;
4875}
4876
4877
Alexander Belopolsky40018472011-02-26 01:02:56 +00004878PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004879_PyUnicode_EncodeUTF7(PyObject *str,
4880 int base64SetO,
4881 int base64WhiteSpace,
4882 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004883{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004884 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004885 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004886 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004887 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004888 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004889 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004890 unsigned int base64bits = 0;
4891 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004892 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004893 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004894
Benjamin Petersonbac79492012-01-14 13:34:47 -05004895 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004896 return NULL;
4897 kind = PyUnicode_KIND(str);
4898 data = PyUnicode_DATA(str);
4899 len = PyUnicode_GET_LENGTH(str);
4900
4901 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004902 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004903
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004904 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004905 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004906 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004907 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004908 if (v == NULL)
4909 return NULL;
4910
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004911 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004912 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004913 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004914
Antoine Pitrou244651a2009-05-04 18:56:13 +00004915 if (inShift) {
4916 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4917 /* shifting out */
4918 if (base64bits) { /* output remaining bits */
4919 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4920 base64buffer = 0;
4921 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004922 }
4923 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004924 /* Characters not in the BASE64 set implicitly unshift the sequence
4925 so no '-' is required, except if the character is itself a '-' */
4926 if (IS_BASE64(ch) || ch == '-') {
4927 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004928 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004929 *out++ = (char) ch;
4930 }
4931 else {
4932 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004933 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004934 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004935 else { /* not in a shift sequence */
4936 if (ch == '+') {
4937 *out++ = '+';
4938 *out++ = '-';
4939 }
4940 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4941 *out++ = (char) ch;
4942 }
4943 else {
4944 *out++ = '+';
4945 inShift = 1;
4946 goto encode_char;
4947 }
4948 }
4949 continue;
4950encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004951 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004952 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004953
Antoine Pitrou244651a2009-05-04 18:56:13 +00004954 /* code first surrogate */
4955 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004956 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004957 while (base64bits >= 6) {
4958 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4959 base64bits -= 6;
4960 }
4961 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004962 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004963 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004964 base64bits += 16;
4965 base64buffer = (base64buffer << 16) | ch;
4966 while (base64bits >= 6) {
4967 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4968 base64bits -= 6;
4969 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004970 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004971 if (base64bits)
4972 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4973 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004974 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004975 if (_PyBytes_Resize(&v, out - start) < 0)
4976 return NULL;
4977 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004978}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004979PyObject *
4980PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4981 Py_ssize_t size,
4982 int base64SetO,
4983 int base64WhiteSpace,
4984 const char *errors)
4985{
4986 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004987 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004988 if (tmp == NULL)
4989 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004990 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004991 base64WhiteSpace, errors);
4992 Py_DECREF(tmp);
4993 return result;
4994}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004995
/* End of the UTF-7 codec: undefine its helper macros so that they are not
   visible to the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005001
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002/* --- UTF-8 Codec -------------------------------------------------------- */
5003
Alexander Belopolsky40018472011-02-26 01:02:56 +00005004PyObject *
5005PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005006 Py_ssize_t size,
5007 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008{
Walter Dörwald69652032004-09-07 20:24:22 +00005009 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5010}
5011
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012#include "stringlib/asciilib.h"
5013#include "stringlib/codecs.h"
5014#include "stringlib/undef.h"
5015
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005016#include "stringlib/ucs1lib.h"
5017#include "stringlib/codecs.h"
5018#include "stringlib/undef.h"
5019
5020#include "stringlib/ucs2lib.h"
5021#include "stringlib/codecs.h"
5022#include "stringlib/undef.h"
5023
5024#include "stringlib/ucs4lib.h"
5025#include "stringlib/codecs.h"
5026#include "stringlib/undef.h"
5027
/* Mask to quickly check whether a C 'size_t' contains a
   non-ASCII, UTF8-encoded char.

   The mask has the high bit (0x80) of every byte set, so for a word
   loaded from a byte string, a non-zero `word & ASCII_CHAR_MASK` means
   that at least one of its bytes is >= 0x80.  Only 4- and 8-byte
   'size_t' are supported; anything else is a compile-time error. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
#else
# error C 'size_t' size should be either 4 or 8!
#endif
5037
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005038static Py_ssize_t
5039ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005040{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005041 const char *p = start;
Ma Lina0c603c2020-10-18 22:48:38 +08005042 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_SIZE_T);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005043
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005044 /*
5045 * Issue #17237: m68k is a bit different from most architectures in
5046 * that objects do not use "natural alignment" - for example, int and
5047 * long are only aligned at 2-byte boundaries. Therefore the assert()
5048 * won't work; also, tests have shown that skipping the "optimised
5049 * version" will even speed up m68k.
5050 */
5051#if !defined(__m68k__)
Ma Lina0c603c2020-10-18 22:48:38 +08005052#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5053 assert(_Py_IS_ALIGNED(dest, SIZEOF_SIZE_T));
5054 if (_Py_IS_ALIGNED(p, SIZEOF_SIZE_T)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005055 /* Fast path, see in STRINGLIB(utf8_decode) for
5056 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005057 /* Help allocation */
5058 const char *_p = p;
5059 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005060 while (_p < aligned_end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005061 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005062 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005064 *((size_t *)q) = value;
5065 _p += SIZEOF_SIZE_T;
5066 q += SIZEOF_SIZE_T;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005067 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005068 p = _p;
5069 while (p < end) {
5070 if ((unsigned char)*p & 0x80)
5071 break;
5072 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005074 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005077#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005078 while (p < end) {
5079 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5080 for an explanation. */
Ma Lina0c603c2020-10-18 22:48:38 +08005081 if (_Py_IS_ALIGNED(p, SIZEOF_SIZE_T)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005082 /* Help allocation */
5083 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084 while (_p < aligned_end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005085 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005086 if (value & ASCII_CHAR_MASK)
5087 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005088 _p += SIZEOF_SIZE_T;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005089 }
5090 p = _p;
5091 if (_p == end)
5092 break;
5093 }
5094 if ((unsigned char)*p & 0x80)
5095 break;
5096 ++p;
5097 }
5098 memcpy(dest, start, p - start);
5099 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100}
Antoine Pitrouab868312009-01-10 15:40:25 +00005101
Victor Stinner709d23d2019-05-02 14:56:30 -04005102static PyObject *
5103unicode_decode_utf8(const char *s, Py_ssize_t size,
5104 _Py_error_handler error_handler, const char *errors,
5105 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005106{
Victor Stinner785938e2011-12-11 20:09:03 +01005107 if (size == 0) {
5108 if (consumed)
5109 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005110 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005111 }
5112
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005113 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5114 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005115 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005116 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005117 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005118 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005119 }
5120
Inada Naoki770847a2019-06-24 12:30:24 +09005121 const char *starts = s;
5122 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005123
Inada Naoki770847a2019-06-24 12:30:24 +09005124 // fast path: try ASCII string.
5125 PyObject *u = PyUnicode_New(size, 127);
5126 if (u == NULL) {
5127 return NULL;
5128 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005129 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005130 if (s == end) {
5131 return u;
5132 }
5133
5134 // Use _PyUnicodeWriter after fast path is failed.
5135 _PyUnicodeWriter writer;
5136 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5137 writer.pos = s - starts;
5138
5139 Py_ssize_t startinpos, endinpos;
5140 const char *errmsg = "";
5141 PyObject *error_handler_obj = NULL;
5142 PyObject *exc = NULL;
5143
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005144 while (s < end) {
5145 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005146 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005147
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005148 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005149 if (PyUnicode_IS_ASCII(writer.buffer))
5150 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005151 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005152 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005153 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005154 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005155 } else {
5156 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005157 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005158 }
5159
5160 switch (ch) {
5161 case 0:
5162 if (s == end || consumed)
5163 goto End;
5164 errmsg = "unexpected end of data";
5165 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005166 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005167 break;
5168 case 1:
5169 errmsg = "invalid start byte";
5170 startinpos = s - starts;
5171 endinpos = startinpos + 1;
5172 break;
5173 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005174 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5175 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5176 {
5177 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005178 goto End;
5179 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005180 /* fall through */
5181 case 3:
5182 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005183 errmsg = "invalid continuation byte";
5184 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005185 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005186 break;
5187 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005188 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005189 goto onError;
5190 continue;
5191 }
5192
Victor Stinner1d65d912015-10-05 13:43:50 +02005193 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005194 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005195
5196 switch (error_handler) {
5197 case _Py_ERROR_IGNORE:
5198 s += (endinpos - startinpos);
5199 break;
5200
5201 case _Py_ERROR_REPLACE:
5202 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5203 goto onError;
5204 s += (endinpos - startinpos);
5205 break;
5206
5207 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005208 {
5209 Py_ssize_t i;
5210
Victor Stinner1d65d912015-10-05 13:43:50 +02005211 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5212 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005213 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005214 ch = (Py_UCS4)(unsigned char)(starts[i]);
5215 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5216 ch + 0xdc00);
5217 writer.pos++;
5218 }
5219 s += (endinpos - startinpos);
5220 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005221 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005222
5223 default:
5224 if (unicode_decode_call_errorhandler_writer(
5225 errors, &error_handler_obj,
5226 "utf-8", errmsg,
5227 &starts, &end, &startinpos, &endinpos, &exc, &s,
5228 &writer))
5229 goto onError;
5230 }
Victor Stinner785938e2011-12-11 20:09:03 +01005231 }
5232
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005233End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005234 if (consumed)
5235 *consumed = s - starts;
5236
Victor Stinner1d65d912015-10-05 13:43:50 +02005237 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005238 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005239 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005240
5241onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005242 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005243 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005244 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005245 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005246}
5247
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005248
Victor Stinner709d23d2019-05-02 14:56:30 -04005249PyObject *
5250PyUnicode_DecodeUTF8Stateful(const char *s,
5251 Py_ssize_t size,
5252 const char *errors,
5253 Py_ssize_t *consumed)
5254{
5255 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5256}
5257
5258
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005259/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5260 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005261
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005262 On success, write a pointer to a newly allocated wide character string into
5263 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5264 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005265
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005266 On memory allocation failure, return -1.
5267
5268 On decoding error (if surrogateescape is zero), return -2. If wlen is
5269 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5270 is not NULL, write the decoding error message into *reason. */
5271int
5272_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005273 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005274{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005275 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005276 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005277 wchar_t *unicode;
5278 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005279
Victor Stinner3d4226a2018-08-29 22:21:32 +02005280 int surrogateescape = 0;
5281 int surrogatepass = 0;
5282 switch (errors)
5283 {
5284 case _Py_ERROR_STRICT:
5285 break;
5286 case _Py_ERROR_SURROGATEESCAPE:
5287 surrogateescape = 1;
5288 break;
5289 case _Py_ERROR_SURROGATEPASS:
5290 surrogatepass = 1;
5291 break;
5292 default:
5293 return -3;
5294 }
5295
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005296 /* Note: size will always be longer than the resulting Unicode
5297 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005298 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005299 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005300 }
5301
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005302 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005303 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005304 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005305 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005306
5307 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005308 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005309 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005310 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005311 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005312#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005313 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005314#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005315 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005316#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005317 if (ch > 0xFF) {
5318#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005319 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005320#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005321 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005322 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005323 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5324 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5325#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005326 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005327 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005328 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005329 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005330 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005331
5332 if (surrogateescape) {
5333 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5334 }
5335 else {
5336 /* Is it a valid three-byte code? */
5337 if (surrogatepass
5338 && (e - s) >= 3
5339 && (s[0] & 0xf0) == 0xe0
5340 && (s[1] & 0xc0) == 0x80
5341 && (s[2] & 0xc0) == 0x80)
5342 {
5343 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5344 s += 3;
5345 unicode[outpos++] = ch;
5346 }
5347 else {
5348 PyMem_RawFree(unicode );
5349 if (reason != NULL) {
5350 switch (ch) {
5351 case 0:
5352 *reason = "unexpected end of data";
5353 break;
5354 case 1:
5355 *reason = "invalid start byte";
5356 break;
5357 /* 2, 3, 4 */
5358 default:
5359 *reason = "invalid continuation byte";
5360 break;
5361 }
5362 }
5363 if (wlen != NULL) {
5364 *wlen = s - orig_s;
5365 }
5366 return -2;
5367 }
5368 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005369 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005370 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005371 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005372 if (wlen) {
5373 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005374 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005375 *wstr = unicode;
5376 return 0;
5377}
5378
Victor Stinner5f9cf232019-03-19 01:46:25 +01005379
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005380wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005381_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5382 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005383{
5384 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005385 int res = _Py_DecodeUTF8Ex(arg, arglen,
5386 &wstr, wlen,
5387 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005388 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005389 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5390 assert(res != -3);
5391 if (wlen) {
5392 *wlen = (size_t)res;
5393 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005394 return NULL;
5395 }
5396 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005397}
5398
Antoine Pitrouab868312009-01-10 15:40:25 +00005399
Victor Stinnere47e6982017-12-21 15:45:16 +01005400/* UTF-8 encoder using the surrogateescape error handler .
5401
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005402 On success, return 0 and write the newly allocated character string (use
5403 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005404
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005405 On encoding failure, return -2 and write the position of the invalid
5406 surrogate character into *error_pos (if error_pos is set) and the decoding
5407 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005408
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005409 On memory allocation failure, return -1. */
5410int
5411_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005412 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005413{
5414 const Py_ssize_t max_char_size = 4;
5415 Py_ssize_t len = wcslen(text);
5416
5417 assert(len >= 0);
5418
Victor Stinner3d4226a2018-08-29 22:21:32 +02005419 int surrogateescape = 0;
5420 int surrogatepass = 0;
5421 switch (errors)
5422 {
5423 case _Py_ERROR_STRICT:
5424 break;
5425 case _Py_ERROR_SURROGATEESCAPE:
5426 surrogateescape = 1;
5427 break;
5428 case _Py_ERROR_SURROGATEPASS:
5429 surrogatepass = 1;
5430 break;
5431 default:
5432 return -3;
5433 }
5434
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005435 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5436 return -1;
5437 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005438 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005439 if (raw_malloc) {
5440 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005441 }
5442 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005443 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005444 }
5445 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005446 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005447 }
5448
5449 char *p = bytes;
5450 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005451 for (i = 0; i < len; ) {
5452 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005453 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005454 i++;
5455#if Py_UNICODE_SIZE == 2
5456 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5457 && i < len
5458 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5459 {
5460 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5461 i++;
5462 }
5463#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005464
5465 if (ch < 0x80) {
5466 /* Encode ASCII */
5467 *p++ = (char) ch;
5468
5469 }
5470 else if (ch < 0x0800) {
5471 /* Encode Latin-1 */
5472 *p++ = (char)(0xc0 | (ch >> 6));
5473 *p++ = (char)(0x80 | (ch & 0x3f));
5474 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005475 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005476 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005477 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005478 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005479 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005480 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005481 if (reason != NULL) {
5482 *reason = "encoding error";
5483 }
5484 if (raw_malloc) {
5485 PyMem_RawFree(bytes);
5486 }
5487 else {
5488 PyMem_Free(bytes);
5489 }
5490 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005491 }
5492 *p++ = (char)(ch & 0xff);
5493 }
5494 else if (ch < 0x10000) {
5495 *p++ = (char)(0xe0 | (ch >> 12));
5496 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5497 *p++ = (char)(0x80 | (ch & 0x3f));
5498 }
5499 else { /* ch >= 0x10000 */
5500 assert(ch <= MAX_UNICODE);
5501 /* Encode UCS4 Unicode ordinals */
5502 *p++ = (char)(0xf0 | (ch >> 18));
5503 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5504 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5505 *p++ = (char)(0x80 | (ch & 0x3f));
5506 }
5507 }
5508 *p++ = '\0';
5509
5510 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005511 char *bytes2;
5512 if (raw_malloc) {
5513 bytes2 = PyMem_RawRealloc(bytes, final_size);
5514 }
5515 else {
5516 bytes2 = PyMem_Realloc(bytes, final_size);
5517 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005518 if (bytes2 == NULL) {
5519 if (error_pos != NULL) {
5520 *error_pos = (size_t)-1;
5521 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005522 if (raw_malloc) {
5523 PyMem_RawFree(bytes);
5524 }
5525 else {
5526 PyMem_Free(bytes);
5527 }
5528 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005529 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005530 *str = bytes2;
5531 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005532}
5533
5534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005535/* Primary internal function which creates utf8 encoded bytes objects.
5536
5537 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005538 and allocate exactly as much space needed at the end. Else allocate the
5539 maximum possible needed (4 result bytes per Unicode character), and return
5540 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005541*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005542static PyObject *
5543unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5544 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005546 if (!PyUnicode_Check(unicode)) {
5547 PyErr_BadArgument();
5548 return NULL;
5549 }
5550
5551 if (PyUnicode_READY(unicode) == -1)
5552 return NULL;
5553
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005554 if (PyUnicode_UTF8(unicode))
5555 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5556 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557
Inada Naoki02a4d572020-02-27 13:48:59 +09005558 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005559 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005560 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5561
5562 _PyBytesWriter writer;
5563 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005564
Benjamin Petersonead6b532011-12-20 17:23:42 -06005565 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005566 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005567 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005568 case PyUnicode_1BYTE_KIND:
5569 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5570 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005571 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5572 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005573 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005574 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5575 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005576 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005577 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5578 break;
Tim Peters602f7402002-04-27 18:03:26 +00005579 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005580
5581 if (end == NULL) {
5582 _PyBytesWriter_Dealloc(&writer);
5583 return NULL;
5584 }
5585 return _PyBytesWriter_Finish(&writer, end);
5586}
5587
5588static int
5589unicode_fill_utf8(PyObject *unicode)
5590{
5591 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5592 assert(!PyUnicode_IS_ASCII(unicode));
5593
5594 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005595 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005596 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5597
5598 _PyBytesWriter writer;
5599 char *end;
5600
5601 switch (kind) {
5602 default:
5603 Py_UNREACHABLE();
5604 case PyUnicode_1BYTE_KIND:
5605 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5606 _Py_ERROR_STRICT, NULL);
5607 break;
5608 case PyUnicode_2BYTE_KIND:
5609 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5610 _Py_ERROR_STRICT, NULL);
5611 break;
5612 case PyUnicode_4BYTE_KIND:
5613 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5614 _Py_ERROR_STRICT, NULL);
5615 break;
5616 }
5617 if (end == NULL) {
5618 _PyBytesWriter_Dealloc(&writer);
5619 return -1;
5620 }
5621
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005622 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005623 PyBytes_AS_STRING(writer.buffer);
5624 Py_ssize_t len = end - start;
5625
5626 char *cache = PyObject_MALLOC(len + 1);
5627 if (cache == NULL) {
5628 _PyBytesWriter_Dealloc(&writer);
5629 PyErr_NoMemory();
5630 return -1;
5631 }
5632 _PyUnicode_UTF8(unicode) = cache;
5633 _PyUnicode_UTF8_LENGTH(unicode) = len;
5634 memcpy(cache, start, len);
5635 cache[len] = '\0';
5636 _PyBytesWriter_Dealloc(&writer);
5637 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638}
5639
Alexander Belopolsky40018472011-02-26 01:02:56 +00005640PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005641_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5642{
5643 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5644}
5645
5646
5647PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005648PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5649 Py_ssize_t size,
5650 const char *errors)
5651{
5652 PyObject *v, *unicode;
5653
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005654 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005655 if (unicode == NULL)
5656 return NULL;
5657 v = _PyUnicode_AsUTF8String(unicode, errors);
5658 Py_DECREF(unicode);
5659 return v;
5660}
5661
5662PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005663PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005665 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666}
5667
Walter Dörwald41980ca2007-08-16 21:55:45 +00005668/* --- UTF-32 Codec ------------------------------------------------------- */
5669
5670PyObject *
5671PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 Py_ssize_t size,
5673 const char *errors,
5674 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005675{
5676 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5677}
5678
5679PyObject *
5680PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 Py_ssize_t size,
5682 const char *errors,
5683 int *byteorder,
5684 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005685{
5686 const char *starts = s;
5687 Py_ssize_t startinpos;
5688 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005690 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005691 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005692 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005693 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005694 PyObject *errorHandler = NULL;
5695 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005696
Andy Lestere6be9b52020-02-11 20:28:35 -06005697 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005698 e = q + size;
5699
5700 if (byteorder)
5701 bo = *byteorder;
5702
5703 /* Check for BOM marks (U+FEFF) in the input and adjust current
5704 byte order setting accordingly. In native mode, the leading BOM
5705 mark is skipped, in all other modes, it is copied to the output
5706 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005707 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005708 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005709 if (bom == 0x0000FEFF) {
5710 bo = -1;
5711 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005713 else if (bom == 0xFFFE0000) {
5714 bo = 1;
5715 q += 4;
5716 }
5717 if (byteorder)
5718 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005719 }
5720
Victor Stinnere64322e2012-10-30 23:12:47 +01005721 if (q == e) {
5722 if (consumed)
5723 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005724 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005725 }
5726
Victor Stinnere64322e2012-10-30 23:12:47 +01005727#ifdef WORDS_BIGENDIAN
5728 le = bo < 0;
5729#else
5730 le = bo <= 0;
5731#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005732 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005733
Victor Stinner8f674cc2013-04-17 23:02:17 +02005734 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005735 writer.min_length = (e - q + 3) / 4;
5736 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005737 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005738
Victor Stinnere64322e2012-10-30 23:12:47 +01005739 while (1) {
5740 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005741 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005742
Victor Stinnere64322e2012-10-30 23:12:47 +01005743 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005744 enum PyUnicode_Kind kind = writer.kind;
5745 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005746 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005747 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005748 if (le) {
5749 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005750 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005751 if (ch > maxch)
5752 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005753 if (kind != PyUnicode_1BYTE_KIND &&
5754 Py_UNICODE_IS_SURROGATE(ch))
5755 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005756 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005757 q += 4;
5758 } while (q <= last);
5759 }
5760 else {
5761 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005762 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005763 if (ch > maxch)
5764 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005765 if (kind != PyUnicode_1BYTE_KIND &&
5766 Py_UNICODE_IS_SURROGATE(ch))
5767 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005768 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005769 q += 4;
5770 } while (q <= last);
5771 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005772 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005773 }
5774
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005775 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005776 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005777 startinpos = ((const char *)q) - starts;
5778 endinpos = startinpos + 4;
5779 }
5780 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005781 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005783 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005785 startinpos = ((const char *)q) - starts;
5786 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005788 else {
5789 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005790 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005791 goto onError;
5792 q += 4;
5793 continue;
5794 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005795 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005796 startinpos = ((const char *)q) - starts;
5797 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005799
5800 /* The remaining input chars are ignored if the callback
5801 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005802 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005804 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005806 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005808 }
5809
Walter Dörwald41980ca2007-08-16 21:55:45 +00005810 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005812
Walter Dörwald41980ca2007-08-16 21:55:45 +00005813 Py_XDECREF(errorHandler);
5814 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005815 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005816
Benjamin Peterson29060642009-01-31 22:14:21 +00005817 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005818 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005819 Py_XDECREF(errorHandler);
5820 Py_XDECREF(exc);
5821 return NULL;
5822}
5823
5824PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005825_PyUnicode_EncodeUTF32(PyObject *str,
5826 const char *errors,
5827 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005828{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005829 enum PyUnicode_Kind kind;
5830 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005831 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005832 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005833 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005834#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005835 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005836#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005837 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005838#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005839 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005840 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005841 PyObject *errorHandler = NULL;
5842 PyObject *exc = NULL;
5843 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005844
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005845 if (!PyUnicode_Check(str)) {
5846 PyErr_BadArgument();
5847 return NULL;
5848 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005849 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005850 return NULL;
5851 kind = PyUnicode_KIND(str);
5852 data = PyUnicode_DATA(str);
5853 len = PyUnicode_GET_LENGTH(str);
5854
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005855 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005856 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005857 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005858 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005859 if (v == NULL)
5860 return NULL;
5861
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005862 /* output buffer is 4-bytes aligned */
5863 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005864 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005865 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005866 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005867 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005868 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005869
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005870 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005871 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005872 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005873 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005874 else
5875 encoding = "utf-32";
5876
5877 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005878 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5879 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005880 }
5881
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005882 pos = 0;
5883 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005884 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005885
5886 if (kind == PyUnicode_2BYTE_KIND) {
5887 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5888 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005889 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005890 else {
5891 assert(kind == PyUnicode_4BYTE_KIND);
5892 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5893 &out, native_ordering);
5894 }
5895 if (pos == len)
5896 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005897
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005898 rep = unicode_encode_call_errorhandler(
5899 errors, &errorHandler,
5900 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005901 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005902 if (!rep)
5903 goto error;
5904
5905 if (PyBytes_Check(rep)) {
5906 repsize = PyBytes_GET_SIZE(rep);
5907 if (repsize & 3) {
5908 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005909 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005910 "surrogates not allowed");
5911 goto error;
5912 }
5913 moreunits = repsize / 4;
5914 }
5915 else {
5916 assert(PyUnicode_Check(rep));
5917 if (PyUnicode_READY(rep) < 0)
5918 goto error;
5919 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5920 if (!PyUnicode_IS_ASCII(rep)) {
5921 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005922 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005923 "surrogates not allowed");
5924 goto error;
5925 }
5926 }
5927
5928 /* four bytes are reserved for each surrogate */
5929 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005930 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005931 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005932 /* integer overflow */
5933 PyErr_NoMemory();
5934 goto error;
5935 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005936 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005937 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005938 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005939 }
5940
5941 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005942 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005943 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005944 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005945 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005946 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5947 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005948 }
5949
5950 Py_CLEAR(rep);
5951 }
5952
5953 /* Cut back to size actually needed. This is necessary for, for example,
5954 encoding of a string containing isolated surrogates and the 'ignore'
5955 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005956 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005957 if (nsize != PyBytes_GET_SIZE(v))
5958 _PyBytes_Resize(&v, nsize);
5959 Py_XDECREF(errorHandler);
5960 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005961 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005962 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005963 error:
5964 Py_XDECREF(rep);
5965 Py_XDECREF(errorHandler);
5966 Py_XDECREF(exc);
5967 Py_XDECREF(v);
5968 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005969}
5970
Alexander Belopolsky40018472011-02-26 01:02:56 +00005971PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005972PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5973 Py_ssize_t size,
5974 const char *errors,
5975 int byteorder)
5976{
5977 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005978 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005979 if (tmp == NULL)
5980 return NULL;
5981 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5982 Py_DECREF(tmp);
5983 return result;
5984}
5985
5986PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005987PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005988{
Victor Stinnerb960b342011-11-20 19:12:52 +01005989 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005990}
5991
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992/* --- UTF-16 Codec ------------------------------------------------------- */
5993
Tim Peters772747b2001-08-09 22:21:55 +00005994PyObject *
5995PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 Py_ssize_t size,
5997 const char *errors,
5998 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999{
Walter Dörwald69652032004-09-07 20:24:22 +00006000 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6001}
6002
6003PyObject *
6004PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 Py_ssize_t size,
6006 const char *errors,
6007 int *byteorder,
6008 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00006009{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006011 Py_ssize_t startinpos;
6012 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006013 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006014 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00006015 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006016 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00006017 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018 PyObject *errorHandler = NULL;
6019 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006020 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
Andy Lestere6be9b52020-02-11 20:28:35 -06006022 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006023 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024
6025 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00006026 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006028 /* Check for BOM marks (U+FEFF) in the input and adjust current
6029 byte order setting accordingly. In native mode, the leading BOM
6030 mark is skipped, in all other modes, it is copied to the output
6031 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006032 if (bo == 0 && size >= 2) {
6033 const Py_UCS4 bom = (q[1] << 8) | q[0];
6034 if (bom == 0xFEFF) {
6035 q += 2;
6036 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006038 else if (bom == 0xFFFE) {
6039 q += 2;
6040 bo = 1;
6041 }
6042 if (byteorder)
6043 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045
Antoine Pitrou63065d72012-05-15 23:48:04 +02006046 if (q == e) {
6047 if (consumed)
6048 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006049 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006050 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006051
Christian Heimes743e0cd2012-10-17 23:52:17 +02006052#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006053 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006054 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006055#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006056 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006057 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006058#endif
Tim Peters772747b2001-08-09 22:21:55 +00006059
Antoine Pitrou63065d72012-05-15 23:48:04 +02006060 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006061 character count normally. Error handler will take care of
6062 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006063 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006064 writer.min_length = (e - q + 1) / 2;
6065 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006066 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006067
Antoine Pitrou63065d72012-05-15 23:48:04 +02006068 while (1) {
6069 Py_UCS4 ch = 0;
6070 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006071 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006072 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006073 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006074 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006075 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006076 native_ordering);
6077 else
6078 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006079 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006080 native_ordering);
6081 } else if (kind == PyUnicode_2BYTE_KIND) {
6082 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006083 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006084 native_ordering);
6085 } else {
6086 assert(kind == PyUnicode_4BYTE_KIND);
6087 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006088 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006089 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006090 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006091 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092
Antoine Pitrou63065d72012-05-15 23:48:04 +02006093 switch (ch)
6094 {
6095 case 0:
6096 /* remaining byte at the end? (size should be even) */
6097 if (q == e || consumed)
6098 goto End;
6099 errmsg = "truncated data";
6100 startinpos = ((const char *)q) - starts;
6101 endinpos = ((const char *)e) - starts;
6102 break;
6103 /* The remaining input chars are ignored if the callback
6104 chooses to skip the input */
6105 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006106 q -= 2;
6107 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006108 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006109 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006110 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006111 endinpos = ((const char *)e) - starts;
6112 break;
6113 case 2:
6114 errmsg = "illegal encoding";
6115 startinpos = ((const char *)q) - 2 - starts;
6116 endinpos = startinpos + 2;
6117 break;
6118 case 3:
6119 errmsg = "illegal UTF-16 surrogate";
6120 startinpos = ((const char *)q) - 4 - starts;
6121 endinpos = startinpos + 2;
6122 break;
6123 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006124 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006125 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 continue;
6127 }
6128
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006129 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006130 errors,
6131 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006132 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006133 &starts,
6134 (const char **)&e,
6135 &startinpos,
6136 &endinpos,
6137 &exc,
6138 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006139 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 }
6142
Antoine Pitrou63065d72012-05-15 23:48:04 +02006143End:
Walter Dörwald69652032004-09-07 20:24:22 +00006144 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006146
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 Py_XDECREF(errorHandler);
6148 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006149 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006152 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006153 Py_XDECREF(errorHandler);
6154 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 return NULL;
6156}
6157
Tim Peters772747b2001-08-09 22:21:55 +00006158PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159_PyUnicode_EncodeUTF16(PyObject *str,
6160 const char *errors,
6161 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006163 enum PyUnicode_Kind kind;
6164 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006166 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006167 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006168 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006169#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006170 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006171#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006172 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006173#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006174 const char *encoding;
6175 Py_ssize_t nsize, pos;
6176 PyObject *errorHandler = NULL;
6177 PyObject *exc = NULL;
6178 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 if (!PyUnicode_Check(str)) {
6181 PyErr_BadArgument();
6182 return NULL;
6183 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006184 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 return NULL;
6186 kind = PyUnicode_KIND(str);
6187 data = PyUnicode_DATA(str);
6188 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006189
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006190 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006191 if (kind == PyUnicode_4BYTE_KIND) {
6192 const Py_UCS4 *in = (const Py_UCS4 *)data;
6193 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006194 while (in < end) {
6195 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006196 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006197 }
6198 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006199 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006200 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006202 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006203 nsize = len + pairs + (byteorder == 0);
6204 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006205 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006207 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006209 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006210 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006211 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006212 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006213 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006214 }
6215 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006216 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006217 }
Tim Peters772747b2001-08-09 22:21:55 +00006218
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006219 if (kind == PyUnicode_1BYTE_KIND) {
6220 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6221 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006222 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006223
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006224 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006225 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006226 }
6227 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006228 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006229 }
6230 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006231 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006232 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006233
6234 pos = 0;
6235 while (pos < len) {
6236 Py_ssize_t repsize, moreunits;
6237
6238 if (kind == PyUnicode_2BYTE_KIND) {
6239 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6240 &out, native_ordering);
6241 }
6242 else {
6243 assert(kind == PyUnicode_4BYTE_KIND);
6244 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6245 &out, native_ordering);
6246 }
6247 if (pos == len)
6248 break;
6249
6250 rep = unicode_encode_call_errorhandler(
6251 errors, &errorHandler,
6252 encoding, "surrogates not allowed",
6253 str, &exc, pos, pos + 1, &pos);
6254 if (!rep)
6255 goto error;
6256
6257 if (PyBytes_Check(rep)) {
6258 repsize = PyBytes_GET_SIZE(rep);
6259 if (repsize & 1) {
6260 raise_encode_exception(&exc, encoding,
6261 str, pos - 1, pos,
6262 "surrogates not allowed");
6263 goto error;
6264 }
6265 moreunits = repsize / 2;
6266 }
6267 else {
6268 assert(PyUnicode_Check(rep));
6269 if (PyUnicode_READY(rep) < 0)
6270 goto error;
6271 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6272 if (!PyUnicode_IS_ASCII(rep)) {
6273 raise_encode_exception(&exc, encoding,
6274 str, pos - 1, pos,
6275 "surrogates not allowed");
6276 goto error;
6277 }
6278 }
6279
6280 /* two bytes are reserved for each surrogate */
6281 if (moreunits > 1) {
6282 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006283 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006284 /* integer overflow */
6285 PyErr_NoMemory();
6286 goto error;
6287 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006288 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006289 goto error;
6290 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6291 }
6292
6293 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006294 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006295 out += moreunits;
6296 } else /* rep is unicode */ {
6297 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6298 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6299 &out, native_ordering);
6300 }
6301
6302 Py_CLEAR(rep);
6303 }
6304
6305 /* Cut back to size actually needed. This is necessary for, for example,
6306 encoding of a string containing isolated surrogates and the 'ignore' handler
6307 is used. */
6308 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6309 if (nsize != PyBytes_GET_SIZE(v))
6310 _PyBytes_Resize(&v, nsize);
6311 Py_XDECREF(errorHandler);
6312 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006313 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006314 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006315 error:
6316 Py_XDECREF(rep);
6317 Py_XDECREF(errorHandler);
6318 Py_XDECREF(exc);
6319 Py_XDECREF(v);
6320 return NULL;
6321#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322}
6323
Alexander Belopolsky40018472011-02-26 01:02:56 +00006324PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006325PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6326 Py_ssize_t size,
6327 const char *errors,
6328 int byteorder)
6329{
6330 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006331 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006332 if (tmp == NULL)
6333 return NULL;
6334 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6335 Py_DECREF(tmp);
6336 return result;
6337}
6338
6339PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006340PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006342 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343}
6344
6345/* --- Unicode Escape Codec ----------------------------------------------- */
6346
/* Cached pointer to the unicodedata name-lookup C API.  It starts out NULL
   and is filled in on first use by the \N{...} branch of
   _PyUnicode_DecodeUnicodeEscape() via
   PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1). */
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006348
/* Decode `size` bytes starting at `s` using the "unicodeescape" codec.

   s, size               input buffer; an empty buffer yields the empty
                         string.
   errors                error handler name, forwarded to
                         unicode_decode_call_errorhandler_writer().
   first_invalid_escape  output parameter (must not be NULL, it is written
                         unconditionally).  Set to NULL on entry; when an
                         unrecognized backslash escape is met for the first
                         time it is set to point at the character following
                         the backslash.  Such escapes are copied to the
                         output unchanged (backslash plus character).

   Returns the decoded string from _PyUnicodeWriter_Finish(), or NULL with
   an exception set. */
PyObject *
_PyUnicode_DecodeUnicodeEscape(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               const char **first_invalid_escape)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

        /* WRITE_ASCII_CHAR stores directly into the writer buffer and
           relies on the space prepared above (checked only by assert).
           WRITE_CHAR does the same when ch fits the writer's current
           maxchar and otherwise goes through
           _PyUnicodeWriter_WriteCharInline(), jumping to onError on
           failure.  Both macros are #undef'ed at the end of the loop
           body. */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* Position of the backslash, used when reporting an error. */
        startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto error;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* single-character escapes; a backslash followed by a newline
               produces no output */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes: one to three octal digits */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* Consume up to `count` hex digits; count reaches 0 only when
               all of them were present. */
            for (ch = 0; count && s < end; ++s, --count) {
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    break;
                }
            }
            if (count) {
                /* too few hex digits: report the "truncated" message */
                goto error;
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            if (ucnhash_capi == NULL) {
                /* load the unicode data module */
                ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                PyUnicodeData_CAPSULE_NAME, 1);
                if (ucnhash_capi == NULL) {
                    PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                        );
                    goto onError;
                }
            }

            message = "malformed \\N character escape";
            if (s < end && *s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                namelen = s - start;
                /* Require a non-empty name and an actual closing brace. */
                if (namelen && s < end) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    /* getcode() takes an int length, hence the INT_MAX
                       guard before the cast. */
                    if (namelen <= INT_MAX &&
                        ucnhash_capi->getcode(ucnhash_capi->state, NULL,
                                              start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            goto error;

        default:
            /* Unrecognized escape: remember the first one for the caller
               and copy it through unchanged. */
            if (*first_invalid_escape == NULL) {
                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

      error:
        /* Reached only via goto, with `message` and `startinpos` set. */
        endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        /* The direct writes above need room for all remaining input. */
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6574
Eric V. Smith42454af2016-10-31 09:22:08 -04006575PyObject *
6576PyUnicode_DecodeUnicodeEscape(const char *s,
6577 Py_ssize_t size,
6578 const char *errors)
6579{
6580 const char *first_invalid_escape;
6581 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6582 &first_invalid_escape);
6583 if (result == NULL)
6584 return NULL;
6585 if (first_invalid_escape != NULL) {
6586 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6587 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006588 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006589 Py_DECREF(result);
6590 return NULL;
6591 }
6592 }
6593 return result;
6594}
6595
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006596/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597
Alexander Belopolsky40018472011-02-26 01:02:56 +00006598PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006599PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006601 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006602 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006604 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006605 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006606 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607
Ezio Melottie7f90372012-10-05 03:33:31 +03006608 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006609 escape.
6610
Ezio Melottie7f90372012-10-05 03:33:31 +03006611 For UCS1 strings it's '\xxx', 4 bytes per source character.
6612 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6613 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006614 */
6615
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006616 if (!PyUnicode_Check(unicode)) {
6617 PyErr_BadArgument();
6618 return NULL;
6619 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006620 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006621 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006622 }
Victor Stinner358af132015-10-12 22:36:57 +02006623
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006624 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006625 if (len == 0) {
6626 return PyBytes_FromStringAndSize(NULL, 0);
6627 }
6628
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006629 kind = PyUnicode_KIND(unicode);
6630 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006631 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6632 bytes, and 1 byte characters 4. */
6633 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006634 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006635 return PyErr_NoMemory();
6636 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006637 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006638 if (repr == NULL) {
6639 return NULL;
6640 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006641
Victor Stinner62ec3312016-09-06 17:04:34 -07006642 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006643 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006644 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006645
Victor Stinner62ec3312016-09-06 17:04:34 -07006646 /* U+0000-U+00ff range */
6647 if (ch < 0x100) {
6648 if (ch >= ' ' && ch < 127) {
6649 if (ch != '\\') {
6650 /* Copy printable US ASCII as-is */
6651 *p++ = (char) ch;
6652 }
6653 /* Escape backslashes */
6654 else {
6655 *p++ = '\\';
6656 *p++ = '\\';
6657 }
6658 }
Victor Stinner358af132015-10-12 22:36:57 +02006659
Victor Stinner62ec3312016-09-06 17:04:34 -07006660 /* Map special whitespace to '\t', \n', '\r' */
6661 else if (ch == '\t') {
6662 *p++ = '\\';
6663 *p++ = 't';
6664 }
6665 else if (ch == '\n') {
6666 *p++ = '\\';
6667 *p++ = 'n';
6668 }
6669 else if (ch == '\r') {
6670 *p++ = '\\';
6671 *p++ = 'r';
6672 }
6673
6674 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6675 else {
6676 *p++ = '\\';
6677 *p++ = 'x';
6678 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6679 *p++ = Py_hexdigits[ch & 0x000F];
6680 }
Tim Petersced69f82003-09-16 20:30:58 +00006681 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006682 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006683 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 *p++ = '\\';
6685 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006686 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6687 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6688 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6689 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006691 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6692 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006693
Victor Stinner62ec3312016-09-06 17:04:34 -07006694 /* Make sure that the first two digits are zero */
6695 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006696 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006697 *p++ = 'U';
6698 *p++ = '0';
6699 *p++ = '0';
6700 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6701 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6702 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6703 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6704 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6705 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708
Victor Stinner62ec3312016-09-06 17:04:34 -07006709 assert(p - PyBytes_AS_STRING(repr) > 0);
6710 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6711 return NULL;
6712 }
6713 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714}
6715
Alexander Belopolsky40018472011-02-26 01:02:56 +00006716PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006717PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6718 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006720 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006721 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006722 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006724 }
6725
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006726 result = PyUnicode_AsUnicodeEscapeString(tmp);
6727 Py_DECREF(tmp);
6728 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729}
6730
6731/* --- Raw Unicode Escape Codec ------------------------------------------- */
6732
Alexander Belopolsky40018472011-02-26 01:02:56 +00006733PyObject *
6734PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006735 Py_ssize_t size,
6736 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006738 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006739 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006741 PyObject *errorHandler = NULL;
6742 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006743
Victor Stinner62ec3312016-09-06 17:04:34 -07006744 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006745 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006746 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006747
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 /* Escaped strings will always be longer than the resulting
6749 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006750 length after conversion to the true value. (But decoding error
6751 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006752 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006753 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006754 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6755 goto onError;
6756 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006757
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 end = s + size;
6759 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006760 unsigned char c = (unsigned char) *s++;
6761 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006762 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006763 Py_ssize_t startinpos;
6764 Py_ssize_t endinpos;
6765 const char *message;
6766
6767#define WRITE_CHAR(ch) \
6768 do { \
6769 if (ch <= writer.maxchar) { \
6770 assert(writer.pos < writer.size); \
6771 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6772 } \
6773 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6774 goto onError; \
6775 } \
6776 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006779 if (c != '\\' || s >= end) {
6780 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006782 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006783
Victor Stinner62ec3312016-09-06 17:04:34 -07006784 c = (unsigned char) *s++;
6785 if (c == 'u') {
6786 count = 4;
6787 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006789 else if (c == 'U') {
6790 count = 8;
6791 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006792 }
6793 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006794 assert(writer.pos < writer.size);
6795 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6796 WRITE_CHAR(c);
6797 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006798 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006799 startinpos = s - starts - 2;
6800
6801 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6802 for (ch = 0; count && s < end; ++s, --count) {
6803 c = (unsigned char)*s;
6804 ch <<= 4;
6805 if (c >= '0' && c <= '9') {
6806 ch += c - '0';
6807 }
6808 else if (c >= 'a' && c <= 'f') {
6809 ch += c - ('a' - 10);
6810 }
6811 else if (c >= 'A' && c <= 'F') {
6812 ch += c - ('A' - 10);
6813 }
6814 else {
6815 break;
6816 }
6817 }
6818 if (!count) {
6819 if (ch <= MAX_UNICODE) {
6820 WRITE_CHAR(ch);
6821 continue;
6822 }
6823 message = "\\Uxxxxxxxx out of range";
6824 }
6825
6826 endinpos = s-starts;
6827 writer.min_length = end - s + writer.pos;
6828 if (unicode_decode_call_errorhandler_writer(
6829 errors, &errorHandler,
6830 "rawunicodeescape", message,
6831 &starts, &end, &startinpos, &endinpos, &exc, &s,
6832 &writer)) {
6833 goto onError;
6834 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006835 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006836
6837#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006839 Py_XDECREF(errorHandler);
6840 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006841 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006842
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006844 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006845 Py_XDECREF(errorHandler);
6846 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006848
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849}
6850
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006851
Alexander Belopolsky40018472011-02-26 01:02:56 +00006852PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006853PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854{
Victor Stinner62ec3312016-09-06 17:04:34 -07006855 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006857 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006858 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006859 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006860 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006862 if (!PyUnicode_Check(unicode)) {
6863 PyErr_BadArgument();
6864 return NULL;
6865 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006866 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006867 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006868 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006869 kind = PyUnicode_KIND(unicode);
6870 data = PyUnicode_DATA(unicode);
6871 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006872 if (kind == PyUnicode_1BYTE_KIND) {
6873 return PyBytes_FromStringAndSize(data, len);
6874 }
Victor Stinner0e368262011-11-10 20:12:49 +01006875
Victor Stinner62ec3312016-09-06 17:04:34 -07006876 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6877 bytes, and 1 byte characters 4. */
6878 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006879
Victor Stinner62ec3312016-09-06 17:04:34 -07006880 if (len > PY_SSIZE_T_MAX / expandsize) {
6881 return PyErr_NoMemory();
6882 }
6883 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6884 if (repr == NULL) {
6885 return NULL;
6886 }
6887 if (len == 0) {
6888 return repr;
6889 }
6890
6891 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006892 for (pos = 0; pos < len; pos++) {
6893 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006894
Victor Stinner62ec3312016-09-06 17:04:34 -07006895 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6896 if (ch < 0x100) {
6897 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006898 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006899 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006900 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 *p++ = '\\';
6902 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006903 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6904 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6905 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6906 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006908 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6909 else {
6910 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6911 *p++ = '\\';
6912 *p++ = 'U';
6913 *p++ = '0';
6914 *p++ = '0';
6915 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6916 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6917 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6918 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6919 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6920 *p++ = Py_hexdigits[ch & 15];
6921 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006923
Victor Stinner62ec3312016-09-06 17:04:34 -07006924 assert(p > PyBytes_AS_STRING(repr));
6925 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6926 return NULL;
6927 }
6928 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929}
6930
Alexander Belopolsky40018472011-02-26 01:02:56 +00006931PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006932PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6933 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006935 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006936 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006937 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006938 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006939 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6940 Py_DECREF(tmp);
6941 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942}
6943
/* --- Latin-1 Codec ------------------------------------------------------ */
6945
Alexander Belopolsky40018472011-02-26 01:02:56 +00006946PyObject *
6947PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006948 Py_ssize_t size,
6949 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006952 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953}
6954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006955/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006956static void
6957make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006958 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006959 PyObject *unicode,
6960 Py_ssize_t startpos, Py_ssize_t endpos,
6961 const char *reason)
6962{
6963 if (*exceptionObject == NULL) {
6964 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006965 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006966 encoding, unicode, startpos, endpos, reason);
6967 }
6968 else {
6969 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6970 goto onError;
6971 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6972 goto onError;
6973 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6974 goto onError;
6975 return;
6976 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006977 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006978 }
6979}
6980
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006981/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006982static void
6983raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006984 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006985 PyObject *unicode,
6986 Py_ssize_t startpos, Py_ssize_t endpos,
6987 const char *reason)
6988{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006989 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006990 encoding, unicode, startpos, endpos, reason);
6991 if (*exceptionObject != NULL)
6992 PyCodec_StrictErrors(*exceptionObject);
6993}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006994
6995/* error handling callback helper:
6996 build arguments, call the callback and check the arguments,
6997 put the result into newpos and return the replacement string, which
6998 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006999static PyObject *
7000unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007001 PyObject **errorHandler,
7002 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007003 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007004 Py_ssize_t startpos, Py_ssize_t endpos,
7005 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007006{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02007007 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007008 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007009 PyObject *restuple;
7010 PyObject *resunicode;
7011
7012 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007014 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007016 }
7017
Benjamin Petersonbac79492012-01-14 13:34:47 -05007018 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007019 return NULL;
7020 len = PyUnicode_GET_LENGTH(unicode);
7021
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007022 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007023 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007024 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007026
Petr Viktorinffd97532020-02-11 17:46:57 +01007027 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007028 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007030 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007031 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 Py_DECREF(restuple);
7033 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007034 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007035 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 &resunicode, newpos)) {
7037 Py_DECREF(restuple);
7038 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007039 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007040 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7041 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7042 Py_DECREF(restuple);
7043 return NULL;
7044 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007045 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007046 *newpos = len + *newpos;
7047 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007048 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 Py_DECREF(restuple);
7050 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007051 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007052 Py_INCREF(resunicode);
7053 Py_DECREF(restuple);
7054 return resunicode;
7055}
7056
Alexander Belopolsky40018472011-02-26 01:02:56 +00007057static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007058unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007059 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007060 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007061{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007062 /* input state */
7063 Py_ssize_t pos=0, size;
7064 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007065 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007066 /* pointer into the output */
7067 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007068 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7069 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007070 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007071 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007072 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007073 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007074 /* output object */
7075 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007076
Benjamin Petersonbac79492012-01-14 13:34:47 -05007077 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007078 return NULL;
7079 size = PyUnicode_GET_LENGTH(unicode);
7080 kind = PyUnicode_KIND(unicode);
7081 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007082 /* allocate enough for a simple encoding without
7083 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007084 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007085 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007086
7087 _PyBytesWriter_Init(&writer);
7088 str = _PyBytesWriter_Alloc(&writer, size);
7089 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007090 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007091
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007092 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007093 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007094
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007096 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007098 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007099 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007100 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007102 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007103 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007104 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007105 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007107
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007108 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007109 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007110
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007111 /* Only overallocate the buffer if it's not the last write */
7112 writer.overallocate = (collend < size);
7113
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007115 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007116 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007117
7118 switch (error_handler) {
7119 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007120 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007122
7123 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007124 memset(str, '?', collend - collstart);
7125 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007126 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007127 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007128 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 break;
Victor Stinner50149202015-09-22 00:26:54 +02007130
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007131 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007132 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007133 writer.min_size -= (collend - collstart);
7134 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007135 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007136 if (str == NULL)
7137 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007138 pos = collend;
7139 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007140
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007141 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007142 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007143 writer.min_size -= (collend - collstart);
7144 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007145 unicode, collstart, collend);
7146 if (str == NULL)
7147 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007148 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007149 break;
Victor Stinner50149202015-09-22 00:26:54 +02007150
Victor Stinnerc3713e92015-09-29 12:32:13 +02007151 case _Py_ERROR_SURROGATEESCAPE:
7152 for (i = collstart; i < collend; ++i) {
7153 ch = PyUnicode_READ(kind, data, i);
7154 if (ch < 0xdc80 || 0xdcff < ch) {
7155 /* Not a UTF-8b surrogate */
7156 break;
7157 }
7158 *str++ = (char)(ch - 0xdc00);
7159 ++pos;
7160 }
7161 if (i >= collend)
7162 break;
7163 collstart = pos;
7164 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007165 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007166
Benjamin Peterson29060642009-01-31 22:14:21 +00007167 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007168 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7169 encoding, reason, unicode, &exc,
7170 collstart, collend, &newpos);
7171 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007173
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007174 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007175 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007176
Victor Stinner6bd525b2015-10-09 13:10:05 +02007177 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007178 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007179 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007180 PyBytes_AS_STRING(rep),
7181 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007182 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007183 else {
7184 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007185
Victor Stinner6bd525b2015-10-09 13:10:05 +02007186 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007187 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007188
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007189 if (limit == 256 ?
7190 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7191 !PyUnicode_IS_ASCII(rep))
7192 {
7193 /* Not all characters are smaller than limit */
7194 raise_encode_exception(&exc, encoding, unicode,
7195 collstart, collend, reason);
7196 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007198 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7199 str = _PyBytesWriter_WriteBytes(&writer, str,
7200 PyUnicode_DATA(rep),
7201 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007203 if (str == NULL)
7204 goto onError;
7205
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007206 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007207 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007208 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007209
7210 /* If overallocation was disabled, ensure that it was the last
7211 write. Otherwise, we missed an optimization */
7212 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007213 }
7214 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007215
Victor Stinner50149202015-09-22 00:26:54 +02007216 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007217 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007218 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007219
7220 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007221 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007222 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007223 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007224 Py_XDECREF(exc);
7225 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007226}
7227
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007228/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007229PyObject *
7230PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007231 Py_ssize_t size,
7232 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007234 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007235 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007236 if (unicode == NULL)
7237 return NULL;
7238 result = unicode_encode_ucs1(unicode, errors, 256);
7239 Py_DECREF(unicode);
7240 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241}
7242
Alexander Belopolsky40018472011-02-26 01:02:56 +00007243PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007244_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007245{
7246 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 PyErr_BadArgument();
7248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007250 if (PyUnicode_READY(unicode) == -1)
7251 return NULL;
7252 /* Fast path: if it is a one-byte string, construct
7253 bytes object directly. */
7254 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7255 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7256 PyUnicode_GET_LENGTH(unicode));
7257 /* Non-Latin-1 characters present. Defer to above function to
7258 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007259 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007260}
7261
7262PyObject*
7263PyUnicode_AsLatin1String(PyObject *unicode)
7264{
7265 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266}
7267
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7269
Alexander Belopolsky40018472011-02-26 01:02:56 +00007270PyObject *
7271PyUnicode_DecodeASCII(const char *s,
7272 Py_ssize_t size,
7273 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007275 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007276 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007277 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007278 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007279 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007280
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007282 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007283
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007285 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007286 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007288
Inada Naoki770847a2019-06-24 12:30:24 +09007289 // Shortcut for simple case
7290 PyObject *u = PyUnicode_New(size, 127);
7291 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007292 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007293 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007294 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007295 if (outpos == size) {
7296 return u;
7297 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007298
Inada Naoki770847a2019-06-24 12:30:24 +09007299 _PyUnicodeWriter writer;
7300 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007301 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007302
Inada Naoki770847a2019-06-24 12:30:24 +09007303 s += outpos;
7304 int kind = writer.kind;
7305 void *data = writer.data;
7306 Py_ssize_t startinpos, endinpos;
7307
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007308 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007309 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007310 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007311 PyUnicode_WRITE(kind, data, writer.pos, c);
7312 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007314 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007315 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007316
7317 /* byte outsize range 0x00..0x7f: call the error handler */
7318
7319 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007320 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007321
7322 switch (error_handler)
7323 {
7324 case _Py_ERROR_REPLACE:
7325 case _Py_ERROR_SURROGATEESCAPE:
7326 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007327 but we may switch to UCS2 at the first write */
7328 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7329 goto onError;
7330 kind = writer.kind;
7331 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007332
7333 if (error_handler == _Py_ERROR_REPLACE)
7334 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7335 else
7336 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7337 writer.pos++;
7338 ++s;
7339 break;
7340
7341 case _Py_ERROR_IGNORE:
7342 ++s;
7343 break;
7344
7345 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007346 startinpos = s-starts;
7347 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007348 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007349 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 "ascii", "ordinal not in range(128)",
7351 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007352 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007353 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007354 kind = writer.kind;
7355 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007358 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007359 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007360 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007361
Benjamin Peterson29060642009-01-31 22:14:21 +00007362 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007363 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007364 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007365 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007366 return NULL;
7367}
7368
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007369/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007370PyObject *
7371PyUnicode_EncodeASCII(const Py_UNICODE *p,
7372 Py_ssize_t size,
7373 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007374{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007375 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007376 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007377 if (unicode == NULL)
7378 return NULL;
7379 result = unicode_encode_ucs1(unicode, errors, 128);
7380 Py_DECREF(unicode);
7381 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382}
7383
Alexander Belopolsky40018472011-02-26 01:02:56 +00007384PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007385_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386{
7387 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 PyErr_BadArgument();
7389 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007391 if (PyUnicode_READY(unicode) == -1)
7392 return NULL;
7393 /* Fast path: if it is an ASCII-only string, construct bytes object
7394 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007395 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007396 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7397 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007398 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007399}
7400
7401PyObject *
7402PyUnicode_AsASCIIString(PyObject *unicode)
7403{
7404 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405}
7406
Steve Dowercc16be82016-09-08 10:35:16 -07007407#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007408
/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007410
/* NEED_RETRY is defined when `int` is narrower than `size_t`. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#  define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Fallback definition for headers that do not provide this flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7424
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007425static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007426code_page_name(UINT code_page, PyObject **obj)
7427{
7428 *obj = NULL;
7429 if (code_page == CP_ACP)
7430 return "mbcs";
7431 if (code_page == CP_UTF7)
7432 return "CP_UTF7";
7433 if (code_page == CP_UTF8)
7434 return "CP_UTF8";
7435
7436 *obj = PyBytes_FromFormat("cp%u", code_page);
7437 if (*obj == NULL)
7438 return NULL;
7439 return PyBytes_AS_STRING(*obj);
7440}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007441
Victor Stinner3a50e702011-10-18 21:21:00 +02007442static DWORD
7443decode_code_page_flags(UINT code_page)
7444{
7445 if (code_page == CP_UTF7) {
7446 /* The CP_UTF7 decoder only supports flags=0 */
7447 return 0;
7448 }
7449 else
7450 return MB_ERR_INVALID_CHARS;
7451}
7452
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007453/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 * Decode a byte string from a Windows code page into unicode object in strict
7455 * mode.
7456 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007457 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7458 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007459 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007460static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007461decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007462 wchar_t **buf,
7463 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 const char *in,
7465 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007466{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007467 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007468 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007470
7471 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007473 while ((outsize = MultiByteToWideChar(code_page, flags,
7474 in, insize, NULL, 0)) <= 0)
7475 {
7476 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7477 goto error;
7478 }
7479 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7480 flags = 0;
7481 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007482
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007483 /* Extend a wchar_t* buffer */
7484 Py_ssize_t n = *bufsize; /* Get the current length */
7485 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7486 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007487 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007488 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007489
7490 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7492 if (outsize <= 0)
7493 goto error;
7494 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007495
Victor Stinner3a50e702011-10-18 21:21:00 +02007496error:
7497 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7498 return -2;
7499 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007500 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007501}
7502
Victor Stinner3a50e702011-10-18 21:21:00 +02007503/*
7504 * Decode a byte string from a code page into unicode object with an error
7505 * handler.
7506 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007507 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 * UnicodeDecodeError exception and returns -1 on error.
7509 */
7510static int
7511decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007512 wchar_t **buf,
7513 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007514 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007515 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007516{
7517 const char *startin = in;
7518 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007519 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 /* Ideally, we should get reason from FormatMessage. This is the Windows
7521 2000 English version of the message. */
7522 const char *reason = "No mapping for the Unicode character exists "
7523 "in the target code page.";
7524 /* each step cannot decode more than 1 character, but a character can be
7525 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007526 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007527 int insize;
7528 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007529 PyObject *errorHandler = NULL;
7530 PyObject *exc = NULL;
7531 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007532 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007533 DWORD err;
7534 int ret = -1;
7535
7536 assert(size > 0);
7537
7538 encoding = code_page_name(code_page, &encoding_obj);
7539 if (encoding == NULL)
7540 return -1;
7541
Victor Stinner7d00cc12014-03-17 23:08:06 +01007542 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007543 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7544 UnicodeDecodeError. */
7545 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7546 if (exc != NULL) {
7547 PyCodec_StrictErrors(exc);
7548 Py_CLEAR(exc);
7549 }
7550 goto error;
7551 }
7552
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007553 /* Extend a wchar_t* buffer */
7554 Py_ssize_t n = *bufsize; /* Get the current length */
7555 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7556 PyErr_NoMemory();
7557 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007559 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7560 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007561 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007562 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007563
7564 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007565 while (in < endin)
7566 {
7567 /* Decode a character */
7568 insize = 1;
7569 do
7570 {
7571 outsize = MultiByteToWideChar(code_page, flags,
7572 in, insize,
7573 buffer, Py_ARRAY_LENGTH(buffer));
7574 if (outsize > 0)
7575 break;
7576 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007577 if (err == ERROR_INVALID_FLAGS && flags) {
7578 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7579 flags = 0;
7580 continue;
7581 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007582 if (err != ERROR_NO_UNICODE_TRANSLATION
7583 && err != ERROR_INSUFFICIENT_BUFFER)
7584 {
7585 PyErr_SetFromWindowsErr(0);
7586 goto error;
7587 }
7588 insize++;
7589 }
7590 /* 4=maximum length of a UTF-8 sequence */
7591 while (insize <= 4 && (in + insize) <= endin);
7592
7593 if (outsize <= 0) {
7594 Py_ssize_t startinpos, endinpos, outpos;
7595
Victor Stinner7d00cc12014-03-17 23:08:06 +01007596 /* last character in partial decode? */
7597 if (in + insize >= endin && !final)
7598 break;
7599
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 startinpos = in - startin;
7601 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007602 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007603 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 errors, &errorHandler,
7605 encoding, reason,
7606 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007607 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007608 {
7609 goto error;
7610 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007611 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007612 }
7613 else {
7614 in += insize;
7615 memcpy(out, buffer, outsize * sizeof(wchar_t));
7616 out += outsize;
7617 }
7618 }
7619
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007620 /* Shrink the buffer */
7621 assert(out - *buf <= *bufsize);
7622 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007623 /* (in - startin) <= size and size is an int */
7624 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007625
7626error:
7627 Py_XDECREF(encoding_obj);
7628 Py_XDECREF(errorHandler);
7629 Py_XDECREF(exc);
7630 return ret;
7631}
7632
Victor Stinner3a50e702011-10-18 21:21:00 +02007633static PyObject *
7634decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007635 const char *s, Py_ssize_t size,
7636 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007637{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007638 wchar_t *buf = NULL;
7639 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007640 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007641
Victor Stinner3a50e702011-10-18 21:21:00 +02007642 if (code_page < 0) {
7643 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7644 return NULL;
7645 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007646 if (size < 0) {
7647 PyErr_BadInternalCall();
7648 return NULL;
7649 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007650
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007651 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007652 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007653
Victor Stinner76a31a62011-11-04 00:05:13 +01007654 do
7655 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007656#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007657 if (size > DECODING_CHUNK_SIZE) {
7658 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007659 final = 0;
7660 done = 0;
7661 }
7662 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007663#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007664 {
7665 chunk_size = (int)size;
7666 final = (consumed == NULL);
7667 done = 1;
7668 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007669
Victor Stinner76a31a62011-11-04 00:05:13 +01007670 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007671 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007672 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007673 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007674 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007675
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007676 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007677 s, chunk_size);
7678 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007679 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007680 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007681 errors, final);
7682 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007683
7684 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007685 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007686 return NULL;
7687 }
7688
7689 if (consumed)
7690 *consumed += converted;
7691
7692 s += converted;
7693 size -= converted;
7694 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007695
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007696 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7697 PyMem_Free(buf);
7698 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007699}
7700
Alexander Belopolsky40018472011-02-26 01:02:56 +00007701PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007702PyUnicode_DecodeCodePageStateful(int code_page,
7703 const char *s,
7704 Py_ssize_t size,
7705 const char *errors,
7706 Py_ssize_t *consumed)
7707{
7708 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7709}
7710
7711PyObject *
7712PyUnicode_DecodeMBCSStateful(const char *s,
7713 Py_ssize_t size,
7714 const char *errors,
7715 Py_ssize_t *consumed)
7716{
7717 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7718}
7719
7720PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007721PyUnicode_DecodeMBCS(const char *s,
7722 Py_ssize_t size,
7723 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007724{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007725 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7726}
7727
Victor Stinner3a50e702011-10-18 21:21:00 +02007728static DWORD
7729encode_code_page_flags(UINT code_page, const char *errors)
7730{
7731 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007732 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007733 }
7734 else if (code_page == CP_UTF7) {
7735 /* CP_UTF7 only supports flags=0 */
7736 return 0;
7737 }
7738 else {
7739 if (errors != NULL && strcmp(errors, "replace") == 0)
7740 return 0;
7741 else
7742 return WC_NO_BEST_FIT_CHARS;
7743 }
7744}
7745
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007746/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007747 * Encode a Unicode string to a Windows code page into a byte string in strict
7748 * mode.
7749 *
7750 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007751 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007752 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007753static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007754encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007755 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007756 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007757{
Victor Stinner554f3f02010-06-16 23:33:54 +00007758 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007759 BOOL *pusedDefaultChar = &usedDefaultChar;
7760 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007761 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007762 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007763 const DWORD flags = encode_code_page_flags(code_page, NULL);
7764 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007765 /* Create a substring so that we can get the UTF-16 representation
7766 of just the slice under consideration. */
7767 PyObject *substring;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007768 int ret = -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007769
Martin v. Löwis3d325192011-11-04 18:23:06 +01007770 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007771
Victor Stinner3a50e702011-10-18 21:21:00 +02007772 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007773 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007774 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007775 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007776
Victor Stinner2fc507f2011-11-04 20:06:39 +01007777 substring = PyUnicode_Substring(unicode, offset, offset+len);
7778 if (substring == NULL)
7779 return -1;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007780#if USE_UNICODE_WCHAR_CACHE
7781_Py_COMP_DIAG_PUSH
7782_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner2fc507f2011-11-04 20:06:39 +01007783 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7784 if (p == NULL) {
7785 Py_DECREF(substring);
7786 return -1;
7787 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007788_Py_COMP_DIAG_POP
7789#else /* USE_UNICODE_WCHAR_CACHE */
7790 p = PyUnicode_AsWideCharString(substring, &size);
7791 Py_CLEAR(substring);
7792 if (p == NULL) {
7793 return -1;
7794 }
7795#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinner9f067f42013-06-05 00:21:31 +02007796 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007797
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007798 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007799 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007800 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007801 NULL, 0,
7802 NULL, pusedDefaultChar);
7803 if (outsize <= 0)
7804 goto error;
7805 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007806 if (pusedDefaultChar && *pusedDefaultChar) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007807 ret = -2;
7808 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007809 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007810
Victor Stinner3a50e702011-10-18 21:21:00 +02007811 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007813 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007814 if (*outbytes == NULL) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007815 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007816 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007817 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007818 }
7819 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007821 const Py_ssize_t n = PyBytes_Size(*outbytes);
7822 if (outsize > PY_SSIZE_T_MAX - n) {
7823 PyErr_NoMemory();
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007824 goto done;
Victor Stinner3a50e702011-10-18 21:21:00 +02007825 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007826 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007827 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007828 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007829 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007830 }
7831
7832 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007833 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007834 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007835 out, outsize,
7836 NULL, pusedDefaultChar);
7837 if (outsize <= 0)
7838 goto error;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007839 if (pusedDefaultChar && *pusedDefaultChar) {
7840 ret = -2;
7841 goto done;
7842 }
7843 ret = 0;
7844
7845done:
7846#if USE_UNICODE_WCHAR_CACHE
7847 Py_DECREF(substring);
7848#else /* USE_UNICODE_WCHAR_CACHE */
7849 PyMem_Free(p);
7850#endif /* USE_UNICODE_WCHAR_CACHE */
7851 return ret;
Victor Stinner554f3f02010-06-16 23:33:54 +00007852
Victor Stinner3a50e702011-10-18 21:21:00 +02007853error:
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007854 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7855 ret = -2;
7856 goto done;
7857 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007858 PyErr_SetFromWindowsErr(0);
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007859 goto done;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007860}
7861
Victor Stinner3a50e702011-10-18 21:21:00 +02007862/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007863 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007864 * error handler.
7865 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007866 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007867 * -1 on other error.
7868 */
7869static int
7870encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007871 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007872 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007873{
Victor Stinner3a50e702011-10-18 21:21:00 +02007874 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007875 Py_ssize_t pos = unicode_offset;
7876 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007877 /* Ideally, we should get reason from FormatMessage. This is the Windows
7878 2000 English version of the message. */
7879 const char *reason = "invalid character";
7880 /* 4=maximum length of a UTF-8 sequence */
7881 char buffer[4];
7882 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7883 Py_ssize_t outsize;
7884 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007885 PyObject *errorHandler = NULL;
7886 PyObject *exc = NULL;
7887 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007888 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007889 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007890 PyObject *rep;
7891 int ret = -1;
7892
7893 assert(insize > 0);
7894
7895 encoding = code_page_name(code_page, &encoding_obj);
7896 if (encoding == NULL)
7897 return -1;
7898
7899 if (errors == NULL || strcmp(errors, "strict") == 0) {
7900 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7901 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007902 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007903 if (exc != NULL) {
7904 PyCodec_StrictErrors(exc);
7905 Py_DECREF(exc);
7906 }
7907 Py_XDECREF(encoding_obj);
7908 return -1;
7909 }
7910
7911 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7912 pusedDefaultChar = &usedDefaultChar;
7913 else
7914 pusedDefaultChar = NULL;
7915
7916 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7917 PyErr_NoMemory();
7918 goto error;
7919 }
7920 outsize = insize * Py_ARRAY_LENGTH(buffer);
7921
7922 if (*outbytes == NULL) {
7923 /* Create string object */
7924 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7925 if (*outbytes == NULL)
7926 goto error;
7927 out = PyBytes_AS_STRING(*outbytes);
7928 }
7929 else {
7930 /* Extend string object */
7931 Py_ssize_t n = PyBytes_Size(*outbytes);
7932 if (n > PY_SSIZE_T_MAX - outsize) {
7933 PyErr_NoMemory();
7934 goto error;
7935 }
7936 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7937 goto error;
7938 out = PyBytes_AS_STRING(*outbytes) + n;
7939 }
7940
7941 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007942 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007943 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007944 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7945 wchar_t chars[2];
7946 int charsize;
7947 if (ch < 0x10000) {
7948 chars[0] = (wchar_t)ch;
7949 charsize = 1;
7950 }
7951 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007952 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7953 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007954 charsize = 2;
7955 }
7956
Victor Stinner3a50e702011-10-18 21:21:00 +02007957 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007958 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007959 buffer, Py_ARRAY_LENGTH(buffer),
7960 NULL, pusedDefaultChar);
7961 if (outsize > 0) {
7962 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7963 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007964 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007965 memcpy(out, buffer, outsize);
7966 out += outsize;
7967 continue;
7968 }
7969 }
7970 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7971 PyErr_SetFromWindowsErr(0);
7972 goto error;
7973 }
7974
Victor Stinner3a50e702011-10-18 21:21:00 +02007975 rep = unicode_encode_call_errorhandler(
7976 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007977 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007978 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007979 if (rep == NULL)
7980 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007981 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007982
7983 if (PyBytes_Check(rep)) {
7984 outsize = PyBytes_GET_SIZE(rep);
7985 if (outsize != 1) {
7986 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7987 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7988 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7989 Py_DECREF(rep);
7990 goto error;
7991 }
7992 out = PyBytes_AS_STRING(*outbytes) + offset;
7993 }
7994 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7995 out += outsize;
7996 }
7997 else {
7998 Py_ssize_t i;
7999 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008000 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02008001
Benjamin Petersonbac79492012-01-14 13:34:47 -05008002 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02008003 Py_DECREF(rep);
8004 goto error;
8005 }
8006
8007 outsize = PyUnicode_GET_LENGTH(rep);
8008 if (outsize != 1) {
8009 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8010 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8011 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8012 Py_DECREF(rep);
8013 goto error;
8014 }
8015 out = PyBytes_AS_STRING(*outbytes) + offset;
8016 }
8017 kind = PyUnicode_KIND(rep);
8018 data = PyUnicode_DATA(rep);
8019 for (i=0; i < outsize; i++) {
8020 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8021 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008022 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008023 encoding, unicode,
8024 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02008025 "unable to encode error handler result to ASCII");
8026 Py_DECREF(rep);
8027 goto error;
8028 }
8029 *out = (unsigned char)ch;
8030 out++;
8031 }
8032 }
8033 Py_DECREF(rep);
8034 }
8035 /* write a NUL byte */
8036 *out = 0;
8037 outsize = out - PyBytes_AS_STRING(*outbytes);
8038 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8039 if (_PyBytes_Resize(outbytes, outsize) < 0)
8040 goto error;
8041 ret = 0;
8042
8043error:
8044 Py_XDECREF(encoding_obj);
8045 Py_XDECREF(errorHandler);
8046 Py_XDECREF(exc);
8047 return ret;
8048}
8049
Victor Stinner3a50e702011-10-18 21:21:00 +02008050static PyObject *
8051encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008052 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008053 const char *errors)
8054{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008055 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008056 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008057 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008058 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008059
Victor Stinner29dacf22015-01-26 16:41:32 +01008060 if (!PyUnicode_Check(unicode)) {
8061 PyErr_BadArgument();
8062 return NULL;
8063 }
8064
Benjamin Petersonbac79492012-01-14 13:34:47 -05008065 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008066 return NULL;
8067 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008068
Victor Stinner3a50e702011-10-18 21:21:00 +02008069 if (code_page < 0) {
8070 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8071 return NULL;
8072 }
8073
Martin v. Löwis3d325192011-11-04 18:23:06 +01008074 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008075 return PyBytes_FromStringAndSize(NULL, 0);
8076
Victor Stinner7581cef2011-11-03 22:32:33 +01008077 offset = 0;
8078 do
8079 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008080#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008081 if (len > DECODING_CHUNK_SIZE) {
8082 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008083 done = 0;
8084 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008085 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008086#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008087 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008088 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008089 done = 1;
8090 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008091
Victor Stinner76a31a62011-11-04 00:05:13 +01008092 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008093 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008094 errors);
8095 if (ret == -2)
8096 ret = encode_code_page_errors(code_page, &outbytes,
8097 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008098 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008099 if (ret < 0) {
8100 Py_XDECREF(outbytes);
8101 return NULL;
8102 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008103
Victor Stinner7581cef2011-11-03 22:32:33 +01008104 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008105 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008106 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008107
Victor Stinner3a50e702011-10-18 21:21:00 +02008108 return outbytes;
8109}
8110
8111PyObject *
8112PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8113 Py_ssize_t size,
8114 const char *errors)
8115{
Victor Stinner7581cef2011-11-03 22:32:33 +01008116 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008117 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008118 if (unicode == NULL)
8119 return NULL;
8120 res = encode_code_page(CP_ACP, unicode, errors);
8121 Py_DECREF(unicode);
8122 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008123}
8124
8125PyObject *
8126PyUnicode_EncodeCodePage(int code_page,
8127 PyObject *unicode,
8128 const char *errors)
8129{
Victor Stinner7581cef2011-11-03 22:32:33 +01008130 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008131}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008132
Alexander Belopolsky40018472011-02-26 01:02:56 +00008133PyObject *
8134PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008135{
Victor Stinner7581cef2011-11-03 22:32:33 +01008136 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008137}
8138
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008139#undef NEED_RETRY
8140
Steve Dowercc16be82016-09-08 10:35:16 -07008141#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008142
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143/* --- Character Mapping Codec -------------------------------------------- */
8144
Victor Stinnerfb161b12013-04-18 01:44:27 +02008145static int
8146charmap_decode_string(const char *s,
8147 Py_ssize_t size,
8148 PyObject *mapping,
8149 const char *errors,
8150 _PyUnicodeWriter *writer)
8151{
8152 const char *starts = s;
8153 const char *e;
8154 Py_ssize_t startinpos, endinpos;
8155 PyObject *errorHandler = NULL, *exc = NULL;
8156 Py_ssize_t maplen;
8157 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008158 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008159 Py_UCS4 x;
8160 unsigned char ch;
8161
8162 if (PyUnicode_READY(mapping) == -1)
8163 return -1;
8164
8165 maplen = PyUnicode_GET_LENGTH(mapping);
8166 mapdata = PyUnicode_DATA(mapping);
8167 mapkind = PyUnicode_KIND(mapping);
8168
8169 e = s + size;
8170
8171 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8172 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8173 * is disabled in encoding aliases, latin1 is preferred because
8174 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008175 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008176 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8177 Py_UCS4 maxchar = writer->maxchar;
8178
8179 assert (writer->kind == PyUnicode_1BYTE_KIND);
8180 while (s < e) {
8181 ch = *s;
8182 x = mapdata_ucs1[ch];
8183 if (x > maxchar) {
8184 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8185 goto onError;
8186 maxchar = writer->maxchar;
8187 outdata = (Py_UCS1 *)writer->data;
8188 }
8189 outdata[writer->pos] = x;
8190 writer->pos++;
8191 ++s;
8192 }
8193 return 0;
8194 }
8195
8196 while (s < e) {
8197 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8198 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008199 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008200 if (outkind == PyUnicode_1BYTE_KIND) {
8201 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8202 Py_UCS4 maxchar = writer->maxchar;
8203 while (s < e) {
8204 ch = *s;
8205 x = mapdata_ucs2[ch];
8206 if (x > maxchar)
8207 goto Error;
8208 outdata[writer->pos] = x;
8209 writer->pos++;
8210 ++s;
8211 }
8212 break;
8213 }
8214 else if (outkind == PyUnicode_2BYTE_KIND) {
8215 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8216 while (s < e) {
8217 ch = *s;
8218 x = mapdata_ucs2[ch];
8219 if (x == 0xFFFE)
8220 goto Error;
8221 outdata[writer->pos] = x;
8222 writer->pos++;
8223 ++s;
8224 }
8225 break;
8226 }
8227 }
8228 ch = *s;
8229
8230 if (ch < maplen)
8231 x = PyUnicode_READ(mapkind, mapdata, ch);
8232 else
8233 x = 0xfffe; /* invalid value */
8234Error:
8235 if (x == 0xfffe)
8236 {
8237 /* undefined mapping */
8238 startinpos = s-starts;
8239 endinpos = startinpos+1;
8240 if (unicode_decode_call_errorhandler_writer(
8241 errors, &errorHandler,
8242 "charmap", "character maps to <undefined>",
8243 &starts, &e, &startinpos, &endinpos, &exc, &s,
8244 writer)) {
8245 goto onError;
8246 }
8247 continue;
8248 }
8249
8250 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8251 goto onError;
8252 ++s;
8253 }
8254 Py_XDECREF(errorHandler);
8255 Py_XDECREF(exc);
8256 return 0;
8257
8258onError:
8259 Py_XDECREF(errorHandler);
8260 Py_XDECREF(exc);
8261 return -1;
8262}
8263
8264static int
8265charmap_decode_mapping(const char *s,
8266 Py_ssize_t size,
8267 PyObject *mapping,
8268 const char *errors,
8269 _PyUnicodeWriter *writer)
8270{
8271 const char *starts = s;
8272 const char *e;
8273 Py_ssize_t startinpos, endinpos;
8274 PyObject *errorHandler = NULL, *exc = NULL;
8275 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008276 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008277
8278 e = s + size;
8279
8280 while (s < e) {
8281 ch = *s;
8282
8283 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8284 key = PyLong_FromLong((long)ch);
8285 if (key == NULL)
8286 goto onError;
8287
8288 item = PyObject_GetItem(mapping, key);
8289 Py_DECREF(key);
8290 if (item == NULL) {
8291 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8292 /* No mapping found means: mapping is undefined. */
8293 PyErr_Clear();
8294 goto Undefined;
8295 } else
8296 goto onError;
8297 }
8298
8299 /* Apply mapping */
8300 if (item == Py_None)
8301 goto Undefined;
8302 if (PyLong_Check(item)) {
8303 long value = PyLong_AS_LONG(item);
8304 if (value == 0xFFFE)
8305 goto Undefined;
8306 if (value < 0 || value > MAX_UNICODE) {
8307 PyErr_Format(PyExc_TypeError,
Max Bernstein36353882020-10-17 13:38:21 -07008308 "character mapping must be in range(0x%x)",
Victor Stinnerfb161b12013-04-18 01:44:27 +02008309 (unsigned long)MAX_UNICODE + 1);
8310 goto onError;
8311 }
8312
8313 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8314 goto onError;
8315 }
8316 else if (PyUnicode_Check(item)) {
8317 if (PyUnicode_READY(item) == -1)
8318 goto onError;
8319 if (PyUnicode_GET_LENGTH(item) == 1) {
8320 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8321 if (value == 0xFFFE)
8322 goto Undefined;
8323 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8324 goto onError;
8325 }
8326 else {
8327 writer->overallocate = 1;
8328 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8329 goto onError;
8330 }
8331 }
8332 else {
8333 /* wrong return value */
8334 PyErr_SetString(PyExc_TypeError,
8335 "character mapping must return integer, None or str");
8336 goto onError;
8337 }
8338 Py_CLEAR(item);
8339 ++s;
8340 continue;
8341
8342Undefined:
8343 /* undefined mapping */
8344 Py_CLEAR(item);
8345 startinpos = s-starts;
8346 endinpos = startinpos+1;
8347 if (unicode_decode_call_errorhandler_writer(
8348 errors, &errorHandler,
8349 "charmap", "character maps to <undefined>",
8350 &starts, &e, &startinpos, &endinpos, &exc, &s,
8351 writer)) {
8352 goto onError;
8353 }
8354 }
8355 Py_XDECREF(errorHandler);
8356 Py_XDECREF(exc);
8357 return 0;
8358
8359onError:
8360 Py_XDECREF(item);
8361 Py_XDECREF(errorHandler);
8362 Py_XDECREF(exc);
8363 return -1;
8364}
8365
Alexander Belopolsky40018472011-02-26 01:02:56 +00008366PyObject *
8367PyUnicode_DecodeCharmap(const char *s,
8368 Py_ssize_t size,
8369 PyObject *mapping,
8370 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008372 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008373
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374 /* Default to Latin-1 */
8375 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008379 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008380 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008381 writer.min_length = size;
8382 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008384
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008385 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008386 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8387 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008388 }
8389 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008390 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8391 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008393 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008394
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008396 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397 return NULL;
8398}
8399
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008400/* Charmap encoding: the lookup table */
8401
Alexander Belopolsky40018472011-02-26 01:02:56 +00008402struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 PyObject_HEAD
8404 unsigned char level1[32];
8405 int count2, count3;
8406 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008407};
8408
8409static PyObject*
8410encoding_map_size(PyObject *obj, PyObject* args)
8411{
8412 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008413 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008415}
8416
8417static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008418 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 PyDoc_STR("Return the size (in bytes) of this object") },
8420 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008421};
8422
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008423static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 "EncodingMap", /*tp_name*/
8426 sizeof(struct encoding_map), /*tp_basicsize*/
8427 0, /*tp_itemsize*/
8428 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008429 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008430 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 0, /*tp_getattr*/
8432 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008433 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 0, /*tp_repr*/
8435 0, /*tp_as_number*/
8436 0, /*tp_as_sequence*/
8437 0, /*tp_as_mapping*/
8438 0, /*tp_hash*/
8439 0, /*tp_call*/
8440 0, /*tp_str*/
8441 0, /*tp_getattro*/
8442 0, /*tp_setattro*/
8443 0, /*tp_as_buffer*/
8444 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8445 0, /*tp_doc*/
8446 0, /*tp_traverse*/
8447 0, /*tp_clear*/
8448 0, /*tp_richcompare*/
8449 0, /*tp_weaklistoffset*/
8450 0, /*tp_iter*/
8451 0, /*tp_iternext*/
8452 encoding_map_methods, /*tp_methods*/
8453 0, /*tp_members*/
8454 0, /*tp_getset*/
8455 0, /*tp_base*/
8456 0, /*tp_dict*/
8457 0, /*tp_descr_get*/
8458 0, /*tp_descr_set*/
8459 0, /*tp_dictoffset*/
8460 0, /*tp_init*/
8461 0, /*tp_alloc*/
8462 0, /*tp_new*/
8463 0, /*tp_free*/
8464 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008465};
8466
8467PyObject*
8468PyUnicode_BuildEncodingMap(PyObject* string)
8469{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008470 PyObject *result;
8471 struct encoding_map *mresult;
8472 int i;
8473 int need_dict = 0;
8474 unsigned char level1[32];
8475 unsigned char level2[512];
8476 unsigned char *mlevel1, *mlevel2, *mlevel3;
8477 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008479 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008480 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008482
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008483 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008484 PyErr_BadArgument();
8485 return NULL;
8486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 kind = PyUnicode_KIND(string);
8488 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008489 length = PyUnicode_GET_LENGTH(string);
8490 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008491 memset(level1, 0xFF, sizeof level1);
8492 memset(level2, 0xFF, sizeof level2);
8493
8494 /* If there isn't a one-to-one mapping of NULL to \0,
8495 or if there are non-BMP characters, we need to use
8496 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008498 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008499 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008500 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 ch = PyUnicode_READ(kind, data, i);
8502 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008503 need_dict = 1;
8504 break;
8505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008506 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008507 /* unmapped character */
8508 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 l1 = ch >> 11;
8510 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008511 if (level1[l1] == 0xFF)
8512 level1[l1] = count2++;
8513 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008514 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008515 }
8516
8517 if (count2 >= 0xFF || count3 >= 0xFF)
8518 need_dict = 1;
8519
8520 if (need_dict) {
8521 PyObject *result = PyDict_New();
8522 PyObject *key, *value;
8523 if (!result)
8524 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008525 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008527 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008528 if (!key || !value)
8529 goto failed1;
8530 if (PyDict_SetItem(result, key, value) == -1)
8531 goto failed1;
8532 Py_DECREF(key);
8533 Py_DECREF(value);
8534 }
8535 return result;
8536 failed1:
8537 Py_XDECREF(key);
8538 Py_XDECREF(value);
8539 Py_DECREF(result);
8540 return NULL;
8541 }
8542
8543 /* Create a three-level trie */
8544 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8545 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008546 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008547 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008548 }
8549
8550 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008551 mresult = (struct encoding_map*)result;
8552 mresult->count2 = count2;
8553 mresult->count3 = count3;
8554 mlevel1 = mresult->level1;
8555 mlevel2 = mresult->level23;
8556 mlevel3 = mresult->level23 + 16*count2;
8557 memcpy(mlevel1, level1, 32);
8558 memset(mlevel2, 0xFF, 16*count2);
8559 memset(mlevel3, 0, 128*count3);
8560 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008561 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008562 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008563 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8564 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008565 /* unmapped character */
8566 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008567 o1 = ch>>11;
8568 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008569 i2 = 16*mlevel1[o1] + o2;
8570 if (mlevel2[i2] == 0xFF)
8571 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008572 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008573 i3 = 128*mlevel2[i2] + o3;
8574 mlevel3[i3] = i;
8575 }
8576 return result;
8577}
8578
8579static int
Victor Stinner22168992011-11-20 17:09:18 +01008580encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008581{
8582 struct encoding_map *map = (struct encoding_map*)mapping;
8583 int l1 = c>>11;
8584 int l2 = (c>>7) & 0xF;
8585 int l3 = c & 0x7F;
8586 int i;
8587
Victor Stinner22168992011-11-20 17:09:18 +01008588 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008590 if (c == 0)
8591 return 0;
8592 /* level 1*/
8593 i = map->level1[l1];
8594 if (i == 0xFF) {
8595 return -1;
8596 }
8597 /* level 2*/
8598 i = map->level23[16*i+l2];
8599 if (i == 0xFF) {
8600 return -1;
8601 }
8602 /* level 3 */
8603 i = map->level23[16*map->count2 + 128*i + l3];
8604 if (i == 0) {
8605 return -1;
8606 }
8607 return i;
8608}
8609
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008610/* Lookup the character ch in the mapping. If the character
8611 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008612 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008613static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008614charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615{
Christian Heimes217cfd12007-12-02 14:31:20 +00008616 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008617 PyObject *x;
8618
8619 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008621 x = PyObject_GetItem(mapping, w);
8622 Py_DECREF(w);
8623 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8625 /* No mapping found means: mapping is undefined. */
8626 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008627 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 } else
8629 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008631 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008633 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 long value = PyLong_AS_LONG(x);
8635 if (value < 0 || value > 255) {
8636 PyErr_SetString(PyExc_TypeError,
8637 "character mapping must be in range(256)");
8638 Py_DECREF(x);
8639 return NULL;
8640 }
8641 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008643 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 /* wrong return value */
8647 PyErr_Format(PyExc_TypeError,
8648 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008649 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 Py_DECREF(x);
8651 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652 }
8653}
8654
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008655static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008656charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008657{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008658 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8659 /* exponentially overallocate to minimize reallocations */
8660 if (requiredsize < 2*outsize)
8661 requiredsize = 2*outsize;
8662 if (_PyBytes_Resize(outobj, requiredsize))
8663 return -1;
8664 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008665}
8666
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement byte(s) were written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or a resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008671 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 space is available. Return a new reference to the object that
8673 was put in the output buffer, or Py_None, if the mapping was undefined
8674 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008675 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008676static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008677charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008678 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008680 PyObject *rep;
8681 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008682 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683
Andy Lesterdffe4c02020-03-04 07:15:20 -06008684 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008685 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008687 if (res == -1)
8688 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 if (outsize<requiredsize)
8690 if (charmapencode_resize(outobj, outpos, requiredsize))
8691 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008692 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 outstart[(*outpos)++] = (char)res;
8694 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008695 }
8696
8697 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008700 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 Py_DECREF(rep);
8702 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008703 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 if (PyLong_Check(rep)) {
8705 Py_ssize_t requiredsize = *outpos+1;
8706 if (outsize<requiredsize)
8707 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8708 Py_DECREF(rep);
8709 return enc_EXCEPTION;
8710 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008711 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008713 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 else {
8715 const char *repchars = PyBytes_AS_STRING(rep);
8716 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8717 Py_ssize_t requiredsize = *outpos+repsize;
8718 if (outsize<requiredsize)
8719 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8720 Py_DECREF(rep);
8721 return enc_EXCEPTION;
8722 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008723 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 memcpy(outstart + *outpos, repchars, repsize);
8725 *outpos += repsize;
8726 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008728 Py_DECREF(rep);
8729 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730}
8731
8732/* handle an error in PyUnicode_EncodeCharmap
8733 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008734static int
8735charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008736 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008738 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008739 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008740{
8741 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008742 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008743 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008744 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008745 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008746 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008747 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008748 Py_ssize_t collstartpos = *inpos;
8749 Py_ssize_t collendpos = *inpos+1;
8750 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008751 const char *encoding = "charmap";
8752 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008753 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008754 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008755 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008756
Benjamin Petersonbac79492012-01-14 13:34:47 -05008757 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008758 return -1;
8759 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008760 /* find all unencodable characters */
8761 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008762 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008763 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008764 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008765 val = encoding_map_lookup(ch, mapping);
8766 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 break;
8768 ++collendpos;
8769 continue;
8770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008771
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008772 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8773 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 if (rep==NULL)
8775 return -1;
8776 else if (rep!=Py_None) {
8777 Py_DECREF(rep);
8778 break;
8779 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008780 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008782 }
8783 /* cache callback name lookup
8784 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008785 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008786 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008787
8788 switch (*error_handler) {
8789 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008790 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008791 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008792
8793 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008794 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 x = charmapencode_output('?', mapping, res, respos);
8796 if (x==enc_EXCEPTION) {
8797 return -1;
8798 }
8799 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008800 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 return -1;
8802 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008803 }
8804 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008805 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008806 *inpos = collendpos;
8807 break;
Victor Stinner50149202015-09-22 00:26:54 +02008808
8809 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008810 /* generate replacement (temporarily (mis)uses p) */
8811 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008812 char buffer[2+29+1+1];
8813 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008814 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 for (cp = buffer; *cp; ++cp) {
8816 x = charmapencode_output(*cp, mapping, res, respos);
8817 if (x==enc_EXCEPTION)
8818 return -1;
8819 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008820 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 return -1;
8822 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008823 }
8824 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008825 *inpos = collendpos;
8826 break;
Victor Stinner50149202015-09-22 00:26:54 +02008827
Benjamin Peterson14339b62009-01-31 16:36:08 +00008828 default:
Victor Stinner50149202015-09-22 00:26:54 +02008829 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008830 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008831 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008832 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008834 if (PyBytes_Check(repunicode)) {
8835 /* Directly copy bytes result to output. */
8836 Py_ssize_t outsize = PyBytes_Size(*res);
8837 Py_ssize_t requiredsize;
8838 repsize = PyBytes_Size(repunicode);
8839 requiredsize = *respos + repsize;
8840 if (requiredsize > outsize)
8841 /* Make room for all additional bytes. */
8842 if (charmapencode_resize(res, respos, requiredsize)) {
8843 Py_DECREF(repunicode);
8844 return -1;
8845 }
8846 memcpy(PyBytes_AsString(*res) + *respos,
8847 PyBytes_AsString(repunicode), repsize);
8848 *respos += repsize;
8849 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008850 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008851 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008852 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008853 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008854 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008855 Py_DECREF(repunicode);
8856 return -1;
8857 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008858 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008859 data = PyUnicode_DATA(repunicode);
8860 kind = PyUnicode_KIND(repunicode);
8861 for (index = 0; index < repsize; index++) {
8862 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8863 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008865 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008866 return -1;
8867 }
8868 else if (x==enc_FAILED) {
8869 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008870 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008871 return -1;
8872 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008873 }
8874 *inpos = newpos;
8875 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008876 }
8877 return 0;
8878}
8879
Alexander Belopolsky40018472011-02-26 01:02:56 +00008880PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008881_PyUnicode_EncodeCharmap(PyObject *unicode,
8882 PyObject *mapping,
8883 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008885 /* output object */
8886 PyObject *res = NULL;
8887 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008888 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008889 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008890 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008891 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008892 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008893 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008894 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008895 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008896 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897
Benjamin Petersonbac79492012-01-14 13:34:47 -05008898 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008899 return NULL;
8900 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008901 data = PyUnicode_DATA(unicode);
8902 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008903
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904 /* Default to Latin-1 */
8905 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008906 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008908 /* allocate enough for a simple encoding without
8909 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008910 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008911 if (res == NULL)
8912 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008913 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008916 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008917 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008919 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 if (x==enc_EXCEPTION) /* error */
8921 goto onError;
8922 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008923 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008925 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 &res, &respos)) {
8927 goto onError;
8928 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008929 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 else
8931 /* done with this character => adjust input position */
8932 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008935 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008936 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008937 if (_PyBytes_Resize(&res, respos) < 0)
8938 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008939
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008940 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008941 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008942 return res;
8943
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008945 Py_XDECREF(res);
8946 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008947 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948 return NULL;
8949}
8950
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008951/* Deprecated */
8952PyObject *
8953PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8954 Py_ssize_t size,
8955 PyObject *mapping,
8956 const char *errors)
8957{
8958 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008959 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008960 if (unicode == NULL)
8961 return NULL;
8962 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8963 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008964 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008965}
8966
Alexander Belopolsky40018472011-02-26 01:02:56 +00008967PyObject *
8968PyUnicode_AsCharmapString(PyObject *unicode,
8969 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970{
8971 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 PyErr_BadArgument();
8973 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008975 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976}
8977
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008978/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008979static void
8980make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008982 Py_ssize_t startpos, Py_ssize_t endpos,
8983 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008985 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008986 *exceptionObject = _PyUnicodeTranslateError_Create(
8987 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988 }
8989 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8991 goto onError;
8992 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8993 goto onError;
8994 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8995 goto onError;
8996 return;
8997 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008998 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999 }
9000}
9001
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009002/* error handling callback helper:
9003 build arguments, call the callback and check the arguments,
9004 put the result into newpos and return the replacement string, which
9005 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009006static PyObject *
9007unicode_translate_call_errorhandler(const char *errors,
9008 PyObject **errorHandler,
9009 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009011 Py_ssize_t startpos, Py_ssize_t endpos,
9012 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009013{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009014 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009015
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009016 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009017 PyObject *restuple;
9018 PyObject *resunicode;
9019
9020 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009021 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009022 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009024 }
9025
9026 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009028 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009030
Petr Viktorinffd97532020-02-11 17:46:57 +01009031 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009032 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009033 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009034 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009035 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 Py_DECREF(restuple);
9037 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009038 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009039 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 &resunicode, &i_newpos)) {
9041 Py_DECREF(restuple);
9042 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009043 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009044 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009046 else
9047 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009049 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009050 Py_DECREF(restuple);
9051 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009052 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009053 Py_INCREF(resunicode);
9054 Py_DECREF(restuple);
9055 return resunicode;
9056}
9057
9058/* Lookup the character ch in the mapping and put the result in result,
9059 which must be decrefed by the caller.
9060 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009061static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009063{
Christian Heimes217cfd12007-12-02 14:31:20 +00009064 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009065 PyObject *x;
9066
9067 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009069 x = PyObject_GetItem(mapping, w);
9070 Py_DECREF(w);
9071 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009072 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9073 /* No mapping found means: use 1:1 mapping. */
9074 PyErr_Clear();
9075 *result = NULL;
9076 return 0;
9077 } else
9078 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009079 }
9080 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009081 *result = x;
9082 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009083 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009084 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009086 if (value < 0 || value > MAX_UNICODE) {
9087 PyErr_Format(PyExc_ValueError,
9088 "character mapping must be in range(0x%x)",
9089 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 Py_DECREF(x);
9091 return -1;
9092 }
9093 *result = x;
9094 return 0;
9095 }
9096 else if (PyUnicode_Check(x)) {
9097 *result = x;
9098 return 0;
9099 }
9100 else {
9101 /* wrong return value */
9102 PyErr_SetString(PyExc_TypeError,
9103 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009104 Py_DECREF(x);
9105 return -1;
9106 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009107}
Victor Stinner1194ea02014-04-04 19:37:40 +02009108
9109/* lookup the character, write the result into the writer.
9110 Return 1 if the result was written into the writer, return 0 if the mapping
9111 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009112static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009113charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9114 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009115{
Victor Stinner1194ea02014-04-04 19:37:40 +02009116 PyObject *item;
9117
9118 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009120
9121 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009123 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009126 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009127 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009128
9129 if (item == Py_None) {
9130 Py_DECREF(item);
9131 return 0;
9132 }
9133
9134 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009135 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9136 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9137 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009138 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9139 Py_DECREF(item);
9140 return -1;
9141 }
9142 Py_DECREF(item);
9143 return 1;
9144 }
9145
9146 if (!PyUnicode_Check(item)) {
9147 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009148 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009149 }
9150
9151 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9152 Py_DECREF(item);
9153 return -1;
9154 }
9155
9156 Py_DECREF(item);
9157 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009158}
9159
Victor Stinner89a76ab2014-04-05 11:44:04 +02009160static int
9161unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9162 Py_UCS1 *translate)
9163{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009164 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009165 int ret = 0;
9166
Victor Stinner89a76ab2014-04-05 11:44:04 +02009167 if (charmaptranslate_lookup(ch, mapping, &item)) {
9168 return -1;
9169 }
9170
9171 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009172 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009173 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009174 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009175 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009176 /* not found => default to 1:1 mapping */
9177 translate[ch] = ch;
9178 return 1;
9179 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009180 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009181 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009182 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9183 used it */
9184 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009185 /* invalid character or character outside ASCII:
9186 skip the fast translate */
9187 goto exit;
9188 }
9189 translate[ch] = (Py_UCS1)replace;
9190 }
9191 else if (PyUnicode_Check(item)) {
9192 Py_UCS4 replace;
9193
9194 if (PyUnicode_READY(item) == -1) {
9195 Py_DECREF(item);
9196 return -1;
9197 }
9198 if (PyUnicode_GET_LENGTH(item) != 1)
9199 goto exit;
9200
9201 replace = PyUnicode_READ_CHAR(item, 0);
9202 if (replace > 127)
9203 goto exit;
9204 translate[ch] = (Py_UCS1)replace;
9205 }
9206 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009207 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009208 goto exit;
9209 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009210 ret = 1;
9211
Benjamin Peterson1365de72014-04-07 20:15:41 -04009212 exit:
9213 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009214 return ret;
9215}
9216
9217/* Fast path for ascii => ascii translation. Return 1 if the whole string
9218 was translated into writer, return 0 if the input string was partially
9219 translated into writer, raise an exception and return -1 on error. */
9220static int
9221unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009222 _PyUnicodeWriter *writer, int ignore,
9223 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009224{
Victor Stinner872b2912014-04-05 14:27:07 +02009225 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009226 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009227 const Py_UCS1 *in, *end;
9228 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009229 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009230
Victor Stinner89a76ab2014-04-05 11:44:04 +02009231 len = PyUnicode_GET_LENGTH(input);
9232
Victor Stinner872b2912014-04-05 14:27:07 +02009233 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009234
9235 in = PyUnicode_1BYTE_DATA(input);
9236 end = in + len;
9237
9238 assert(PyUnicode_IS_ASCII(writer->buffer));
9239 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9240 out = PyUnicode_1BYTE_DATA(writer->buffer);
9241
Victor Stinner872b2912014-04-05 14:27:07 +02009242 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009243 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009244 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009245 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009246 int translate = unicode_fast_translate_lookup(mapping, ch,
9247 ascii_table);
9248 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009249 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009250 if (translate == 0)
9251 goto exit;
9252 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009253 }
Victor Stinner872b2912014-04-05 14:27:07 +02009254 if (ch2 == 0xfe) {
9255 if (ignore)
9256 continue;
9257 goto exit;
9258 }
9259 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009260 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009261 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009262 }
Victor Stinner872b2912014-04-05 14:27:07 +02009263 res = 1;
9264
9265exit:
9266 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009267 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009268 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009269}
9270
Victor Stinner3222da22015-10-01 22:07:32 +02009271static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272_PyUnicode_TranslateCharmap(PyObject *input,
9273 PyObject *mapping,
9274 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009277 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 Py_ssize_t size, i;
9279 int kind;
9280 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009281 _PyUnicodeWriter writer;
9282 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009283 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009284 PyObject *errorHandler = NULL;
9285 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009286 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009287 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009288
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009290 PyErr_BadArgument();
9291 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 if (PyUnicode_READY(input) == -1)
9295 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009296 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 kind = PyUnicode_KIND(input);
9298 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009300 if (size == 0)
9301 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009303 /* allocate enough for a simple 1:1 translation without
9304 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009305 _PyUnicodeWriter_Init(&writer);
9306 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009307 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308
Victor Stinner872b2912014-04-05 14:27:07 +02009309 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9310
Victor Stinner33798672016-03-01 21:59:58 +01009311 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009312 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009313 if (PyUnicode_IS_ASCII(input)) {
9314 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9315 if (res < 0) {
9316 _PyUnicodeWriter_Dealloc(&writer);
9317 return NULL;
9318 }
9319 if (res == 1)
9320 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009321 }
Victor Stinner33798672016-03-01 21:59:58 +01009322 else {
9323 i = 0;
9324 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009326 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009327 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009328 int translate;
9329 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9330 Py_ssize_t newpos;
9331 /* startpos for collecting untranslatable chars */
9332 Py_ssize_t collstart;
9333 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009334 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009335
Victor Stinner1194ea02014-04-04 19:37:40 +02009336 ch = PyUnicode_READ(kind, data, i);
9337 translate = charmaptranslate_output(ch, mapping, &writer);
9338 if (translate < 0)
9339 goto onError;
9340
9341 if (translate != 0) {
9342 /* it worked => adjust input pointer */
9343 ++i;
9344 continue;
9345 }
9346
9347 /* untranslatable character */
9348 collstart = i;
9349 collend = i+1;
9350
9351 /* find all untranslatable characters */
9352 while (collend < size) {
9353 PyObject *x;
9354 ch = PyUnicode_READ(kind, data, collend);
9355 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009356 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009357 Py_XDECREF(x);
9358 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009359 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009360 ++collend;
9361 }
9362
9363 if (ignore) {
9364 i = collend;
9365 }
9366 else {
9367 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9368 reason, input, &exc,
9369 collstart, collend, &newpos);
9370 if (repunicode == NULL)
9371 goto onError;
9372 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009373 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009374 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009375 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009376 Py_DECREF(repunicode);
9377 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009378 }
9379 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009380 Py_XDECREF(exc);
9381 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009382 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383
Benjamin Peterson29060642009-01-31 22:14:21 +00009384 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009385 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009386 Py_XDECREF(exc);
9387 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388 return NULL;
9389}
9390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391/* Deprecated. Use PyUnicode_Translate instead. */
9392PyObject *
9393PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9394 Py_ssize_t size,
9395 PyObject *mapping,
9396 const char *errors)
9397{
Christian Heimes5f520f42012-09-11 14:03:25 +02009398 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009399 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 if (!unicode)
9401 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009402 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9403 Py_DECREF(unicode);
9404 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405}
9406
Alexander Belopolsky40018472011-02-26 01:02:56 +00009407PyObject *
9408PyUnicode_Translate(PyObject *str,
9409 PyObject *mapping,
9410 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009411{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009412 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009413 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009414 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415}
Tim Petersced69f82003-09-16 20:30:58 +00009416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417PyObject *
9418_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9419{
9420 if (!PyUnicode_Check(unicode)) {
9421 PyErr_BadInternalCall();
9422 return NULL;
9423 }
9424 if (PyUnicode_READY(unicode) == -1)
9425 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009426 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 /* If the string is already ASCII, just return the same string */
9428 Py_INCREF(unicode);
9429 return unicode;
9430 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009431
9432 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9433 PyObject *result = PyUnicode_New(len, 127);
9434 if (result == NULL) {
9435 return NULL;
9436 }
9437
9438 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9439 int kind = PyUnicode_KIND(unicode);
9440 const void *data = PyUnicode_DATA(unicode);
9441 Py_ssize_t i;
9442 for (i = 0; i < len; ++i) {
9443 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9444 if (ch < 127) {
9445 out[i] = ch;
9446 }
9447 else if (Py_UNICODE_ISSPACE(ch)) {
9448 out[i] = ' ';
9449 }
9450 else {
9451 int decimal = Py_UNICODE_TODECIMAL(ch);
9452 if (decimal < 0) {
9453 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009454 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009455 _PyUnicode_LENGTH(result) = i + 1;
9456 break;
9457 }
9458 out[i] = '0' + decimal;
9459 }
9460 }
9461
INADA Naoki16dfca42018-07-14 12:06:43 +09009462 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009463 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464}
9465
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009466PyObject *
9467PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9468 Py_ssize_t length)
9469{
Victor Stinnerf0124502011-11-21 23:12:56 +01009470 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009471 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009472 Py_UCS4 maxchar;
9473 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009474 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009475
Victor Stinner99d7ad02012-02-22 13:37:39 +01009476 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009477 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009478 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009479 if (ch > 127) {
9480 int decimal = Py_UNICODE_TODECIMAL(ch);
9481 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009482 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009483 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009484 }
9485 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009486
9487 /* Copy to a new string */
9488 decimal = PyUnicode_New(length, maxchar);
9489 if (decimal == NULL)
9490 return decimal;
9491 kind = PyUnicode_KIND(decimal);
9492 data = PyUnicode_DATA(decimal);
9493 /* Iterate over code points */
9494 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009495 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009496 if (ch > 127) {
9497 int decimal = Py_UNICODE_TODECIMAL(ch);
9498 if (decimal >= 0)
9499 ch = '0' + decimal;
9500 }
9501 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009503 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009504}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009505/* --- Decimal Encoder ---------------------------------------------------- */
9506
Alexander Belopolsky40018472011-02-26 01:02:56 +00009507int
9508PyUnicode_EncodeDecimal(Py_UNICODE *s,
9509 Py_ssize_t length,
9510 char *output,
9511 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009512{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009513 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009514 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009515 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009516 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009517
9518 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009519 PyErr_BadArgument();
9520 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009521 }
9522
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009523 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009524 if (unicode == NULL)
9525 return -1;
9526
Victor Stinner42bf7752011-11-21 22:52:58 +01009527 kind = PyUnicode_KIND(unicode);
9528 data = PyUnicode_DATA(unicode);
9529
Victor Stinnerb84d7232011-11-22 01:50:07 +01009530 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009531 PyObject *exc;
9532 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009534 Py_ssize_t startpos;
9535
9536 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009537
Benjamin Peterson29060642009-01-31 22:14:21 +00009538 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009539 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009540 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009541 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009542 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009543 decimal = Py_UNICODE_TODECIMAL(ch);
9544 if (decimal >= 0) {
9545 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009546 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009547 continue;
9548 }
9549 if (0 < ch && ch < 256) {
9550 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009551 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009552 continue;
9553 }
Victor Stinner6345be92011-11-25 20:09:01 +01009554
Victor Stinner42bf7752011-11-21 22:52:58 +01009555 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009556 exc = NULL;
9557 raise_encode_exception(&exc, "decimal", unicode,
9558 startpos, startpos+1,
9559 "invalid decimal Unicode string");
9560 Py_XDECREF(exc);
9561 Py_DECREF(unicode);
9562 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009563 }
9564 /* 0-terminate the output string */
9565 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009566 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009567 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009568}
9569
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570/* --- Helpers ------------------------------------------------------------ */
9571
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is only read (and
   may be evaluated more than once).  An `end` beyond `len` is clamped to
   `len`.  Negative `start`/`end` have `len` added once and are clamped to
   0 if still negative.  A `start` greater than `len` is left unchanged.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (safe under an unbraced if/else); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009587static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009588any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009589 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009590 Py_ssize_t end,
9591 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009593 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009594 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 Py_ssize_t len1, len2, result;
9596
9597 kind1 = PyUnicode_KIND(s1);
9598 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009599 if (kind1 < kind2)
9600 return -1;
9601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602 len1 = PyUnicode_GET_LENGTH(s1);
9603 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009604 ADJUST_INDICES(start, end, len1);
9605 if (end - start < len2)
9606 return -1;
9607
9608 buf1 = PyUnicode_DATA(s1);
9609 buf2 = PyUnicode_DATA(s2);
9610 if (len2 == 1) {
9611 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9612 result = findchar((const char *)buf1 + kind1*start,
9613 kind1, end - start, ch, direction);
9614 if (result == -1)
9615 return -1;
9616 else
9617 return start + result;
9618 }
9619
9620 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009621 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009622 if (!buf2)
9623 return -2;
9624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625
Victor Stinner794d5672011-10-10 03:21:36 +02009626 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009627 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009628 case PyUnicode_1BYTE_KIND:
9629 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9630 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9631 else
9632 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9633 break;
9634 case PyUnicode_2BYTE_KIND:
9635 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9636 break;
9637 case PyUnicode_4BYTE_KIND:
9638 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9639 break;
9640 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009641 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009642 }
9643 }
9644 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009645 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009646 case PyUnicode_1BYTE_KIND:
9647 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9648 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9649 else
9650 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9651 break;
9652 case PyUnicode_2BYTE_KIND:
9653 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9654 break;
9655 case PyUnicode_4BYTE_KIND:
9656 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9657 break;
9658 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009659 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 }
9662
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009663 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009664 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009665 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666
9667 return result;
9668}
9669
Victor Stinner59423e32018-11-26 13:40:01 +01009670/* _PyUnicode_InsertThousandsGrouping() helper functions */
9671#include "stringlib/localeutil.h"
9672
9673/**
9674 * InsertThousandsGrouping:
9675 * @writer: Unicode writer.
9676 * @n_buffer: Number of characters in @buffer.
9677 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9678 * @d_pos: Start of digits string.
9679 * @n_digits: The number of digits in the string, in which we want
9680 * to put the grouping chars.
9681 * @min_width: The minimum width of the digits in the output string.
9682 * Output will be zero-padded on the left to fill.
9683 * @grouping: see definition in localeconv().
9684 * @thousands_sep: see definition in localeconv().
9685 *
9686 * There are 2 modes: counting and filling. If @writer is NULL,
9687 * we are in counting mode, else filling mode.
9688 * If counting, the required buffer size is returned.
9689 * If filling, we know the buffer will be large enough, so we don't
9690 * need to pass in the buffer size.
9691 * Inserts thousand grouping characters (as defined by grouping and
9692 * thousands_sep) into @writer.
9693 *
9694 * Return value: -1 on error, number of characters otherwise.
9695 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009697_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009698 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009699 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009700 PyObject *digits,
9701 Py_ssize_t d_pos,
9702 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009703 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009704 const char *grouping,
9705 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009706 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707{
Xtreak3f7983a2019-01-07 20:39:14 +05309708 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009709 if (writer) {
9710 assert(digits != NULL);
9711 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009712 }
9713 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009714 assert(digits == NULL);
9715 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009716 }
Victor Stinner59423e32018-11-26 13:40:01 +01009717 assert(0 <= d_pos);
9718 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009719 assert(grouping != NULL);
9720
9721 if (digits != NULL) {
9722 if (PyUnicode_READY(digits) == -1) {
9723 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009724 }
Victor Stinner59423e32018-11-26 13:40:01 +01009725 }
9726 if (PyUnicode_READY(thousands_sep) == -1) {
9727 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009728 }
9729
Victor Stinner59423e32018-11-26 13:40:01 +01009730 Py_ssize_t count = 0;
9731 Py_ssize_t n_zeros;
9732 int loop_broken = 0;
9733 int use_separator = 0; /* First time through, don't append the
9734 separator. They only go between
9735 groups. */
9736 Py_ssize_t buffer_pos;
9737 Py_ssize_t digits_pos;
9738 Py_ssize_t len;
9739 Py_ssize_t n_chars;
9740 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9741 be looked at */
9742 /* A generator that returns all of the grouping widths, until it
9743 returns 0. */
9744 GroupGenerator groupgen;
9745 GroupGenerator_init(&groupgen, grouping);
9746 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9747
9748 /* if digits are not grouped, thousands separator
9749 should be an empty string */
9750 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9751
9752 digits_pos = d_pos + n_digits;
9753 if (writer) {
9754 buffer_pos = writer->pos + n_buffer;
9755 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9756 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009757 }
Victor Stinner59423e32018-11-26 13:40:01 +01009758 else {
9759 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009760 }
Victor Stinner59423e32018-11-26 13:40:01 +01009761
9762 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009763 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009764 }
Victor Stinner59423e32018-11-26 13:40:01 +01009765
9766 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9767 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9768 n_zeros = Py_MAX(0, len - remaining);
9769 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9770
9771 /* Use n_zero zero's and n_chars chars */
9772
9773 /* Count only, don't do anything. */
9774 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9775
9776 /* Copy into the writer. */
9777 InsertThousandsGrouping_fill(writer, &buffer_pos,
9778 digits, &digits_pos,
9779 n_chars, n_zeros,
9780 use_separator ? thousands_sep : NULL,
9781 thousands_sep_len, maxchar);
9782
9783 /* Use a separator next time. */
9784 use_separator = 1;
9785
9786 remaining -= n_chars;
9787 min_width -= len;
9788
9789 if (remaining <= 0 && min_width <= 0) {
9790 loop_broken = 1;
9791 break;
9792 }
9793 min_width -= thousands_sep_len;
9794 }
9795 if (!loop_broken) {
9796 /* We left the loop without using a break statement. */
9797
9798 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9799 n_zeros = Py_MAX(0, len - remaining);
9800 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9801
9802 /* Use n_zero zero's and n_chars chars */
9803 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9804
9805 /* Copy into the writer. */
9806 InsertThousandsGrouping_fill(writer, &buffer_pos,
9807 digits, &digits_pos,
9808 n_chars, n_zeros,
9809 use_separator ? thousands_sep : NULL,
9810 thousands_sep_len, maxchar);
9811 }
9812 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813}
9814
9815
Alexander Belopolsky40018472011-02-26 01:02:56 +00009816Py_ssize_t
9817PyUnicode_Count(PyObject *str,
9818 PyObject *substr,
9819 Py_ssize_t start,
9820 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009822 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009823 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009824 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009826
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009827 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009828 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009829
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009830 kind1 = PyUnicode_KIND(str);
9831 kind2 = PyUnicode_KIND(substr);
9832 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009833 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009834
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009835 len1 = PyUnicode_GET_LENGTH(str);
9836 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009838 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009839 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009840
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009841 buf1 = PyUnicode_DATA(str);
9842 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009843 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009844 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009845 if (!buf2)
9846 goto onError;
9847 }
9848
9849 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009851 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009852 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009853 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009854 buf2, len2, PY_SSIZE_T_MAX
9855 );
9856 else
9857 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009858 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009859 buf2, len2, PY_SSIZE_T_MAX
9860 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 break;
9862 case PyUnicode_2BYTE_KIND:
9863 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009864 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 buf2, len2, PY_SSIZE_T_MAX
9866 );
9867 break;
9868 case PyUnicode_4BYTE_KIND:
9869 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009870 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 buf2, len2, PY_SSIZE_T_MAX
9872 );
9873 break;
9874 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009875 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009877
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009878 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009879 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009880 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009884 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9885 if (kind2 != kind1)
9886 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888}
9889
Alexander Belopolsky40018472011-02-26 01:02:56 +00009890Py_ssize_t
9891PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009892 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009893 Py_ssize_t start,
9894 Py_ssize_t end,
9895 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009897 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009898 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009899
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009900 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901}
9902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903Py_ssize_t
9904PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9905 Py_ssize_t start, Py_ssize_t end,
9906 int direction)
9907{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009909 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 if (PyUnicode_READY(str) == -1)
9911 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009912 len = PyUnicode_GET_LENGTH(str);
9913 ADJUST_INDICES(start, end, len);
9914 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009915 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009917 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9918 kind, end-start, ch, direction);
9919 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009921 else
9922 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923}
9924
Alexander Belopolsky40018472011-02-26 01:02:56 +00009925static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009926tailmatch(PyObject *self,
9927 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009928 Py_ssize_t start,
9929 Py_ssize_t end,
9930 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 int kind_self;
9933 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009934 const void *data_self;
9935 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 Py_ssize_t offset;
9937 Py_ssize_t i;
9938 Py_ssize_t end_sub;
9939
9940 if (PyUnicode_READY(self) == -1 ||
9941 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009942 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9945 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009947 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009949 if (PyUnicode_GET_LENGTH(substring) == 0)
9950 return 1;
9951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 kind_self = PyUnicode_KIND(self);
9953 data_self = PyUnicode_DATA(self);
9954 kind_sub = PyUnicode_KIND(substring);
9955 data_sub = PyUnicode_DATA(substring);
9956 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9957
9958 if (direction > 0)
9959 offset = end;
9960 else
9961 offset = start;
9962
9963 if (PyUnicode_READ(kind_self, data_self, offset) ==
9964 PyUnicode_READ(kind_sub, data_sub, 0) &&
9965 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9966 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9967 /* If both are of the same kind, memcmp is sufficient */
9968 if (kind_self == kind_sub) {
9969 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009970 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 data_sub,
9972 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009973 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009975 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 else {
9977 /* We do not need to compare 0 and len(substring)-1 because
9978 the if statement above ensured already that they are equal
9979 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 for (i = 1; i < end_sub; ++i) {
9981 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9982 PyUnicode_READ(kind_sub, data_sub, i))
9983 return 0;
9984 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009985 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987 }
9988
9989 return 0;
9990}
9991
Alexander Belopolsky40018472011-02-26 01:02:56 +00009992Py_ssize_t
9993PyUnicode_Tailmatch(PyObject *str,
9994 PyObject *substr,
9995 Py_ssize_t start,
9996 Py_ssize_t end,
9997 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009998{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009999 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010000 return -1;
Tim Petersced69f82003-09-16 20:30:58 +000010001
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010002 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003}
10004
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010005static PyObject *
10006ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010008 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010009 const char *data = PyUnicode_DATA(self);
10010 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010011 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +000010012
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010013 res = PyUnicode_New(len, 127);
10014 if (res == NULL)
10015 return NULL;
10016 resdata = PyUnicode_DATA(res);
10017 if (lower)
10018 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010020 _Py_bytes_upper(resdata, data, len);
10021 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022}
10023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010025handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010027 Py_ssize_t j;
10028 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010010029 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010030 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +000010031
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010032 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10033
10034 where ! is a negation and \p{xxx} is a character with property xxx.
10035 */
10036 for (j = i - 1; j >= 0; j--) {
10037 c = PyUnicode_READ(kind, data, j);
10038 if (!_PyUnicode_IsCaseIgnorable(c))
10039 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010041 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10042 if (final_sigma) {
10043 for (j = i + 1; j < length; j++) {
10044 c = PyUnicode_READ(kind, data, j);
10045 if (!_PyUnicode_IsCaseIgnorable(c))
10046 break;
10047 }
10048 final_sigma = j == length || !_PyUnicode_IsCased(c);
10049 }
10050 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010051}
10052
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010053static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010054lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010055 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010056{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010057 /* Obscure special case. */
10058 if (c == 0x3A3) {
10059 mapped[0] = handle_capital_sigma(kind, data, length, i);
10060 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010062 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010063}
10064
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010065static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010066do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010067{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010068 Py_ssize_t i, k = 0;
10069 int n_res, j;
10070 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010071
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010072 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010073 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010074 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010075 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010076 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010077 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010078 for (i = 1; i < length; i++) {
10079 c = PyUnicode_READ(kind, data, i);
10080 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10081 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010082 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010083 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010084 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010085 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010086 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087}
10088
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010089static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010090do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010091 Py_ssize_t i, k = 0;
10092
10093 for (i = 0; i < length; i++) {
10094 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10095 int n_res, j;
10096 if (Py_UNICODE_ISUPPER(c)) {
10097 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10098 }
10099 else if (Py_UNICODE_ISLOWER(c)) {
10100 n_res = _PyUnicode_ToUpperFull(c, mapped);
10101 }
10102 else {
10103 n_res = 1;
10104 mapped[0] = c;
10105 }
10106 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010107 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010108 res[k++] = mapped[j];
10109 }
10110 }
10111 return k;
10112}
10113
10114static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010115do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010116 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010118 Py_ssize_t i, k = 0;
10119
10120 for (i = 0; i < length; i++) {
10121 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10122 int n_res, j;
10123 if (lower)
10124 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10125 else
10126 n_res = _PyUnicode_ToUpperFull(c, mapped);
10127 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010128 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010129 res[k++] = mapped[j];
10130 }
10131 }
10132 return k;
10133}
10134
10135static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010136do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010137{
10138 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10139}
10140
10141static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010142do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010143{
10144 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10145}
10146
Benjamin Petersone51757f2012-01-12 21:10:29 -050010147static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010148do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010149{
10150 Py_ssize_t i, k = 0;
10151
10152 for (i = 0; i < length; i++) {
10153 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10154 Py_UCS4 mapped[3];
10155 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10156 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010157 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010158 res[k++] = mapped[j];
10159 }
10160 }
10161 return k;
10162}
10163
10164static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010165do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010166{
10167 Py_ssize_t i, k = 0;
10168 int previous_is_cased;
10169
10170 previous_is_cased = 0;
10171 for (i = 0; i < length; i++) {
10172 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10173 Py_UCS4 mapped[3];
10174 int n_res, j;
10175
10176 if (previous_is_cased)
10177 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10178 else
10179 n_res = _PyUnicode_ToTitleFull(c, mapped);
10180
10181 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010182 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010183 res[k++] = mapped[j];
10184 }
10185
10186 previous_is_cased = _PyUnicode_IsCased(c);
10187 }
10188 return k;
10189}
10190
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010191static PyObject *
10192case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010193 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010194{
10195 PyObject *res = NULL;
10196 Py_ssize_t length, newlength = 0;
10197 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010198 const void *data;
10199 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010200 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10201
Benjamin Petersoneea48462012-01-16 14:28:50 -050010202 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010203
10204 kind = PyUnicode_KIND(self);
10205 data = PyUnicode_DATA(self);
10206 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010207 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010208 PyErr_SetString(PyExc_OverflowError, "string is too long");
10209 return NULL;
10210 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010211 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010212 if (tmp == NULL)
10213 return PyErr_NoMemory();
10214 newlength = perform(kind, data, length, tmp, &maxchar);
10215 res = PyUnicode_New(newlength, maxchar);
10216 if (res == NULL)
10217 goto leave;
10218 tmpend = tmp + newlength;
10219 outdata = PyUnicode_DATA(res);
10220 outkind = PyUnicode_KIND(res);
10221 switch (outkind) {
10222 case PyUnicode_1BYTE_KIND:
10223 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10224 break;
10225 case PyUnicode_2BYTE_KIND:
10226 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10227 break;
10228 case PyUnicode_4BYTE_KIND:
10229 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10230 break;
10231 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010232 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010233 }
10234 leave:
10235 PyMem_FREE(tmp);
10236 return res;
10237}
10238
Tim Peters8ce9f162004-08-27 01:49:32 +000010239PyObject *
10240PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010242 PyObject *res;
10243 PyObject *fseq;
10244 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010245 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010247 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010248 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010249 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010250 }
10251
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010252 /* NOTE: the following code can't call back into Python code,
10253 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010254 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010255
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010256 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010257 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010258 res = _PyUnicode_JoinArray(separator, items, seqlen);
10259 Py_DECREF(fseq);
10260 return res;
10261}
10262
10263PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010264_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010265{
10266 PyObject *res = NULL; /* the result */
10267 PyObject *sep = NULL;
10268 Py_ssize_t seplen;
10269 PyObject *item;
10270 Py_ssize_t sz, i, res_offset;
10271 Py_UCS4 maxchar;
10272 Py_UCS4 item_maxchar;
10273 int use_memcpy;
10274 unsigned char *res_data = NULL, *sep_data = NULL;
10275 PyObject *last_obj;
10276 unsigned int kind = 0;
10277
Tim Peters05eba1f2004-08-27 21:32:02 +000010278 /* If empty sequence, return u"". */
10279 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010280 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010281 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010282
Tim Peters05eba1f2004-08-27 21:32:02 +000010283 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010284 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010285 if (seqlen == 1) {
10286 if (PyUnicode_CheckExact(items[0])) {
10287 res = items[0];
10288 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010289 return res;
10290 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010291 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010292 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010293 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010294 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010295 /* Set up sep and seplen */
10296 if (separator == NULL) {
10297 /* fall back to a blank space separator */
10298 sep = PyUnicode_FromOrdinal(' ');
10299 if (!sep)
10300 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010301 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010302 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010303 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010304 else {
10305 if (!PyUnicode_Check(separator)) {
10306 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010307 "separator: expected str instance,"
10308 " %.80s found",
10309 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010310 goto onError;
10311 }
10312 if (PyUnicode_READY(separator))
10313 goto onError;
10314 sep = separator;
10315 seplen = PyUnicode_GET_LENGTH(separator);
10316 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10317 /* inc refcount to keep this code path symmetric with the
10318 above case of a blank separator */
10319 Py_INCREF(sep);
10320 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010321 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010322 }
10323
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010324 /* There are at least two things to join, or else we have a subclass
10325 * of str in the sequence.
10326 * Do a pre-pass to figure out the total amount of space we'll
10327 * need (sz), and see whether all argument are strings.
10328 */
10329 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010330#ifdef Py_DEBUG
10331 use_memcpy = 0;
10332#else
10333 use_memcpy = 1;
10334#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010335 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010336 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010337 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010338 if (!PyUnicode_Check(item)) {
10339 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010340 "sequence item %zd: expected str instance,"
10341 " %.80s found",
10342 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010343 goto onError;
10344 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 if (PyUnicode_READY(item) == -1)
10346 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010347 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010349 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010350 if (i != 0) {
10351 add_sz += seplen;
10352 }
10353 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010354 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010355 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010356 goto onError;
10357 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010358 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010359 if (use_memcpy && last_obj != NULL) {
10360 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10361 use_memcpy = 0;
10362 }
10363 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010364 }
Tim Petersced69f82003-09-16 20:30:58 +000010365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010367 if (res == NULL)
10368 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010369
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010370 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010371#ifdef Py_DEBUG
10372 use_memcpy = 0;
10373#else
10374 if (use_memcpy) {
10375 res_data = PyUnicode_1BYTE_DATA(res);
10376 kind = PyUnicode_KIND(res);
10377 if (seplen != 0)
10378 sep_data = PyUnicode_1BYTE_DATA(sep);
10379 }
10380#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010381 if (use_memcpy) {
10382 for (i = 0; i < seqlen; ++i) {
10383 Py_ssize_t itemlen;
10384 item = items[i];
10385
10386 /* Copy item, and maybe the separator. */
10387 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010388 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010389 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010390 kind * seplen);
10391 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010392 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010393
10394 itemlen = PyUnicode_GET_LENGTH(item);
10395 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010396 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010397 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010398 kind * itemlen);
10399 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010400 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010401 }
10402 assert(res_data == PyUnicode_1BYTE_DATA(res)
10403 + kind * PyUnicode_GET_LENGTH(res));
10404 }
10405 else {
10406 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10407 Py_ssize_t itemlen;
10408 item = items[i];
10409
10410 /* Copy item, and maybe the separator. */
10411 if (i && seplen != 0) {
10412 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10413 res_offset += seplen;
10414 }
10415
10416 itemlen = PyUnicode_GET_LENGTH(item);
10417 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010418 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010419 res_offset += itemlen;
10420 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010421 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010422 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010423 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010426 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428
Benjamin Peterson29060642009-01-31 22:14:21 +000010429 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010431 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432 return NULL;
10433}
10434
Victor Stinnerd3f08822012-05-29 12:57:52 +020010435void
10436_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10437 Py_UCS4 fill_char)
10438{
10439 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010440 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010441 assert(PyUnicode_IS_READY(unicode));
10442 assert(unicode_modifiable(unicode));
10443 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10444 assert(start >= 0);
10445 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010446 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010447}
10448
Victor Stinner3fe55312012-01-04 00:33:50 +010010449Py_ssize_t
10450PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10451 Py_UCS4 fill_char)
10452{
10453 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010454
10455 if (!PyUnicode_Check(unicode)) {
10456 PyErr_BadInternalCall();
10457 return -1;
10458 }
10459 if (PyUnicode_READY(unicode) == -1)
10460 return -1;
10461 if (unicode_check_modifiable(unicode))
10462 return -1;
10463
Victor Stinnerd3f08822012-05-29 12:57:52 +020010464 if (start < 0) {
10465 PyErr_SetString(PyExc_IndexError, "string index out of range");
10466 return -1;
10467 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010468 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10469 PyErr_SetString(PyExc_ValueError,
10470 "fill character is bigger than "
10471 "the string maximum character");
10472 return -1;
10473 }
10474
10475 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10476 length = Py_MIN(maxlen, length);
10477 if (length <= 0)
10478 return 0;
10479
Victor Stinnerd3f08822012-05-29 12:57:52 +020010480 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010481 return length;
10482}
10483
Victor Stinner9310abb2011-10-05 00:59:23 +020010484static PyObject *
10485pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010486 Py_ssize_t left,
10487 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 PyObject *u;
10491 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010492 int kind;
10493 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494
10495 if (left < 0)
10496 left = 0;
10497 if (right < 0)
10498 right = 0;
10499
Victor Stinnerc4b49542011-12-11 22:44:26 +010010500 if (left == 0 && right == 0)
10501 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10504 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010505 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10506 return NULL;
10507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010509 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010511 if (!u)
10512 return NULL;
10513
10514 kind = PyUnicode_KIND(u);
10515 data = PyUnicode_DATA(u);
10516 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010517 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010518 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010519 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010520 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010521 assert(_PyUnicode_CheckConsistency(u, 1));
10522 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523}
10524
Alexander Belopolsky40018472011-02-26 01:02:56 +000010525PyObject *
10526PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010530 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010531 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532
Benjamin Petersonead6b532011-12-20 17:23:42 -060010533 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010535 if (PyUnicode_IS_ASCII(string))
10536 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010537 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010538 PyUnicode_GET_LENGTH(string), keepends);
10539 else
10540 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010541 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010542 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 break;
10544 case PyUnicode_2BYTE_KIND:
10545 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010546 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 PyUnicode_GET_LENGTH(string), keepends);
10548 break;
10549 case PyUnicode_4BYTE_KIND:
10550 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010551 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 PyUnicode_GET_LENGTH(string), keepends);
10553 break;
10554 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010555 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010558}
10559
Alexander Belopolsky40018472011-02-26 01:02:56 +000010560static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010561split(PyObject *self,
10562 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010563 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010564{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010565 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010566 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 Py_ssize_t len1, len2;
10568 PyObject* out;
10569
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010571 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 if (PyUnicode_READY(self) == -1)
10574 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010577 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010579 if (PyUnicode_IS_ASCII(self))
10580 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010581 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010582 PyUnicode_GET_LENGTH(self), maxcount
10583 );
10584 else
10585 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010586 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010587 PyUnicode_GET_LENGTH(self), maxcount
10588 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 case PyUnicode_2BYTE_KIND:
10590 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010591 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 PyUnicode_GET_LENGTH(self), maxcount
10593 );
10594 case PyUnicode_4BYTE_KIND:
10595 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010596 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 PyUnicode_GET_LENGTH(self), maxcount
10598 );
10599 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010600 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 }
10602
10603 if (PyUnicode_READY(substring) == -1)
10604 return NULL;
10605
10606 kind1 = PyUnicode_KIND(self);
10607 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 len1 = PyUnicode_GET_LENGTH(self);
10609 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010610 if (kind1 < kind2 || len1 < len2) {
10611 out = PyList_New(1);
10612 if (out == NULL)
10613 return NULL;
10614 Py_INCREF(self);
10615 PyList_SET_ITEM(out, 0, self);
10616 return out;
10617 }
10618 buf1 = PyUnicode_DATA(self);
10619 buf2 = PyUnicode_DATA(substring);
10620 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010621 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010622 if (!buf2)
10623 return NULL;
10624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010626 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010628 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10629 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010630 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010631 else
10632 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010633 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 break;
10635 case PyUnicode_2BYTE_KIND:
10636 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010637 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 break;
10639 case PyUnicode_4BYTE_KIND:
10640 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010641 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 break;
10643 default:
10644 out = NULL;
10645 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010646 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010647 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010648 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010650}
10651
Alexander Belopolsky40018472011-02-26 01:02:56 +000010652static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010653rsplit(PyObject *self,
10654 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010655 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010656{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010657 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010658 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 Py_ssize_t len1, len2;
10660 PyObject* out;
10661
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010662 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010663 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 if (PyUnicode_READY(self) == -1)
10666 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010669 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010671 if (PyUnicode_IS_ASCII(self))
10672 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010673 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010674 PyUnicode_GET_LENGTH(self), maxcount
10675 );
10676 else
10677 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010678 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010679 PyUnicode_GET_LENGTH(self), maxcount
10680 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 case PyUnicode_2BYTE_KIND:
10682 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010683 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 PyUnicode_GET_LENGTH(self), maxcount
10685 );
10686 case PyUnicode_4BYTE_KIND:
10687 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010688 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 PyUnicode_GET_LENGTH(self), maxcount
10690 );
10691 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010692 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 }
10694
10695 if (PyUnicode_READY(substring) == -1)
10696 return NULL;
10697
10698 kind1 = PyUnicode_KIND(self);
10699 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 len1 = PyUnicode_GET_LENGTH(self);
10701 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010702 if (kind1 < kind2 || len1 < len2) {
10703 out = PyList_New(1);
10704 if (out == NULL)
10705 return NULL;
10706 Py_INCREF(self);
10707 PyList_SET_ITEM(out, 0, self);
10708 return out;
10709 }
10710 buf1 = PyUnicode_DATA(self);
10711 buf2 = PyUnicode_DATA(substring);
10712 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010713 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010714 if (!buf2)
10715 return NULL;
10716 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010718 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010720 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10721 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010722 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010723 else
10724 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010725 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 break;
10727 case PyUnicode_2BYTE_KIND:
10728 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010729 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 break;
10731 case PyUnicode_4BYTE_KIND:
10732 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010733 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 break;
10735 default:
10736 out = NULL;
10737 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010738 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010739 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010740 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 return out;
10742}
10743
10744static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010745anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10746 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010748 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010750 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10751 return asciilib_find(buf1, len1, buf2, len2, offset);
10752 else
10753 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 case PyUnicode_2BYTE_KIND:
10755 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10756 case PyUnicode_4BYTE_KIND:
10757 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10758 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010759 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760}
10761
10762static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010763anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10764 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010766 switch (kind) {
10767 case PyUnicode_1BYTE_KIND:
10768 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10769 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10770 else
10771 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10772 case PyUnicode_2BYTE_KIND:
10773 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10774 case PyUnicode_4BYTE_KIND:
10775 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10776 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010777 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010778}
10779
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010780static void
10781replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10782 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10783{
10784 int kind = PyUnicode_KIND(u);
10785 void *data = PyUnicode_DATA(u);
10786 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10787 if (kind == PyUnicode_1BYTE_KIND) {
10788 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10789 (Py_UCS1 *)data + len,
10790 u1, u2, maxcount);
10791 }
10792 else if (kind == PyUnicode_2BYTE_KIND) {
10793 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10794 (Py_UCS2 *)data + len,
10795 u1, u2, maxcount);
10796 }
10797 else {
10798 assert(kind == PyUnicode_4BYTE_KIND);
10799 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10800 (Py_UCS4 *)data + len,
10801 u1, u2, maxcount);
10802 }
10803}
10804
Alexander Belopolsky40018472011-02-26 01:02:56 +000010805static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806replace(PyObject *self, PyObject *str1,
10807 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010810 const char *sbuf = PyUnicode_DATA(self);
10811 const void *buf1 = PyUnicode_DATA(str1);
10812 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 int srelease = 0, release1 = 0, release2 = 0;
10814 int skind = PyUnicode_KIND(self);
10815 int kind1 = PyUnicode_KIND(str1);
10816 int kind2 = PyUnicode_KIND(str2);
10817 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10818 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10819 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010820 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010821 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010823 if (slen < len1)
10824 goto nothing;
10825
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010827 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010828 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010829 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830
Victor Stinner59de0ee2011-10-07 10:01:28 +020010831 if (str1 == str2)
10832 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833
Victor Stinner49a0a212011-10-12 23:46:10 +020010834 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010835 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10836 if (maxchar < maxchar_str1)
10837 /* substring too wide to be present */
10838 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010839 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10840 /* Replacing str1 with str2 may cause a maxchar reduction in the
10841 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010842 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010843 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010846 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010848 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010850 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010851 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010852 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010853
Victor Stinner69ed0f42013-04-09 21:48:24 +020010854 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010855 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010856 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010857 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010858 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010860 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010862
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010863 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10864 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010865 }
10866 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 int rkind = skind;
10868 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010869 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010871 if (kind1 < rkind) {
10872 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010873 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 if (!buf1) goto error;
10875 release1 = 1;
10876 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010877 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010878 if (i < 0)
10879 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 if (rkind > kind2) {
10881 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010882 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 if (!buf2) goto error;
10884 release2 = 1;
10885 }
10886 else if (rkind < kind2) {
10887 /* widen self and buf1 */
10888 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010889 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010890 assert(buf1 != PyUnicode_DATA(str1));
10891 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010892 buf1 = PyUnicode_DATA(str1);
10893 release1 = 0;
10894 }
10895 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 if (!sbuf) goto error;
10897 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010898 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010899 if (!buf1) goto error;
10900 release1 = 1;
10901 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010902 u = PyUnicode_New(slen, maxchar);
10903 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010905 assert(PyUnicode_KIND(u) == rkind);
10906 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010907
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010908 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010909 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010910 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010911 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010912 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010913 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010914
10915 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010916 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010917 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010918 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010919 if (i == -1)
10920 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010921 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010923 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010924 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010927 }
10928 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010930 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 int rkind = skind;
10932 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010935 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010936 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 if (!buf1) goto error;
10938 release1 = 1;
10939 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010940 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010941 if (n == 0)
10942 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010944 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010945 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 if (!buf2) goto error;
10947 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010950 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010952 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 if (!sbuf) goto error;
10954 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010955 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010956 assert(buf1 != PyUnicode_DATA(str1));
10957 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010958 buf1 = PyUnicode_DATA(str1);
10959 release1 = 0;
10960 }
10961 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 if (!buf1) goto error;
10963 release1 = 1;
10964 }
10965 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10966 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010967 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 PyErr_SetString(PyExc_OverflowError,
10969 "replace string is too long");
10970 goto error;
10971 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010972 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010973 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020010974 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020010975 goto done;
10976 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010977 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 PyErr_SetString(PyExc_OverflowError,
10979 "replace string is too long");
10980 goto error;
10981 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010982 u = PyUnicode_New(new_size, maxchar);
10983 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010985 assert(PyUnicode_KIND(u) == rkind);
10986 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 ires = i = 0;
10988 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010989 while (n-- > 0) {
10990 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010991 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010992 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010993 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010994 if (j == -1)
10995 break;
10996 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010997 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010998 memcpy(res + rkind * ires,
10999 sbuf + rkind * i,
11000 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011002 }
11003 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011005 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011007 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011010 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011013 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011014 memcpy(res + rkind * ires,
11015 sbuf + rkind * i,
11016 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020011017 }
11018 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011019 /* interleave */
11020 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011021 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011023 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011025 if (--n <= 0)
11026 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011027 memcpy(res + rkind * ires,
11028 sbuf + rkind * i,
11029 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 ires++;
11031 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011032 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011033 memcpy(res + rkind * ires,
11034 sbuf + rkind * i,
11035 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011036 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011037 }
11038
11039 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011040 unicode_adjust_maxchar(&u);
11041 if (u == NULL)
11042 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011044
11045 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011046 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11047 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11048 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011050 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011052 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011054 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011055 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011056 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011057
Benjamin Peterson29060642009-01-31 22:14:21 +000011058 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011059 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011060 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11061 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11062 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011064 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011066 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011068 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011069 return unicode_result_unchanged(self);
11070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011072 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11073 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11074 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11075 if (srelease)
11076 PyMem_FREE((void *)sbuf);
11077 if (release1)
11078 PyMem_FREE((void *)buf1);
11079 if (release2)
11080 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082}
11083
11084/* --- Unicode Object Methods --------------------------------------------- */
11085
INADA Naoki3ae20562017-01-16 20:41:20 +090011086/*[clinic input]
11087str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088
INADA Naoki3ae20562017-01-16 20:41:20 +090011089Return a version of the string where each word is titlecased.
11090
11091More specifically, words start with uppercased characters and all remaining
11092cased characters have lower case.
11093[clinic start generated code]*/
11094
11095static PyObject *
11096unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011097/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011099 if (PyUnicode_READY(self) == -1)
11100 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011101 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011102}
11103
INADA Naoki3ae20562017-01-16 20:41:20 +090011104/*[clinic input]
11105str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106
INADA Naoki3ae20562017-01-16 20:41:20 +090011107Return a capitalized version of the string.
11108
11109More specifically, make the first character have upper case and the rest lower
11110case.
11111[clinic start generated code]*/
11112
11113static PyObject *
11114unicode_capitalize_impl(PyObject *self)
11115/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011117 if (PyUnicode_READY(self) == -1)
11118 return NULL;
11119 if (PyUnicode_GET_LENGTH(self) == 0)
11120 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011121 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122}
11123
INADA Naoki3ae20562017-01-16 20:41:20 +090011124/*[clinic input]
11125str.casefold as unicode_casefold
11126
11127Return a version of the string suitable for caseless comparisons.
11128[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011129
11130static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011131unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011132/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011133{
11134 if (PyUnicode_READY(self) == -1)
11135 return NULL;
11136 if (PyUnicode_IS_ASCII(self))
11137 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011138 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011139}
11140
11141
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011142/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011143
11144static int
11145convert_uc(PyObject *obj, void *addr)
11146{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011148
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011149 if (!PyUnicode_Check(obj)) {
11150 PyErr_Format(PyExc_TypeError,
11151 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011152 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011153 return 0;
11154 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011155 if (PyUnicode_READY(obj) < 0)
11156 return 0;
11157 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011158 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011159 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011160 return 0;
11161 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011162 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011163 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011164}
11165
INADA Naoki3ae20562017-01-16 20:41:20 +090011166/*[clinic input]
11167str.center as unicode_center
11168
11169 width: Py_ssize_t
11170 fillchar: Py_UCS4 = ' '
11171 /
11172
11173Return a centered string of length width.
11174
11175Padding is done using the specified fill character (default is a space).
11176[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177
11178static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011179unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11180/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011182 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183
Benjamin Petersonbac79492012-01-14 13:34:47 -050011184 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185 return NULL;
11186
Victor Stinnerc4b49542011-12-11 22:44:26 +010011187 if (PyUnicode_GET_LENGTH(self) >= width)
11188 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189
Victor Stinnerc4b49542011-12-11 22:44:26 +010011190 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191 left = marg / 2 + (marg & width & 1);
11192
Victor Stinner9310abb2011-10-05 00:59:23 +020011193 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194}
11195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196/* This function assumes that str1 and str2 are readied by the caller. */
11197
Marc-André Lemburge5034372000-08-08 08:04:29 +000011198static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011199unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011200{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011201#define COMPARE(TYPE1, TYPE2) \
11202 do { \
11203 TYPE1* p1 = (TYPE1 *)data1; \
11204 TYPE2* p2 = (TYPE2 *)data2; \
11205 TYPE1* end = p1 + len; \
11206 Py_UCS4 c1, c2; \
11207 for (; p1 != end; p1++, p2++) { \
11208 c1 = *p1; \
11209 c2 = *p2; \
11210 if (c1 != c2) \
11211 return (c1 < c2) ? -1 : 1; \
11212 } \
11213 } \
11214 while (0)
11215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011217 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011218 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 kind1 = PyUnicode_KIND(str1);
11221 kind2 = PyUnicode_KIND(str2);
11222 data1 = PyUnicode_DATA(str1);
11223 data2 = PyUnicode_DATA(str2);
11224 len1 = PyUnicode_GET_LENGTH(str1);
11225 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011226 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011227
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011228 switch(kind1) {
11229 case PyUnicode_1BYTE_KIND:
11230 {
11231 switch(kind2) {
11232 case PyUnicode_1BYTE_KIND:
11233 {
11234 int cmp = memcmp(data1, data2, len);
11235 /* normalize result of memcmp() into the range [-1; 1] */
11236 if (cmp < 0)
11237 return -1;
11238 if (cmp > 0)
11239 return 1;
11240 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011241 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011242 case PyUnicode_2BYTE_KIND:
11243 COMPARE(Py_UCS1, Py_UCS2);
11244 break;
11245 case PyUnicode_4BYTE_KIND:
11246 COMPARE(Py_UCS1, Py_UCS4);
11247 break;
11248 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011249 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011250 }
11251 break;
11252 }
11253 case PyUnicode_2BYTE_KIND:
11254 {
11255 switch(kind2) {
11256 case PyUnicode_1BYTE_KIND:
11257 COMPARE(Py_UCS2, Py_UCS1);
11258 break;
11259 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011260 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011261 COMPARE(Py_UCS2, Py_UCS2);
11262 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011263 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011264 case PyUnicode_4BYTE_KIND:
11265 COMPARE(Py_UCS2, Py_UCS4);
11266 break;
11267 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011268 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011269 }
11270 break;
11271 }
11272 case PyUnicode_4BYTE_KIND:
11273 {
11274 switch(kind2) {
11275 case PyUnicode_1BYTE_KIND:
11276 COMPARE(Py_UCS4, Py_UCS1);
11277 break;
11278 case PyUnicode_2BYTE_KIND:
11279 COMPARE(Py_UCS4, Py_UCS2);
11280 break;
11281 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011282 {
11283#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11284 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11285 /* normalize result of wmemcmp() into the range [-1; 1] */
11286 if (cmp < 0)
11287 return -1;
11288 if (cmp > 0)
11289 return 1;
11290#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011291 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011292#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011293 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011294 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011295 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011296 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011297 }
11298 break;
11299 }
11300 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011301 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011302 }
11303
Victor Stinner770e19e2012-10-04 22:59:45 +020011304 if (len1 == len2)
11305 return 0;
11306 if (len1 < len2)
11307 return -1;
11308 else
11309 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011310
11311#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011312}
11313
Benjamin Peterson621b4302016-09-09 13:54:34 -070011314static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011315unicode_compare_eq(PyObject *str1, PyObject *str2)
11316{
11317 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011318 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011319 Py_ssize_t len;
11320 int cmp;
11321
Victor Stinnere5567ad2012-10-23 02:48:49 +020011322 len = PyUnicode_GET_LENGTH(str1);
11323 if (PyUnicode_GET_LENGTH(str2) != len)
11324 return 0;
11325 kind = PyUnicode_KIND(str1);
11326 if (PyUnicode_KIND(str2) != kind)
11327 return 0;
11328 data1 = PyUnicode_DATA(str1);
11329 data2 = PyUnicode_DATA(str2);
11330
11331 cmp = memcmp(data1, data2, len * kind);
11332 return (cmp == 0);
11333}
11334
11335
Alexander Belopolsky40018472011-02-26 01:02:56 +000011336int
11337PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11340 if (PyUnicode_READY(left) == -1 ||
11341 PyUnicode_READY(right) == -1)
11342 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011343
11344 /* a string is equal to itself */
11345 if (left == right)
11346 return 0;
11347
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011348 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011350 PyErr_Format(PyExc_TypeError,
11351 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011352 Py_TYPE(left)->tp_name,
11353 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354 return -1;
11355}
11356
Martin v. Löwis5b222132007-06-10 09:51:05 +000011357int
11358PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11359{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 Py_ssize_t i;
11361 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011363 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364
Victor Stinner910337b2011-10-03 03:20:16 +020011365 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011366 if (!PyUnicode_IS_READY(uni)) {
11367 const wchar_t *ws = _PyUnicode_WSTR(uni);
11368 /* Compare Unicode string and source character set string */
11369 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11370 if (chr != ustr[i])
11371 return (chr < ustr[i]) ? -1 : 1;
11372 }
11373 /* This check keeps Python strings that end in '\0' from comparing equal
11374 to C strings identical up to that point. */
11375 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11376 return 1; /* uni is longer */
11377 if (ustr[i])
11378 return -1; /* str is longer */
11379 return 0;
11380 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011382 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011383 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011384 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011385 size_t len, len2 = strlen(str);
11386 int cmp;
11387
11388 len = Py_MIN(len1, len2);
11389 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011390 if (cmp != 0) {
11391 if (cmp < 0)
11392 return -1;
11393 else
11394 return 1;
11395 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011396 if (len1 > len2)
11397 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011398 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011399 return -1; /* str is longer */
11400 return 0;
11401 }
11402 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011403 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011404 /* Compare Unicode string and source character set string */
11405 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011406 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011407 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11408 /* This check keeps Python strings that end in '\0' from comparing equal
11409 to C strings identical up to that point. */
11410 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11411 return 1; /* uni is longer */
11412 if (str[i])
11413 return -1; /* str is longer */
11414 return 0;
11415 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011416}
11417
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011418static int
11419non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11420{
11421 size_t i, len;
11422 const wchar_t *p;
11423 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11424 if (strlen(str) != len)
11425 return 0;
11426 p = _PyUnicode_WSTR(unicode);
11427 assert(p);
11428 for (i = 0; i < len; i++) {
11429 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011430 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011431 return 0;
11432 }
11433 return 1;
11434}
11435
11436int
11437_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11438{
11439 size_t len;
11440 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011441 assert(str);
11442#ifndef NDEBUG
11443 for (const char *p = str; *p; p++) {
11444 assert((unsigned char)*p < 128);
11445 }
11446#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011447 if (PyUnicode_READY(unicode) == -1) {
11448 /* Memory error or bad data */
11449 PyErr_Clear();
11450 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11451 }
11452 if (!PyUnicode_IS_ASCII(unicode))
11453 return 0;
11454 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11455 return strlen(str) == len &&
11456 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11457}
11458
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011459int
11460_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11461{
11462 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011463
11464 assert(_PyUnicode_CHECK(left));
11465 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011466#ifndef NDEBUG
11467 for (const char *p = right->string; *p; p++) {
11468 assert((unsigned char)*p < 128);
11469 }
11470#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011471
11472 if (PyUnicode_READY(left) == -1) {
11473 /* memory error or bad data */
11474 PyErr_Clear();
11475 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11476 }
11477
11478 if (!PyUnicode_IS_ASCII(left))
11479 return 0;
11480
11481 right_uni = _PyUnicode_FromId(right); /* borrowed */
11482 if (right_uni == NULL) {
11483 /* memory error or bad data */
11484 PyErr_Clear();
11485 return _PyUnicode_EqualToASCIIString(left, right->string);
11486 }
11487
11488 if (left == right_uni)
11489 return 1;
11490
11491 if (PyUnicode_CHECK_INTERNED(left))
11492 return 0;
11493
Victor Stinner607b1022020-05-05 18:50:30 +020011494#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011495 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011496 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011497 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11498 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011499#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011500
11501 return unicode_compare_eq(left, right_uni);
11502}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011503
Alexander Belopolsky40018472011-02-26 01:02:56 +000011504PyObject *
11505PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011506{
11507 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011508
Victor Stinnere5567ad2012-10-23 02:48:49 +020011509 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11510 Py_RETURN_NOTIMPLEMENTED;
11511
11512 if (PyUnicode_READY(left) == -1 ||
11513 PyUnicode_READY(right) == -1)
11514 return NULL;
11515
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011516 if (left == right) {
11517 switch (op) {
11518 case Py_EQ:
11519 case Py_LE:
11520 case Py_GE:
11521 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011522 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011523 case Py_NE:
11524 case Py_LT:
11525 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011526 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011527 default:
11528 PyErr_BadArgument();
11529 return NULL;
11530 }
11531 }
11532 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011533 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011534 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011535 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011536 }
11537 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011538 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011539 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011540 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011541}
11542
Alexander Belopolsky40018472011-02-26 01:02:56 +000011543int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011544_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11545{
11546 return unicode_eq(aa, bb);
11547}
11548
11549int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011550PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011551{
Victor Stinner77282cb2013-04-14 19:22:47 +020011552 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011553 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011555 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011556
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011557 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011559 "'in <string>' requires string as left operand, not %.100s",
11560 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011561 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011562 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011563 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011564 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011565 if (ensure_unicode(str) < 0)
11566 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011568 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011569 kind2 = PyUnicode_KIND(substr);
11570 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011571 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011573 len2 = PyUnicode_GET_LENGTH(substr);
11574 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011575 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011576 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011577 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011578 if (len2 == 1) {
11579 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11580 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011581 return result;
11582 }
11583 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011584 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011585 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011586 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011588
Victor Stinner77282cb2013-04-14 19:22:47 +020011589 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 case PyUnicode_1BYTE_KIND:
11591 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11592 break;
11593 case PyUnicode_2BYTE_KIND:
11594 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11595 break;
11596 case PyUnicode_4BYTE_KIND:
11597 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11598 break;
11599 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011600 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011602
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011603 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011604 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011605 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606
Guido van Rossum403d68b2000-03-13 15:55:09 +000011607 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011608}
11609
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610/* Concat to string or Unicode object giving a new Unicode object. */
11611
Alexander Belopolsky40018472011-02-26 01:02:56 +000011612PyObject *
11613PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011615 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011616 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011617 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011619 if (ensure_unicode(left) < 0)
11620 return NULL;
11621
11622 if (!PyUnicode_Check(right)) {
11623 PyErr_Format(PyExc_TypeError,
11624 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011625 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011626 return NULL;
11627 }
11628 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011629 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630
11631 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011632 PyObject *empty = unicode_get_empty(); // Borrowed reference
11633 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011634 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011635 }
11636 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011637 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011638 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011640 left_len = PyUnicode_GET_LENGTH(left);
11641 right_len = PyUnicode_GET_LENGTH(right);
11642 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011643 PyErr_SetString(PyExc_OverflowError,
11644 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011645 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011646 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011647 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011648
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011649 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11650 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011651 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011652
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011654 result = PyUnicode_New(new_len, maxchar);
11655 if (result == NULL)
11656 return NULL;
11657 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11658 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11659 assert(_PyUnicode_CheckConsistency(result, 1));
11660 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661}
11662
Walter Dörwald1ab83302007-05-18 17:15:44 +000011663void
Victor Stinner23e56682011-10-03 03:54:37 +020011664PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011665{
Victor Stinner23e56682011-10-03 03:54:37 +020011666 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011667 Py_UCS4 maxchar, maxchar2;
11668 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011669
11670 if (p_left == NULL) {
11671 if (!PyErr_Occurred())
11672 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011673 return;
11674 }
Victor Stinner23e56682011-10-03 03:54:37 +020011675 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011676 if (right == NULL || left == NULL
11677 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011678 if (!PyErr_Occurred())
11679 PyErr_BadInternalCall();
11680 goto error;
11681 }
11682
Benjamin Petersonbac79492012-01-14 13:34:47 -050011683 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011684 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011685 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011686 goto error;
11687
Victor Stinner488fa492011-12-12 00:01:39 +010011688 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011689 PyObject *empty = unicode_get_empty(); // Borrowed reference
11690 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011691 Py_DECREF(left);
11692 Py_INCREF(right);
11693 *p_left = right;
11694 return;
11695 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011696 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011697 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011698 }
Victor Stinner488fa492011-12-12 00:01:39 +010011699
11700 left_len = PyUnicode_GET_LENGTH(left);
11701 right_len = PyUnicode_GET_LENGTH(right);
11702 if (left_len > PY_SSIZE_T_MAX - right_len) {
11703 PyErr_SetString(PyExc_OverflowError,
11704 "strings are too large to concat");
11705 goto error;
11706 }
11707 new_len = left_len + right_len;
11708
11709 if (unicode_modifiable(left)
11710 && PyUnicode_CheckExact(right)
11711 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011712 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11713 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011714 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011715 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011716 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11717 {
11718 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011719 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011720 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011721
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011722 /* copy 'right' into the newly allocated area of 'left' */
11723 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011724 }
Victor Stinner488fa492011-12-12 00:01:39 +010011725 else {
11726 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11727 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011728 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011729
Victor Stinner488fa492011-12-12 00:01:39 +010011730 /* Concat the two Unicode strings */
11731 res = PyUnicode_New(new_len, maxchar);
11732 if (res == NULL)
11733 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011734 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11735 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011736 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011737 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011738 }
11739 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011740 return;
11741
11742error:
Victor Stinner488fa492011-12-12 00:01:39 +010011743 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011744}
11745
11746void
11747PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11748{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011749 PyUnicode_Append(pleft, right);
11750 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011751}
11752
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011753/*
11754Wraps stringlib_parse_args_finds() and additionally ensures that the
11755first argument is a unicode object.
11756*/
11757
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011758static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011759parse_args_finds_unicode(const char * function_name, PyObject *args,
11760 PyObject **substring,
11761 Py_ssize_t *start, Py_ssize_t *end)
11762{
11763 if(stringlib_parse_args_finds(function_name, args, substring,
11764 start, end)) {
11765 if (ensure_unicode(*substring) < 0)
11766 return 0;
11767 return 1;
11768 }
11769 return 0;
11770}
11771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011772PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011773 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011775Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011776string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011777interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
11779static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011780unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011782 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011783 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011784 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011786 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011787 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011790 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 kind1 = PyUnicode_KIND(self);
11794 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011795 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011796 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 len1 = PyUnicode_GET_LENGTH(self);
11799 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011801 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011802 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011803
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011804 buf1 = PyUnicode_DATA(self);
11805 buf2 = PyUnicode_DATA(substring);
11806 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011807 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011808 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011809 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011810 }
11811 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 case PyUnicode_1BYTE_KIND:
11813 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011814 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 buf2, len2, PY_SSIZE_T_MAX
11816 );
11817 break;
11818 case PyUnicode_2BYTE_KIND:
11819 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011820 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 buf2, len2, PY_SSIZE_T_MAX
11822 );
11823 break;
11824 case PyUnicode_4BYTE_KIND:
11825 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011826 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 buf2, len2, PY_SSIZE_T_MAX
11828 );
11829 break;
11830 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011831 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 }
11833
11834 result = PyLong_FromSsize_t(iresult);
11835
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011836 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011837 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011838 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840 return result;
11841}
11842
INADA Naoki3ae20562017-01-16 20:41:20 +090011843/*[clinic input]
11844str.encode as unicode_encode
11845
11846 encoding: str(c_default="NULL") = 'utf-8'
11847 The encoding in which to encode the string.
11848 errors: str(c_default="NULL") = 'strict'
11849 The error handling scheme to use for encoding errors.
11850 The default is 'strict' meaning that encoding errors raise a
11851 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11852 'xmlcharrefreplace' as well as any other name registered with
11853 codecs.register_error that can handle UnicodeEncodeErrors.
11854
11855Encode the string using the codec registered for encoding.
11856[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857
11858static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011859unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011860/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011862 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011863}
11864
INADA Naoki3ae20562017-01-16 20:41:20 +090011865/*[clinic input]
11866str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867
INADA Naoki3ae20562017-01-16 20:41:20 +090011868 tabsize: int = 8
11869
11870Return a copy where all tab characters are expanded using spaces.
11871
11872If tabsize is not given, a tab size of 8 characters is assumed.
11873[clinic start generated code]*/
11874
11875static PyObject *
11876unicode_expandtabs_impl(PyObject *self, int tabsize)
11877/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011879 Py_ssize_t i, j, line_pos, src_len, incr;
11880 Py_UCS4 ch;
11881 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011882 const void *src_data;
11883 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011884 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011885 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886
Antoine Pitrou22425222011-10-04 19:10:51 +020011887 if (PyUnicode_READY(self) == -1)
11888 return NULL;
11889
Thomas Wouters7e474022000-07-16 12:04:32 +000011890 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011891 src_len = PyUnicode_GET_LENGTH(self);
11892 i = j = line_pos = 0;
11893 kind = PyUnicode_KIND(self);
11894 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011895 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011896 for (; i < src_len; i++) {
11897 ch = PyUnicode_READ(kind, src_data, i);
11898 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011899 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011901 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011903 goto overflow;
11904 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011905 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011906 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011909 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011910 goto overflow;
11911 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011913 if (ch == '\n' || ch == '\r')
11914 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011916 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011917 if (!found)
11918 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011919
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011921 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922 if (!u)
11923 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011924 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925
Antoine Pitroue71d5742011-10-04 15:55:09 +020011926 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927
Antoine Pitroue71d5742011-10-04 15:55:09 +020011928 for (; i < src_len; i++) {
11929 ch = PyUnicode_READ(kind, src_data, i);
11930 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011932 incr = tabsize - (line_pos % tabsize);
11933 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011934 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011935 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011937 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011939 line_pos++;
11940 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011941 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011942 if (ch == '\n' || ch == '\r')
11943 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011945 }
11946 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011947 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011948
Antoine Pitroue71d5742011-10-04 15:55:09 +020011949 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011950 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11951 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952}
11953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011954PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956\n\
11957Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011958such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959arguments start and end are interpreted as in slice notation.\n\
11960\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011961Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962
11963static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011966 /* initialize variables to prevent gcc warning */
11967 PyObject *substring = NULL;
11968 Py_ssize_t start = 0;
11969 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011970 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011972 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011975 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011978 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 if (result == -2)
11981 return NULL;
11982
Christian Heimes217cfd12007-12-02 14:31:20 +000011983 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984}
11985
11986static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011987unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011989 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011990 enum PyUnicode_Kind kind;
11991 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011992
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011993 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011994 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011996 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011997 if (PyUnicode_READY(self) == -1) {
11998 return NULL;
11999 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012000 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12001 PyErr_SetString(PyExc_IndexError, "string index out of range");
12002 return NULL;
12003 }
12004 kind = PyUnicode_KIND(self);
12005 data = PyUnicode_DATA(self);
12006 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010012007 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008}
12009
Guido van Rossumc2504932007-09-18 19:42:40 +000012010/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010012011 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000012012static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012013unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080012015 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000012016
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012017#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050012018 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012019#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 if (_PyUnicode_HASH(self) != -1)
12021 return _PyUnicode_HASH(self);
12022 if (PyUnicode_READY(self) == -1)
12023 return -1;
animalizea1d14252019-01-02 20:16:06 +080012024
Christian Heimes985ecdc2013-11-20 11:46:18 +010012025 x = _Py_HashBytes(PyUnicode_DATA(self),
12026 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000012028 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029}
12030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012031PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033\n\
oldkaa0735f2018-02-02 16:52:55 +080012034Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012035such that sub is contained within S[start:end]. Optional\n\
12036arguments start and end are interpreted as in slice notation.\n\
12037\n\
12038Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039
12040static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012043 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012044 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012045 PyObject *substring = NULL;
12046 Py_ssize_t start = 0;
12047 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012049 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012052 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012055 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 if (result == -2)
12058 return NULL;
12059
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060 if (result < 0) {
12061 PyErr_SetString(PyExc_ValueError, "substring not found");
12062 return NULL;
12063 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012064
Christian Heimes217cfd12007-12-02 14:31:20 +000012065 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066}
12067
INADA Naoki3ae20562017-01-16 20:41:20 +090012068/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012069str.isascii as unicode_isascii
12070
12071Return True if all characters in the string are ASCII, False otherwise.
12072
12073ASCII characters have code points in the range U+0000-U+007F.
12074Empty string is ASCII too.
12075[clinic start generated code]*/
12076
12077static PyObject *
12078unicode_isascii_impl(PyObject *self)
12079/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12080{
12081 if (PyUnicode_READY(self) == -1) {
12082 return NULL;
12083 }
12084 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12085}
12086
12087/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012088str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089
INADA Naoki3ae20562017-01-16 20:41:20 +090012090Return True if the string is a lowercase string, False otherwise.
12091
12092A string is lowercase if all cased characters in the string are lowercase and
12093there is at least one cased character in the string.
12094[clinic start generated code]*/
12095
12096static PyObject *
12097unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012098/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 Py_ssize_t i, length;
12101 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012102 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103 int cased;
12104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 if (PyUnicode_READY(self) == -1)
12106 return NULL;
12107 length = PyUnicode_GET_LENGTH(self);
12108 kind = PyUnicode_KIND(self);
12109 data = PyUnicode_DATA(self);
12110
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 if (length == 1)
12113 return PyBool_FromLong(
12114 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012116 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012118 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012119
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 for (i = 0; i < length; i++) {
12122 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012123
Benjamin Peterson29060642009-01-31 22:14:21 +000012124 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012125 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012126 else if (!cased && Py_UNICODE_ISLOWER(ch))
12127 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012129 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130}
12131
INADA Naoki3ae20562017-01-16 20:41:20 +090012132/*[clinic input]
12133str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134
INADA Naoki3ae20562017-01-16 20:41:20 +090012135Return True if the string is an uppercase string, False otherwise.
12136
12137A string is uppercase if all cased characters in the string are uppercase and
12138there is at least one cased character in the string.
12139[clinic start generated code]*/
12140
12141static PyObject *
12142unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012143/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 Py_ssize_t i, length;
12146 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012147 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148 int cased;
12149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 if (PyUnicode_READY(self) == -1)
12151 return NULL;
12152 length = PyUnicode_GET_LENGTH(self);
12153 kind = PyUnicode_KIND(self);
12154 data = PyUnicode_DATA(self);
12155
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157 if (length == 1)
12158 return PyBool_FromLong(
12159 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012161 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012163 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012164
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 for (i = 0; i < length; i++) {
12167 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012168
Benjamin Peterson29060642009-01-31 22:14:21 +000012169 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012170 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012171 else if (!cased && Py_UNICODE_ISUPPER(ch))
12172 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012174 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175}
12176
INADA Naoki3ae20562017-01-16 20:41:20 +090012177/*[clinic input]
12178str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179
INADA Naoki3ae20562017-01-16 20:41:20 +090012180Return True if the string is a title-cased string, False otherwise.
12181
12182In a title-cased string, upper- and title-case characters may only
12183follow uncased characters and lowercase characters only cased ones.
12184[clinic start generated code]*/
12185
12186static PyObject *
12187unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012188/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 Py_ssize_t i, length;
12191 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012192 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193 int cased, previous_is_cased;
12194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 if (PyUnicode_READY(self) == -1)
12196 return NULL;
12197 length = PyUnicode_GET_LENGTH(self);
12198 kind = PyUnicode_KIND(self);
12199 data = PyUnicode_DATA(self);
12200
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 if (length == 1) {
12203 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12204 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12205 (Py_UNICODE_ISUPPER(ch) != 0));
12206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012208 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012210 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012211
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212 cased = 0;
12213 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 for (i = 0; i < length; i++) {
12215 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012216
Benjamin Peterson29060642009-01-31 22:14:21 +000012217 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12218 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012219 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012220 previous_is_cased = 1;
12221 cased = 1;
12222 }
12223 else if (Py_UNICODE_ISLOWER(ch)) {
12224 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012225 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012226 previous_is_cased = 1;
12227 cased = 1;
12228 }
12229 else
12230 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012232 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233}
12234
INADA Naoki3ae20562017-01-16 20:41:20 +090012235/*[clinic input]
12236str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237
INADA Naoki3ae20562017-01-16 20:41:20 +090012238Return True if the string is a whitespace string, False otherwise.
12239
12240A string is whitespace if all characters in the string are whitespace and there
12241is at least one character in the string.
12242[clinic start generated code]*/
12243
12244static PyObject *
12245unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012246/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 Py_ssize_t i, length;
12249 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012250 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251
12252 if (PyUnicode_READY(self) == -1)
12253 return NULL;
12254 length = PyUnicode_GET_LENGTH(self);
12255 kind = PyUnicode_KIND(self);
12256 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 if (length == 1)
12260 return PyBool_FromLong(
12261 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012263 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012265 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 for (i = 0; i < length; i++) {
12268 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012269 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012270 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012272 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273}
12274
INADA Naoki3ae20562017-01-16 20:41:20 +090012275/*[clinic input]
12276str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012277
INADA Naoki3ae20562017-01-16 20:41:20 +090012278Return True if the string is an alphabetic string, False otherwise.
12279
12280A string is alphabetic if all characters in the string are alphabetic and there
12281is at least one character in the string.
12282[clinic start generated code]*/
12283
12284static PyObject *
12285unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012286/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012287{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 Py_ssize_t i, length;
12289 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012290 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291
12292 if (PyUnicode_READY(self) == -1)
12293 return NULL;
12294 length = PyUnicode_GET_LENGTH(self);
12295 kind = PyUnicode_KIND(self);
12296 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012297
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012298 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 if (length == 1)
12300 return PyBool_FromLong(
12301 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012302
12303 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012305 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 for (i = 0; i < length; i++) {
12308 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012309 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012310 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012311 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012312}
12313
INADA Naoki3ae20562017-01-16 20:41:20 +090012314/*[clinic input]
12315str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012316
INADA Naoki3ae20562017-01-16 20:41:20 +090012317Return True if the string is an alpha-numeric string, False otherwise.
12318
12319A string is alpha-numeric if all characters in the string are alpha-numeric and
12320there is at least one character in the string.
12321[clinic start generated code]*/
12322
12323static PyObject *
12324unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012325/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012326{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012328 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 Py_ssize_t len, i;
12330
12331 if (PyUnicode_READY(self) == -1)
12332 return NULL;
12333
12334 kind = PyUnicode_KIND(self);
12335 data = PyUnicode_DATA(self);
12336 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012337
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012338 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 if (len == 1) {
12340 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12341 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12342 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012343
12344 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012346 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 for (i = 0; i < len; i++) {
12349 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012350 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012351 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012352 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012353 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012354}
12355
INADA Naoki3ae20562017-01-16 20:41:20 +090012356/*[clinic input]
12357str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358
INADA Naoki3ae20562017-01-16 20:41:20 +090012359Return True if the string is a decimal string, False otherwise.
12360
12361A string is a decimal string if all characters in the string are decimal and
12362there is at least one character in the string.
12363[clinic start generated code]*/
12364
12365static PyObject *
12366unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012367/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 Py_ssize_t i, length;
12370 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012371 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372
12373 if (PyUnicode_READY(self) == -1)
12374 return NULL;
12375 length = PyUnicode_GET_LENGTH(self);
12376 kind = PyUnicode_KIND(self);
12377 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 if (length == 1)
12381 return PyBool_FromLong(
12382 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012384 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012386 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 for (i = 0; i < length; i++) {
12389 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012390 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012392 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393}
12394
INADA Naoki3ae20562017-01-16 20:41:20 +090012395/*[clinic input]
12396str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397
INADA Naoki3ae20562017-01-16 20:41:20 +090012398Return True if the string is a digit string, False otherwise.
12399
12400A string is a digit string if all characters in the string are digits and there
12401is at least one character in the string.
12402[clinic start generated code]*/
12403
12404static PyObject *
12405unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012406/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012407{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 Py_ssize_t i, length;
12409 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012410 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411
12412 if (PyUnicode_READY(self) == -1)
12413 return NULL;
12414 length = PyUnicode_GET_LENGTH(self);
12415 kind = PyUnicode_KIND(self);
12416 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417
Guido van Rossumd57fd912000-03-10 22:53:23 +000012418 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012419 if (length == 1) {
12420 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12421 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12422 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012423
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012424 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012426 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 for (i = 0; i < length; i++) {
12429 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012430 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012431 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012432 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012433}
12434
INADA Naoki3ae20562017-01-16 20:41:20 +090012435/*[clinic input]
12436str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437
INADA Naoki3ae20562017-01-16 20:41:20 +090012438Return True if the string is a numeric string, False otherwise.
12439
12440A string is numeric if all characters in the string are numeric and there is at
12441least one character in the string.
12442[clinic start generated code]*/
12443
12444static PyObject *
12445unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012446/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 Py_ssize_t i, length;
12449 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012450 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451
12452 if (PyUnicode_READY(self) == -1)
12453 return NULL;
12454 length = PyUnicode_GET_LENGTH(self);
12455 kind = PyUnicode_KIND(self);
12456 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 if (length == 1)
12460 return PyBool_FromLong(
12461 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012463 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012465 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 for (i = 0; i < length; i++) {
12468 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012469 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012471 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472}
12473
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012474Py_ssize_t
12475_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012478 if (PyUnicode_READY(self) == -1)
12479 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012480
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012481 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012482 if (len == 0) {
12483 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012484 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 }
12486
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012487 int kind = PyUnicode_KIND(self);
12488 const void *data = PyUnicode_DATA(self);
12489 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012490 /* PEP 3131 says that the first character must be in
12491 XID_Start and subsequent characters in XID_Continue,
12492 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012493 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012494 letters, digits, underscore). However, given the current
12495 definition of XID_Start and XID_Continue, it is sufficient
12496 to check just for these, except that _ must be allowed
12497 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012498 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012499 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012500 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012501
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012502 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012503 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012504 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012505 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012506 }
12507 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012508 return i;
12509}
12510
12511int
12512PyUnicode_IsIdentifier(PyObject *self)
12513{
12514 if (PyUnicode_IS_READY(self)) {
12515 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12516 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12517 /* an empty string is not a valid identifier */
12518 return len && i == len;
12519 }
12520 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012521_Py_COMP_DIAG_PUSH
12522_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012523 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012524 if (len == 0) {
12525 /* an empty string is not a valid identifier */
12526 return 0;
12527 }
12528
12529 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012530 Py_UCS4 ch = wstr[i++];
12531#if SIZEOF_WCHAR_T == 2
12532 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12533 && i < len
12534 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12535 {
12536 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12537 i++;
12538 }
12539#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012540 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12541 return 0;
12542 }
12543
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012544 while (i < len) {
12545 ch = wstr[i++];
12546#if SIZEOF_WCHAR_T == 2
12547 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12548 && i < len
12549 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12550 {
12551 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12552 i++;
12553 }
12554#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012555 if (!_PyUnicode_IsXidContinue(ch)) {
12556 return 0;
12557 }
12558 }
12559 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012560_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012561 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012562}
12563
INADA Naoki3ae20562017-01-16 20:41:20 +090012564/*[clinic input]
12565str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012566
INADA Naoki3ae20562017-01-16 20:41:20 +090012567Return True if the string is a valid Python identifier, False otherwise.
12568
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012569Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012570such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012571[clinic start generated code]*/
12572
12573static PyObject *
12574unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012575/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012576{
12577 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12578}
12579
INADA Naoki3ae20562017-01-16 20:41:20 +090012580/*[clinic input]
12581str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012582
INADA Naoki3ae20562017-01-16 20:41:20 +090012583Return True if the string is printable, False otherwise.
12584
12585A string is printable if all of its characters are considered printable in
12586repr() or if it is empty.
12587[clinic start generated code]*/
12588
12589static PyObject *
12590unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012591/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012592{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 Py_ssize_t i, length;
12594 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012595 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596
12597 if (PyUnicode_READY(self) == -1)
12598 return NULL;
12599 length = PyUnicode_GET_LENGTH(self);
12600 kind = PyUnicode_KIND(self);
12601 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012602
12603 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 if (length == 1)
12605 return PyBool_FromLong(
12606 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 for (i = 0; i < length; i++) {
12609 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012610 Py_RETURN_FALSE;
12611 }
12612 }
12613 Py_RETURN_TRUE;
12614}
12615
INADA Naoki3ae20562017-01-16 20:41:20 +090012616/*[clinic input]
12617str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618
INADA Naoki3ae20562017-01-16 20:41:20 +090012619 iterable: object
12620 /
12621
12622Concatenate any number of strings.
12623
Martin Panter91a88662017-01-24 00:30:06 +000012624The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012625The result is returned as a new string.
12626
12627Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12628[clinic start generated code]*/
12629
12630static PyObject *
12631unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012632/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633{
INADA Naoki3ae20562017-01-16 20:41:20 +090012634 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635}
12636
Martin v. Löwis18e16552006-02-15 17:27:45 +000012637static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012638unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 if (PyUnicode_READY(self) == -1)
12641 return -1;
12642 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643}
12644
INADA Naoki3ae20562017-01-16 20:41:20 +090012645/*[clinic input]
12646str.ljust as unicode_ljust
12647
12648 width: Py_ssize_t
12649 fillchar: Py_UCS4 = ' '
12650 /
12651
12652Return a left-justified string of length width.
12653
12654Padding is done using the specified fill character (default is a space).
12655[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656
12657static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012658unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12659/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012661 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012662 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663
Victor Stinnerc4b49542011-12-11 22:44:26 +010012664 if (PyUnicode_GET_LENGTH(self) >= width)
12665 return unicode_result_unchanged(self);
12666
12667 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012668}
12669
INADA Naoki3ae20562017-01-16 20:41:20 +090012670/*[clinic input]
12671str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672
INADA Naoki3ae20562017-01-16 20:41:20 +090012673Return a copy of the string converted to lowercase.
12674[clinic start generated code]*/
12675
12676static PyObject *
12677unicode_lower_impl(PyObject *self)
12678/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012680 if (PyUnicode_READY(self) == -1)
12681 return NULL;
12682 if (PyUnicode_IS_ASCII(self))
12683 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012684 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685}
12686
/* Strip modes accepted by the strip helpers in this file. */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* Method name for each strip mode, indexed by the constants above. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP]  = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP]  = "strip",
};

/* Look up the method name that corresponds to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012695
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012696/* externally visible for str.strip(unicode) */
12697PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012698_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012699{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012700 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 int kind;
12702 Py_ssize_t i, j, len;
12703 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012704 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12707 return NULL;
12708
12709 kind = PyUnicode_KIND(self);
12710 data = PyUnicode_DATA(self);
12711 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012712 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12714 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012715 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012716
Benjamin Peterson14339b62009-01-31 16:36:08 +000012717 i = 0;
12718 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012719 while (i < len) {
12720 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12721 if (!BLOOM(sepmask, ch))
12722 break;
12723 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12724 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 i++;
12726 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012727 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012728
Benjamin Peterson14339b62009-01-31 16:36:08 +000012729 j = len;
12730 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012731 j--;
12732 while (j >= i) {
12733 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12734 if (!BLOOM(sepmask, ch))
12735 break;
12736 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12737 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012738 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012739 }
12740
Benjamin Peterson29060642009-01-31 22:14:21 +000012741 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012742 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012743
Victor Stinner7931d9a2011-11-04 00:22:48 +010012744 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745}
12746
12747PyObject*
12748PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12749{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012750 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012752 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753
Victor Stinnerde636f32011-10-01 03:55:54 +020012754 if (PyUnicode_READY(self) == -1)
12755 return NULL;
12756
Victor Stinner684d5fd2012-05-03 02:32:34 +020012757 length = PyUnicode_GET_LENGTH(self);
12758 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012759
Victor Stinner684d5fd2012-05-03 02:32:34 +020012760 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012761 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012762
Victor Stinnerde636f32011-10-01 03:55:54 +020012763 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012764 PyErr_SetString(PyExc_IndexError, "string index out of range");
12765 return NULL;
12766 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012767 if (start >= length || end < start)
12768 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012769
Victor Stinner684d5fd2012-05-03 02:32:34 +020012770 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012771 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012772 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012773 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012774 }
12775 else {
12776 kind = PyUnicode_KIND(self);
12777 data = PyUnicode_1BYTE_DATA(self);
12778 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012779 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012780 length);
12781 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012783
12784static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012785do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012787 Py_ssize_t len, i, j;
12788
12789 if (PyUnicode_READY(self) == -1)
12790 return NULL;
12791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012793
Victor Stinnercc7af722013-04-09 22:39:24 +020012794 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012795 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012796
12797 i = 0;
12798 if (striptype != RIGHTSTRIP) {
12799 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012800 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012801 if (!_Py_ascii_whitespace[ch])
12802 break;
12803 i++;
12804 }
12805 }
12806
12807 j = len;
12808 if (striptype != LEFTSTRIP) {
12809 j--;
12810 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012811 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012812 if (!_Py_ascii_whitespace[ch])
12813 break;
12814 j--;
12815 }
12816 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012817 }
12818 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012819 else {
12820 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012821 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012822
Victor Stinnercc7af722013-04-09 22:39:24 +020012823 i = 0;
12824 if (striptype != RIGHTSTRIP) {
12825 while (i < len) {
12826 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12827 if (!Py_UNICODE_ISSPACE(ch))
12828 break;
12829 i++;
12830 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012831 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012832
12833 j = len;
12834 if (striptype != LEFTSTRIP) {
12835 j--;
12836 while (j >= i) {
12837 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12838 if (!Py_UNICODE_ISSPACE(ch))
12839 break;
12840 j--;
12841 }
12842 j++;
12843 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012844 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012845
Victor Stinner7931d9a2011-11-04 00:22:48 +010012846 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847}
12848
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012849
12850static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012851do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012852{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012853 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012854 if (PyUnicode_Check(sep))
12855 return _PyUnicode_XStrip(self, striptype, sep);
12856 else {
12857 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012858 "%s arg must be None or str",
12859 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012860 return NULL;
12861 }
12862 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012863
Benjamin Peterson14339b62009-01-31 16:36:08 +000012864 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012865}
12866
12867
INADA Naoki3ae20562017-01-16 20:41:20 +090012868/*[clinic input]
12869str.strip as unicode_strip
12870
12871 chars: object = None
12872 /
12873
Zachary Ware09895c22019-10-09 16:09:00 -050012874Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012875
12876If chars is given and not None, remove characters in chars instead.
12877[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012878
12879static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012880unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012881/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012882{
INADA Naoki3ae20562017-01-16 20:41:20 +090012883 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012884}
12885
12886
INADA Naoki3ae20562017-01-16 20:41:20 +090012887/*[clinic input]
12888str.lstrip as unicode_lstrip
12889
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012890 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012891 /
12892
12893Return a copy of the string with leading whitespace removed.
12894
12895If chars is given and not None, remove characters in chars instead.
12896[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012897
12898static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012899unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012900/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012901{
INADA Naoki3ae20562017-01-16 20:41:20 +090012902 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012903}
12904
12905
INADA Naoki3ae20562017-01-16 20:41:20 +090012906/*[clinic input]
12907str.rstrip as unicode_rstrip
12908
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012909 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012910 /
12911
12912Return a copy of the string with trailing whitespace removed.
12913
12914If chars is given and not None, remove characters in chars instead.
12915[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012916
12917static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012918unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012919/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012920{
INADA Naoki3ae20562017-01-16 20:41:20 +090012921 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012922}
12923
12924
Guido van Rossumd57fd912000-03-10 22:53:23 +000012925static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012926unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012928 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930
Serhiy Storchaka05997252013-01-26 12:14:02 +020012931 if (len < 1)
12932 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933
Victor Stinnerc4b49542011-12-11 22:44:26 +010012934 /* no repeat, return original string */
12935 if (len == 1)
12936 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012937
Benjamin Petersonbac79492012-01-14 13:34:47 -050012938 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 return NULL;
12940
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012941 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012942 PyErr_SetString(PyExc_OverflowError,
12943 "repeated string is too long");
12944 return NULL;
12945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012947
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012948 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949 if (!u)
12950 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012951 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012954 int kind = PyUnicode_KIND(str);
12955 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012956 if (kind == PyUnicode_1BYTE_KIND) {
12957 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012958 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012959 }
12960 else if (kind == PyUnicode_2BYTE_KIND) {
12961 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012962 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012963 ucs2[n] = fill_char;
12964 } else {
12965 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12966 assert(kind == PyUnicode_4BYTE_KIND);
12967 for (n = 0; n < len; ++n)
12968 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012969 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 }
12971 else {
12972 /* number of characters copied this far */
12973 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012974 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012976 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012978 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012980 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012981 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983 }
12984
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012985 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012986 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987}
12988
Alexander Belopolsky40018472011-02-26 01:02:56 +000012989PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012990PyUnicode_Replace(PyObject *str,
12991 PyObject *substr,
12992 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012993 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012994{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012995 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12996 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012997 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012998 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012999}
13000
INADA Naoki3ae20562017-01-16 20:41:20 +090013001/*[clinic input]
13002str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000013003
INADA Naoki3ae20562017-01-16 20:41:20 +090013004 old: unicode
13005 new: unicode
13006 count: Py_ssize_t = -1
13007 Maximum number of occurrences to replace.
13008 -1 (the default value) means replace all occurrences.
13009 /
13010
13011Return a copy with all occurrences of substring old replaced by new.
13012
13013If the optional argument count is given, only the first count occurrences are
13014replaced.
13015[clinic start generated code]*/
13016
13017static PyObject *
13018unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13019 Py_ssize_t count)
13020/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013021{
Benjamin Peterson22a29702012-01-02 09:00:30 -060013022 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013023 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090013024 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013025}
13026
sweeneydea81849b2020-04-22 17:05:48 -040013027/*[clinic input]
13028str.removeprefix as unicode_removeprefix
13029
13030 prefix: unicode
13031 /
13032
13033Return a str with the given prefix string removed if present.
13034
13035If the string starts with the prefix string, return string[len(prefix):].
13036Otherwise, return a copy of the original string.
13037[clinic start generated code]*/
13038
13039static PyObject *
13040unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13041/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13042{
13043 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13044 if (match == -1) {
13045 return NULL;
13046 }
13047 if (match) {
13048 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13049 PyUnicode_GET_LENGTH(self));
13050 }
13051 return unicode_result_unchanged(self);
13052}
13053
13054/*[clinic input]
13055str.removesuffix as unicode_removesuffix
13056
13057 suffix: unicode
13058 /
13059
13060Return a str with the given suffix string removed if present.
13061
13062If the string ends with the suffix string and that suffix is not empty,
13063return string[:-len(suffix)]. Otherwise, return a copy of the original
13064string.
13065[clinic start generated code]*/
13066
13067static PyObject *
13068unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13069/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13070{
13071 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13072 if (match == -1) {
13073 return NULL;
13074 }
13075 if (match) {
13076 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13077 - PyUnicode_GET_LENGTH(suffix));
13078 }
13079 return unicode_result_unchanged(self);
13080}
13081
Alexander Belopolsky40018472011-02-26 01:02:56 +000013082static PyObject *
13083unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013085 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013086 Py_ssize_t isize;
13087 Py_ssize_t osize, squote, dquote, i, o;
13088 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013089 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013090 const void *idata;
13091 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013093 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013094 return NULL;
13095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013096 isize = PyUnicode_GET_LENGTH(unicode);
13097 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099 /* Compute length of output, quote characters, and
13100 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013101 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013102 max = 127;
13103 squote = dquote = 0;
13104 ikind = PyUnicode_KIND(unicode);
13105 for (i = 0; i < isize; i++) {
13106 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013107 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013108 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013109 case '\'': squote++; break;
13110 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013111 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013112 incr = 2;
13113 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013114 default:
13115 /* Fast-path ASCII */
13116 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013117 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013118 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013119 ;
13120 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013121 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013123 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013124 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013125 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013127 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013128 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013129 if (osize > PY_SSIZE_T_MAX - incr) {
13130 PyErr_SetString(PyExc_OverflowError,
13131 "string is too long to generate repr");
13132 return NULL;
13133 }
13134 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013135 }
13136
13137 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013138 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013139 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013140 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013141 if (dquote)
13142 /* Both squote and dquote present. Use squote,
13143 and escape them */
13144 osize += squote;
13145 else
13146 quote = '"';
13147 }
Victor Stinner55c08782013-04-14 18:45:39 +020013148 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149
13150 repr = PyUnicode_New(osize, max);
13151 if (repr == NULL)
13152 return NULL;
13153 okind = PyUnicode_KIND(repr);
13154 odata = PyUnicode_DATA(repr);
13155
13156 PyUnicode_WRITE(okind, odata, 0, quote);
13157 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013158 if (unchanged) {
13159 _PyUnicode_FastCopyCharacters(repr, 1,
13160 unicode, 0,
13161 isize);
13162 }
13163 else {
13164 for (i = 0, o = 1; i < isize; i++) {
13165 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013166
Victor Stinner55c08782013-04-14 18:45:39 +020013167 /* Escape quotes and backslashes */
13168 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013169 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013170 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013171 continue;
13172 }
13173
13174 /* Map special whitespace to '\t', \n', '\r' */
13175 if (ch == '\t') {
13176 PyUnicode_WRITE(okind, odata, o++, '\\');
13177 PyUnicode_WRITE(okind, odata, o++, 't');
13178 }
13179 else if (ch == '\n') {
13180 PyUnicode_WRITE(okind, odata, o++, '\\');
13181 PyUnicode_WRITE(okind, odata, o++, 'n');
13182 }
13183 else if (ch == '\r') {
13184 PyUnicode_WRITE(okind, odata, o++, '\\');
13185 PyUnicode_WRITE(okind, odata, o++, 'r');
13186 }
13187
13188 /* Map non-printable US ASCII to '\xhh' */
13189 else if (ch < ' ' || ch == 0x7F) {
13190 PyUnicode_WRITE(okind, odata, o++, '\\');
13191 PyUnicode_WRITE(okind, odata, o++, 'x');
13192 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13193 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13194 }
13195
13196 /* Copy ASCII characters as-is */
13197 else if (ch < 0x7F) {
13198 PyUnicode_WRITE(okind, odata, o++, ch);
13199 }
13200
13201 /* Non-ASCII characters */
13202 else {
13203 /* Map Unicode whitespace and control characters
13204 (categories Z* and C* except ASCII space)
13205 */
13206 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13207 PyUnicode_WRITE(okind, odata, o++, '\\');
13208 /* Map 8-bit characters to '\xhh' */
13209 if (ch <= 0xff) {
13210 PyUnicode_WRITE(okind, odata, o++, 'x');
13211 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13212 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13213 }
13214 /* Map 16-bit characters to '\uxxxx' */
13215 else if (ch <= 0xffff) {
13216 PyUnicode_WRITE(okind, odata, o++, 'u');
13217 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13218 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13219 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13220 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13221 }
13222 /* Map 21-bit characters to '\U00xxxxxx' */
13223 else {
13224 PyUnicode_WRITE(okind, odata, o++, 'U');
13225 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13226 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13227 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13228 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13229 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13230 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13231 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13232 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13233 }
13234 }
13235 /* Copy characters as-is */
13236 else {
13237 PyUnicode_WRITE(okind, odata, o++, ch);
13238 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013239 }
13240 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013241 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013242 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013243 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013244 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245}
13246
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013247PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013248 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249\n\
13250Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013251such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252arguments start and end are interpreted as in slice notation.\n\
13253\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013254Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255
13256static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013258{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013259 /* initialize variables to prevent gcc warning */
13260 PyObject *substring = NULL;
13261 Py_ssize_t start = 0;
13262 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013263 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013264
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013265 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013268 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013269 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013270
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013271 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013273 if (result == -2)
13274 return NULL;
13275
Christian Heimes217cfd12007-12-02 14:31:20 +000013276 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277}
13278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013279PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013280 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013282Return the highest index in S where substring sub is found,\n\
13283such that sub is contained within S[start:end]. Optional\n\
13284arguments start and end are interpreted as in slice notation.\n\
13285\n\
13286Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287
13288static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013289unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013290{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013291 /* initialize variables to prevent gcc warning */
13292 PyObject *substring = NULL;
13293 Py_ssize_t start = 0;
13294 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013295 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013297 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013299
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013300 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013303 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013305 if (result == -2)
13306 return NULL;
13307
Guido van Rossumd57fd912000-03-10 22:53:23 +000013308 if (result < 0) {
13309 PyErr_SetString(PyExc_ValueError, "substring not found");
13310 return NULL;
13311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013312
Christian Heimes217cfd12007-12-02 14:31:20 +000013313 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013314}
13315
INADA Naoki3ae20562017-01-16 20:41:20 +090013316/*[clinic input]
13317str.rjust as unicode_rjust
13318
13319 width: Py_ssize_t
13320 fillchar: Py_UCS4 = ' '
13321 /
13322
13323Return a right-justified string of length width.
13324
13325Padding is done using the specified fill character (default is a space).
13326[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327
13328static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013329unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13330/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013331{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013332 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013333 return NULL;
13334
Victor Stinnerc4b49542011-12-11 22:44:26 +010013335 if (PyUnicode_GET_LENGTH(self) >= width)
13336 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013337
Victor Stinnerc4b49542011-12-11 22:44:26 +010013338 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339}
13340
Alexander Belopolsky40018472011-02-26 01:02:56 +000013341PyObject *
13342PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013344 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013345 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013346
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013347 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013348}
13349
INADA Naoki3ae20562017-01-16 20:41:20 +090013350/*[clinic input]
13351str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013352
INADA Naoki3ae20562017-01-16 20:41:20 +090013353 sep: object = None
13354 The delimiter according which to split the string.
13355 None (the default value) means split according to any whitespace,
13356 and discard empty strings from the result.
13357 maxsplit: Py_ssize_t = -1
13358 Maximum number of splits to do.
13359 -1 (the default value) means no limit.
13360
13361Return a list of the words in the string, using sep as the delimiter string.
13362[clinic start generated code]*/
13363
13364static PyObject *
13365unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13366/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367{
INADA Naoki3ae20562017-01-16 20:41:20 +090013368 if (sep == Py_None)
13369 return split(self, NULL, maxsplit);
13370 if (PyUnicode_Check(sep))
13371 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013372
Victor Stinner998b8062018-09-12 00:23:25 +020013373 PyErr_Format(PyExc_TypeError,
13374 "must be str or None, not %.100s",
13375 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013376 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377}
13378
Thomas Wouters477c8d52006-05-27 19:21:47 +000013379PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013380PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013381{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013382 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013383 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013384 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013385 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013386
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013387 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013388 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013389
Victor Stinner14f8f022011-10-05 20:58:25 +020013390 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013391 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013392 len1 = PyUnicode_GET_LENGTH(str_obj);
13393 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013394 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013395 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013396 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013397 }
13398 buf1 = PyUnicode_DATA(str_obj);
13399 buf2 = PyUnicode_DATA(sep_obj);
13400 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013401 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013402 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013403 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013405
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013406 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013407 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013408 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13409 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13410 else
13411 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013412 break;
13413 case PyUnicode_2BYTE_KIND:
13414 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13415 break;
13416 case PyUnicode_4BYTE_KIND:
13417 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13418 break;
13419 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013420 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013421 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013422
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013423 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013424 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013425 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013426
13427 return out;
13428}
13429
13430
13431PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013432PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013433{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013434 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013435 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013436 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013437 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013438
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013439 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013441
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013442 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013443 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013444 len1 = PyUnicode_GET_LENGTH(str_obj);
13445 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013446 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013447 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013448 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013449 }
13450 buf1 = PyUnicode_DATA(str_obj);
13451 buf2 = PyUnicode_DATA(sep_obj);
13452 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013453 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013454 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013455 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013457
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013458 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013459 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013460 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13461 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13462 else
13463 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013464 break;
13465 case PyUnicode_2BYTE_KIND:
13466 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13467 break;
13468 case PyUnicode_4BYTE_KIND:
13469 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13470 break;
13471 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013472 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013473 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013474
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013475 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013476 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013477 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013478
13479 return out;
13480}
13481
INADA Naoki3ae20562017-01-16 20:41:20 +090013482/*[clinic input]
13483str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013484
INADA Naoki3ae20562017-01-16 20:41:20 +090013485 sep: object
13486 /
13487
13488Partition the string into three parts using the given separator.
13489
13490This will search for the separator in the string. If the separator is found,
13491returns a 3-tuple containing the part before the separator, the separator
13492itself, and the part after it.
13493
13494If the separator is not found, returns a 3-tuple containing the original string
13495and two empty strings.
13496[clinic start generated code]*/
13497
13498static PyObject *
13499unicode_partition(PyObject *self, PyObject *sep)
13500/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013501{
INADA Naoki3ae20562017-01-16 20:41:20 +090013502 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013503}
13504
INADA Naoki3ae20562017-01-16 20:41:20 +090013505/*[clinic input]
13506str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013507
INADA Naoki3ae20562017-01-16 20:41:20 +090013508Partition the string into three parts using the given separator.
13509
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013510This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013511the separator is found, returns a 3-tuple containing the part before the
13512separator, the separator itself, and the part after it.
13513
13514If the separator is not found, returns a 3-tuple containing two empty strings
13515and the original string.
13516[clinic start generated code]*/
13517
13518static PyObject *
13519unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013520/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013521{
INADA Naoki3ae20562017-01-16 20:41:20 +090013522 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013523}
13524
Alexander Belopolsky40018472011-02-26 01:02:56 +000013525PyObject *
13526PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013527{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013528 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013529 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013530
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013531 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013532}
13533
INADA Naoki3ae20562017-01-16 20:41:20 +090013534/*[clinic input]
13535str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013536
INADA Naoki3ae20562017-01-16 20:41:20 +090013537Return a list of the words in the string, using sep as the delimiter string.
13538
13539Splits are done starting at the end of the string and working to the front.
13540[clinic start generated code]*/
13541
13542static PyObject *
13543unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13544/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013545{
INADA Naoki3ae20562017-01-16 20:41:20 +090013546 if (sep == Py_None)
13547 return rsplit(self, NULL, maxsplit);
13548 if (PyUnicode_Check(sep))
13549 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013550
Victor Stinner998b8062018-09-12 00:23:25 +020013551 PyErr_Format(PyExc_TypeError,
13552 "must be str or None, not %.100s",
13553 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013554 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013555}
13556
INADA Naoki3ae20562017-01-16 20:41:20 +090013557/*[clinic input]
13558str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013559
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013560 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013561
13562Return a list of the lines in the string, breaking at line boundaries.
13563
13564Line breaks are not included in the resulting list unless keepends is given and
13565true.
13566[clinic start generated code]*/
13567
13568static PyObject *
13569unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013570/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013571{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013572 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013573}
13574
13575static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013576PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013577{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013578 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013579}
13580
INADA Naoki3ae20562017-01-16 20:41:20 +090013581/*[clinic input]
13582str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013583
INADA Naoki3ae20562017-01-16 20:41:20 +090013584Convert uppercase characters to lowercase and lowercase characters to uppercase.
13585[clinic start generated code]*/
13586
13587static PyObject *
13588unicode_swapcase_impl(PyObject *self)
13589/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013590{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013591 if (PyUnicode_READY(self) == -1)
13592 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013593 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013594}
13595
Larry Hastings61272b72014-01-07 12:41:53 -080013596/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013597
Larry Hastings31826802013-10-19 00:09:25 -070013598@staticmethod
13599str.maketrans as unicode_maketrans
13600
13601 x: object
13602
13603 y: unicode=NULL
13604
13605 z: unicode=NULL
13606
13607 /
13608
13609Return a translation table usable for str.translate().
13610
13611If there is only one argument, it must be a dictionary mapping Unicode
13612ordinals (integers) or characters to Unicode ordinals, strings or None.
13613Character keys will be then converted to ordinals.
13614If there are two arguments, they must be strings of equal length, and
13615in the resulting dictionary, each character in x will be mapped to the
13616character at the same position in y. If there is a third argument, it
13617must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013618[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013619
Larry Hastings31826802013-10-19 00:09:25 -070013620static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013621unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013622/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013623{
Georg Brandlceee0772007-11-27 23:48:05 +000013624 PyObject *new = NULL, *key, *value;
13625 Py_ssize_t i = 0;
13626 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013627
Georg Brandlceee0772007-11-27 23:48:05 +000013628 new = PyDict_New();
13629 if (!new)
13630 return NULL;
13631 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013632 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013633 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013634
Georg Brandlceee0772007-11-27 23:48:05 +000013635 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013636 if (!PyUnicode_Check(x)) {
13637 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13638 "be a string if there is a second argument");
13639 goto err;
13640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013641 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013642 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13643 "arguments must have equal length");
13644 goto err;
13645 }
13646 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013647 x_kind = PyUnicode_KIND(x);
13648 y_kind = PyUnicode_KIND(y);
13649 x_data = PyUnicode_DATA(x);
13650 y_data = PyUnicode_DATA(y);
13651 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13652 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013653 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013654 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013655 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013656 if (!value) {
13657 Py_DECREF(key);
13658 goto err;
13659 }
Georg Brandlceee0772007-11-27 23:48:05 +000013660 res = PyDict_SetItem(new, key, value);
13661 Py_DECREF(key);
13662 Py_DECREF(value);
13663 if (res < 0)
13664 goto err;
13665 }
13666 /* create entries for deleting chars in z */
13667 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013668 z_kind = PyUnicode_KIND(z);
13669 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013670 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013671 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013672 if (!key)
13673 goto err;
13674 res = PyDict_SetItem(new, key, Py_None);
13675 Py_DECREF(key);
13676 if (res < 0)
13677 goto err;
13678 }
13679 }
13680 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013681 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013682 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013683
Georg Brandlceee0772007-11-27 23:48:05 +000013684 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013685 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013686 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13687 "to maketrans it must be a dict");
13688 goto err;
13689 }
13690 /* copy entries into the new dict, converting string keys to int keys */
13691 while (PyDict_Next(x, &i, &key, &value)) {
13692 if (PyUnicode_Check(key)) {
13693 /* convert string keys to integer keys */
13694 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013695 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013696 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13697 "table must be of length 1");
13698 goto err;
13699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013700 kind = PyUnicode_KIND(key);
13701 data = PyUnicode_DATA(key);
13702 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013703 if (!newkey)
13704 goto err;
13705 res = PyDict_SetItem(new, newkey, value);
13706 Py_DECREF(newkey);
13707 if (res < 0)
13708 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013709 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013710 /* just keep integer keys */
13711 if (PyDict_SetItem(new, key, value) < 0)
13712 goto err;
13713 } else {
13714 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13715 "be strings or integers");
13716 goto err;
13717 }
13718 }
13719 }
13720 return new;
13721 err:
13722 Py_DECREF(new);
13723 return NULL;
13724}
13725
INADA Naoki3ae20562017-01-16 20:41:20 +090013726/*[clinic input]
13727str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013728
INADA Naoki3ae20562017-01-16 20:41:20 +090013729 table: object
13730 Translation table, which must be a mapping of Unicode ordinals to
13731 Unicode ordinals, strings, or None.
13732 /
13733
13734Replace each character in the string using the given translation table.
13735
13736The table must implement lookup/indexing via __getitem__, for instance a
13737dictionary or list. If this operation raises LookupError, the character is
13738left untouched. Characters mapped to None are deleted.
13739[clinic start generated code]*/
13740
13741static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013742unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013743/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013745 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013746}
13747
INADA Naoki3ae20562017-01-16 20:41:20 +090013748/*[clinic input]
13749str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013750
INADA Naoki3ae20562017-01-16 20:41:20 +090013751Return a copy of the string converted to uppercase.
13752[clinic start generated code]*/
13753
13754static PyObject *
13755unicode_upper_impl(PyObject *self)
13756/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013757{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013758 if (PyUnicode_READY(self) == -1)
13759 return NULL;
13760 if (PyUnicode_IS_ASCII(self))
13761 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013762 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013763}
13764
INADA Naoki3ae20562017-01-16 20:41:20 +090013765/*[clinic input]
13766str.zfill as unicode_zfill
13767
13768 width: Py_ssize_t
13769 /
13770
13771Pad a numeric string with zeros on the left, to fill a field of the given width.
13772
13773The string is never truncated.
13774[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013775
13776static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013777unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013778/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013779{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013780 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013781 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013782 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013783 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013784 Py_UCS4 chr;
13785
Benjamin Petersonbac79492012-01-14 13:34:47 -050013786 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013787 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013788
Victor Stinnerc4b49542011-12-11 22:44:26 +010013789 if (PyUnicode_GET_LENGTH(self) >= width)
13790 return unicode_result_unchanged(self);
13791
13792 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013793
13794 u = pad(self, fill, 0, '0');
13795
Walter Dörwald068325e2002-04-15 13:36:47 +000013796 if (u == NULL)
13797 return NULL;
13798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013799 kind = PyUnicode_KIND(u);
13800 data = PyUnicode_DATA(u);
13801 chr = PyUnicode_READ(kind, data, fill);
13802
13803 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013804 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013805 PyUnicode_WRITE(kind, data, 0, chr);
13806 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013807 }
13808
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013809 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013810 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013811}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013812
#if 0
/* Compiled out by the enclosing "#if 0".  When enabled this simply
   forwards self to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013821PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013822 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013823\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013824Return True if S starts with the specified prefix, False otherwise.\n\
13825With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013826With optional end, stop comparing S at that position.\n\
13827prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013828
13829static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013830unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013831 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013832{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013833 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013834 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013835 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013836 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013837 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013838
Jesus Ceaac451502011-04-20 17:09:23 +020013839 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013840 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013841 if (PyTuple_Check(subobj)) {
13842 Py_ssize_t i;
13843 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013844 substring = PyTuple_GET_ITEM(subobj, i);
13845 if (!PyUnicode_Check(substring)) {
13846 PyErr_Format(PyExc_TypeError,
13847 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013848 "not %.100s",
13849 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013850 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013851 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013852 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013853 if (result == -1)
13854 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013855 if (result) {
13856 Py_RETURN_TRUE;
13857 }
13858 }
13859 /* nothing matched */
13860 Py_RETURN_FALSE;
13861 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013862 if (!PyUnicode_Check(subobj)) {
13863 PyErr_Format(PyExc_TypeError,
13864 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013865 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013866 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013867 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013868 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013869 if (result == -1)
13870 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013871 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013872}
13873
13874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013875PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013876 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013877\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013878Return True if S ends with the specified suffix, False otherwise.\n\
13879With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013880With optional end, stop comparing S at that position.\n\
13881suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013882
13883static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013884unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013885 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013886{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013887 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013888 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013889 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013890 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013891 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013892
Jesus Ceaac451502011-04-20 17:09:23 +020013893 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013894 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013895 if (PyTuple_Check(subobj)) {
13896 Py_ssize_t i;
13897 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013898 substring = PyTuple_GET_ITEM(subobj, i);
13899 if (!PyUnicode_Check(substring)) {
13900 PyErr_Format(PyExc_TypeError,
13901 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013902 "not %.100s",
13903 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013904 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013905 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013906 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013907 if (result == -1)
13908 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013909 if (result) {
13910 Py_RETURN_TRUE;
13911 }
13912 }
13913 Py_RETURN_FALSE;
13914 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013915 if (!PyUnicode_Check(subobj)) {
13916 PyErr_Format(PyExc_TypeError,
13917 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013918 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013919 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013920 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013921 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013922 if (result == -1)
13923 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013924 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013925}
13926
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013927static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013928_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013929{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013930 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13931 writer->data = PyUnicode_DATA(writer->buffer);
13932
13933 if (!writer->readonly) {
13934 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013935 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013936 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013937 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013938 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13939 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13940 writer->kind = PyUnicode_WCHAR_KIND;
13941 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13942
Victor Stinner8f674cc2013-04-17 23:02:17 +020013943 /* Copy-on-write mode: set buffer size to 0 so
13944 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13945 * next write. */
13946 writer->size = 0;
13947 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013948}
13949
Victor Stinnerd3f08822012-05-29 12:57:52 +020013950void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013951_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013952{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013953 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013954
13955 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013956 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013957
13958 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13959 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13960 writer->kind = PyUnicode_WCHAR_KIND;
13961 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013962}
13963
Inada Naoki770847a2019-06-24 12:30:24 +090013964// Initialize _PyUnicodeWriter with initial buffer
13965static inline void
13966_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13967{
13968 memset(writer, 0, sizeof(*writer));
13969 writer->buffer = buffer;
13970 _PyUnicodeWriter_Update(writer);
13971 writer->min_length = writer->size;
13972}
13973
Victor Stinnerd3f08822012-05-29 12:57:52 +020013974int
13975_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13976 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013977{
13978 Py_ssize_t newlen;
13979 PyObject *newbuffer;
13980
Victor Stinner2740e462016-09-06 16:58:36 -070013981 assert(maxchar <= MAX_UNICODE);
13982
Victor Stinnerca9381e2015-09-22 00:58:32 +020013983 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013984 assert((maxchar > writer->maxchar && length >= 0)
13985 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013986
Victor Stinner202fdca2012-05-07 12:47:02 +020013987 if (length > PY_SSIZE_T_MAX - writer->pos) {
13988 PyErr_NoMemory();
13989 return -1;
13990 }
13991 newlen = writer->pos + length;
13992
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013993 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013994
Victor Stinnerd3f08822012-05-29 12:57:52 +020013995 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013996 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013997 if (writer->overallocate
13998 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13999 /* overallocate to limit the number of realloc() */
14000 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014001 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014002 if (newlen < writer->min_length)
14003 newlen = writer->min_length;
14004
Victor Stinnerd3f08822012-05-29 12:57:52 +020014005 writer->buffer = PyUnicode_New(newlen, maxchar);
14006 if (writer->buffer == NULL)
14007 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014008 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014009 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010014010 if (writer->overallocate
14011 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14012 /* overallocate to limit the number of realloc() */
14013 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014014 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014015 if (newlen < writer->min_length)
14016 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014017
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014018 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020014019 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030014020 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020014021 newbuffer = PyUnicode_New(newlen, maxchar);
14022 if (newbuffer == NULL)
14023 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014024 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14025 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020014026 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014027 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020014028 }
14029 else {
14030 newbuffer = resize_compact(writer->buffer, newlen);
14031 if (newbuffer == NULL)
14032 return -1;
14033 }
14034 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020014035 }
14036 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014037 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014038 newbuffer = PyUnicode_New(writer->size, maxchar);
14039 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014040 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014041 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14042 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014043 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014044 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014045 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014046 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014047
14048#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014049}
14050
Victor Stinnerca9381e2015-09-22 00:58:32 +020014051int
14052_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14053 enum PyUnicode_Kind kind)
14054{
14055 Py_UCS4 maxchar;
14056
14057 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14058 assert(writer->kind < kind);
14059
14060 switch (kind)
14061 {
14062 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14063 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14064 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
14065 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014066 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014067 }
14068
14069 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14070}
14071
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014072static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014073_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014074{
Victor Stinner2740e462016-09-06 16:58:36 -070014075 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014076 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14077 return -1;
14078 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14079 writer->pos++;
14080 return 0;
14081}
14082
14083int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014084_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14085{
14086 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14087}
14088
14089int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014090_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14091{
14092 Py_UCS4 maxchar;
14093 Py_ssize_t len;
14094
14095 if (PyUnicode_READY(str) == -1)
14096 return -1;
14097 len = PyUnicode_GET_LENGTH(str);
14098 if (len == 0)
14099 return 0;
14100 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14101 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014102 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014103 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014104 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014105 Py_INCREF(str);
14106 writer->buffer = str;
14107 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014108 writer->pos += len;
14109 return 0;
14110 }
14111 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14112 return -1;
14113 }
14114 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14115 str, 0, len);
14116 writer->pos += len;
14117 return 0;
14118}
14119
Victor Stinnere215d962012-10-06 23:03:36 +020014120int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014121_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14122 Py_ssize_t start, Py_ssize_t end)
14123{
14124 Py_UCS4 maxchar;
14125 Py_ssize_t len;
14126
14127 if (PyUnicode_READY(str) == -1)
14128 return -1;
14129
14130 assert(0 <= start);
14131 assert(end <= PyUnicode_GET_LENGTH(str));
14132 assert(start <= end);
14133
14134 if (end == 0)
14135 return 0;
14136
14137 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14138 return _PyUnicodeWriter_WriteStr(writer, str);
14139
14140 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14141 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14142 else
14143 maxchar = writer->maxchar;
14144 len = end - start;
14145
14146 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14147 return -1;
14148
14149 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14150 str, start, len);
14151 writer->pos += len;
14152 return 0;
14153}
14154
14155int
Victor Stinner4a587072013-11-19 12:54:53 +010014156_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14157 const char *ascii, Py_ssize_t len)
14158{
14159 if (len == -1)
14160 len = strlen(ascii);
14161
Andy Lestere6be9b52020-02-11 20:28:35 -060014162 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014163
14164 if (writer->buffer == NULL && !writer->overallocate) {
14165 PyObject *str;
14166
14167 str = _PyUnicode_FromASCII(ascii, len);
14168 if (str == NULL)
14169 return -1;
14170
14171 writer->readonly = 1;
14172 writer->buffer = str;
14173 _PyUnicodeWriter_Update(writer);
14174 writer->pos += len;
14175 return 0;
14176 }
14177
14178 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14179 return -1;
14180
14181 switch (writer->kind)
14182 {
14183 case PyUnicode_1BYTE_KIND:
14184 {
14185 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14186 Py_UCS1 *data = writer->data;
14187
Christian Heimesf051e432016-09-13 20:22:02 +020014188 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014189 break;
14190 }
14191 case PyUnicode_2BYTE_KIND:
14192 {
14193 _PyUnicode_CONVERT_BYTES(
14194 Py_UCS1, Py_UCS2,
14195 ascii, ascii + len,
14196 (Py_UCS2 *)writer->data + writer->pos);
14197 break;
14198 }
14199 case PyUnicode_4BYTE_KIND:
14200 {
14201 _PyUnicode_CONVERT_BYTES(
14202 Py_UCS1, Py_UCS4,
14203 ascii, ascii + len,
14204 (Py_UCS4 *)writer->data + writer->pos);
14205 break;
14206 }
14207 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014208 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014209 }
14210
14211 writer->pos += len;
14212 return 0;
14213}
14214
14215int
14216_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14217 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014218{
14219 Py_UCS4 maxchar;
14220
Andy Lestere6be9b52020-02-11 20:28:35 -060014221 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014222 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14223 return -1;
14224 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14225 writer->pos += len;
14226 return 0;
14227}
14228
Victor Stinnerd3f08822012-05-29 12:57:52 +020014229PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014230_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014231{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014232 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014233
Victor Stinnerd3f08822012-05-29 12:57:52 +020014234 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014235 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014236 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014237 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014238
14239 str = writer->buffer;
14240 writer->buffer = NULL;
14241
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014242 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014243 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14244 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014245 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014246
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014247 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14248 PyObject *str2;
14249 str2 = resize_compact(str, writer->pos);
14250 if (str2 == NULL) {
14251 Py_DECREF(str);
14252 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014253 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014254 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014255 }
14256
Victor Stinner15a0bd32013-07-08 22:29:55 +020014257 assert(_PyUnicode_CheckConsistency(str, 1));
14258 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014259}
14260
Victor Stinnerd3f08822012-05-29 12:57:52 +020014261void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014262_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014263{
14264 Py_CLEAR(writer->buffer);
14265}
14266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014267#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014268
14269PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014270 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014271\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014272Return a formatted version of S, using substitutions from args and kwargs.\n\
14273The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014274
Eric Smith27bbca62010-11-04 17:06:58 +000014275PyDoc_STRVAR(format_map__doc__,
14276 "S.format_map(mapping) -> str\n\
14277\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014278Return a formatted version of S, using substitutions from mapping.\n\
14279The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014280
INADA Naoki3ae20562017-01-16 20:41:20 +090014281/*[clinic input]
14282str.__format__ as unicode___format__
14283
14284 format_spec: unicode
14285 /
14286
14287Return a formatted version of the string as described by format_spec.
14288[clinic start generated code]*/
14289
Eric Smith4a7d76d2008-05-30 18:10:19 +000014290static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014291unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014292/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014293{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014294 _PyUnicodeWriter writer;
14295 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014296
Victor Stinnerd3f08822012-05-29 12:57:52 +020014297 if (PyUnicode_READY(self) == -1)
14298 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014299 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014300 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14301 self, format_spec, 0,
14302 PyUnicode_GET_LENGTH(format_spec));
14303 if (ret == -1) {
14304 _PyUnicodeWriter_Dealloc(&writer);
14305 return NULL;
14306 }
14307 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014308}
14309
INADA Naoki3ae20562017-01-16 20:41:20 +090014310/*[clinic input]
14311str.__sizeof__ as unicode_sizeof
14312
14313Return the size of the string in memory, in bytes.
14314[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014315
14316static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014317unicode_sizeof_impl(PyObject *self)
14318/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014320 Py_ssize_t size;
14321
14322 /* If it's a compact object, account for base structure +
14323 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014324 if (PyUnicode_IS_COMPACT_ASCII(self))
14325 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14326 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014327 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014328 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014329 else {
14330 /* If it is a two-block object, account for base object, and
14331 for character block if present. */
14332 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014333 if (_PyUnicode_DATA_ANY(self))
14334 size += (PyUnicode_GET_LENGTH(self) + 1) *
14335 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014336 }
14337 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014338 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014339 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14340 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14341 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14342 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014343
14344 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014345}
14346
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014347static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014348unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014349{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014350 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014351 if (!copy)
14352 return NULL;
14353 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014354}
14355
Guido van Rossumd57fd912000-03-10 22:53:23 +000014356static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014357 UNICODE_ENCODE_METHODDEF
14358 UNICODE_REPLACE_METHODDEF
14359 UNICODE_SPLIT_METHODDEF
14360 UNICODE_RSPLIT_METHODDEF
14361 UNICODE_JOIN_METHODDEF
14362 UNICODE_CAPITALIZE_METHODDEF
14363 UNICODE_CASEFOLD_METHODDEF
14364 UNICODE_TITLE_METHODDEF
14365 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014366 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014367 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014368 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014369 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014370 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014371 UNICODE_LJUST_METHODDEF
14372 UNICODE_LOWER_METHODDEF
14373 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014374 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14375 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014376 UNICODE_RJUST_METHODDEF
14377 UNICODE_RSTRIP_METHODDEF
14378 UNICODE_RPARTITION_METHODDEF
14379 UNICODE_SPLITLINES_METHODDEF
14380 UNICODE_STRIP_METHODDEF
14381 UNICODE_SWAPCASE_METHODDEF
14382 UNICODE_TRANSLATE_METHODDEF
14383 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014384 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14385 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014386 UNICODE_REMOVEPREFIX_METHODDEF
14387 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014388 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014389 UNICODE_ISLOWER_METHODDEF
14390 UNICODE_ISUPPER_METHODDEF
14391 UNICODE_ISTITLE_METHODDEF
14392 UNICODE_ISSPACE_METHODDEF
14393 UNICODE_ISDECIMAL_METHODDEF
14394 UNICODE_ISDIGIT_METHODDEF
14395 UNICODE_ISNUMERIC_METHODDEF
14396 UNICODE_ISALPHA_METHODDEF
14397 UNICODE_ISALNUM_METHODDEF
14398 UNICODE_ISIDENTIFIER_METHODDEF
14399 UNICODE_ISPRINTABLE_METHODDEF
14400 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014401 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014402 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014403 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014404 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014405 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014406#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014407 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014408 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014409#endif
14410
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014411 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014412 {NULL, NULL}
14413};
14414
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014415static PyObject *
14416unicode_mod(PyObject *v, PyObject *w)
14417{
Brian Curtindfc80e32011-08-10 20:28:54 -050014418 if (!PyUnicode_Check(v))
14419 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014420 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014421}
14422
14423static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014424 0, /*nb_add*/
14425 0, /*nb_subtract*/
14426 0, /*nb_multiply*/
14427 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014428};
14429
Guido van Rossumd57fd912000-03-10 22:53:23 +000014430static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014431 (lenfunc) unicode_length, /* sq_length */
14432 PyUnicode_Concat, /* sq_concat */
14433 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14434 (ssizeargfunc) unicode_getitem, /* sq_item */
14435 0, /* sq_slice */
14436 0, /* sq_ass_item */
14437 0, /* sq_ass_slice */
14438 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014439};
14440
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014441static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014442unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014443{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014444 if (PyUnicode_READY(self) == -1)
14445 return NULL;
14446
Victor Stinnera15e2602020-04-08 02:01:56 +020014447 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014448 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014449 if (i == -1 && PyErr_Occurred())
14450 return NULL;
14451 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014452 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014453 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014454 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014455 Py_ssize_t start, stop, step, slicelength, i;
14456 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014457 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014458 const void *src_data;
14459 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014460 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014461 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014462
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014463 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014464 return NULL;
14465 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014466 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14467 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014468
14469 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014470 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014471 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014472 slicelength == PyUnicode_GET_LENGTH(self)) {
14473 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014474 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014475 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014476 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014477 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014478 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014479 src_kind = PyUnicode_KIND(self);
14480 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014481 if (!PyUnicode_IS_ASCII(self)) {
14482 kind_limit = kind_maxchar_limit(src_kind);
14483 max_char = 0;
14484 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14485 ch = PyUnicode_READ(src_kind, src_data, cur);
14486 if (ch > max_char) {
14487 max_char = ch;
14488 if (max_char >= kind_limit)
14489 break;
14490 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014491 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014492 }
Victor Stinner55c99112011-10-13 01:17:06 +020014493 else
14494 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014495 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014496 if (result == NULL)
14497 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014498 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014499 dest_data = PyUnicode_DATA(result);
14500
14501 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014502 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14503 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014504 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014505 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014506 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014507 } else {
14508 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14509 return NULL;
14510 }
14511}
14512
14513static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014514 (lenfunc)unicode_length, /* mp_length */
14515 (binaryfunc)unicode_subscript, /* mp_subscript */
14516 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014517};
14518
Guido van Rossumd57fd912000-03-10 22:53:23 +000014519
Guido van Rossumd57fd912000-03-10 22:53:23 +000014520/* Helpers for PyUnicode_Format() */
14521
Victor Stinnera47082312012-10-04 02:19:54 +020014522struct unicode_formatter_t {
14523 PyObject *args;
14524 int args_owned;
14525 Py_ssize_t arglen, argidx;
14526 PyObject *dict;
14527
14528 enum PyUnicode_Kind fmtkind;
14529 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014530 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014531 PyObject *fmtstr;
14532
14533 _PyUnicodeWriter writer;
14534};
14535
14536struct unicode_format_arg_t {
14537 Py_UCS4 ch;
14538 int flags;
14539 Py_ssize_t width;
14540 int prec;
14541 int sign;
14542};
14543
Guido van Rossumd57fd912000-03-10 22:53:23 +000014544static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014545unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014546{
Victor Stinnera47082312012-10-04 02:19:54 +020014547 Py_ssize_t argidx = ctx->argidx;
14548
14549 if (argidx < ctx->arglen) {
14550 ctx->argidx++;
14551 if (ctx->arglen < 0)
14552 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014553 else
Victor Stinnera47082312012-10-04 02:19:54 +020014554 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014555 }
14556 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014557 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014558 return NULL;
14559}
14560
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014561/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014562
Victor Stinnera47082312012-10-04 02:19:54 +020014563/* Format a float into the writer if the writer is not NULL, or into *p_output
14564 otherwise.
14565
14566 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014567static int
Victor Stinnera47082312012-10-04 02:19:54 +020014568formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14569 PyObject **p_output,
14570 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014571{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014572 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014573 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014574 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014575 int prec;
14576 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014577
Guido van Rossumd57fd912000-03-10 22:53:23 +000014578 x = PyFloat_AsDouble(v);
14579 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014580 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014581
Victor Stinnera47082312012-10-04 02:19:54 +020014582 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014583 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014584 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014585
Victor Stinnera47082312012-10-04 02:19:54 +020014586 if (arg->flags & F_ALT)
14587 dtoa_flags = Py_DTSF_ALT;
14588 else
14589 dtoa_flags = 0;
14590 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014591 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014592 return -1;
14593 len = strlen(p);
14594 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014595 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014596 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014597 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014598 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014599 }
14600 else
14601 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014602 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014603 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014604}
14605
Victor Stinnerd0880d52012-04-27 23:40:13 +020014606/* formatlong() emulates the format codes d, u, o, x and X, and
14607 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14608 * Python's regular ints.
14609 * Return value: a new PyUnicodeObject*, or NULL if error.
14610 * The output string is of the form
14611 * "-"? ("0x" | "0X")? digit+
14612 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14613 * set in flags. The case of hex digits will be correct,
14614 * There will be at least prec digits, zero-filled on the left if
14615 * necessary to get that many.
14616 * val object to be converted
14617 * flags bitmask of format flags; only F_ALT is looked at
14618 * prec minimum number of digits; 0-fill on left if needed
14619 * type a character in [duoxX]; u acts the same as d
14620 *
14621 * CAUTION: o, x and X conversions on regular ints can never
14622 * produce a '-' sign, but can for Python's unbounded ints.
14623 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014624PyObject *
14625_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014626{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014627 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014628 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014629 Py_ssize_t i;
14630 int sign; /* 1 if '-', else 0 */
14631 int len; /* number of characters */
14632 Py_ssize_t llen;
14633 int numdigits; /* len == numnondigits + numdigits */
14634 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014635
Victor Stinnerd0880d52012-04-27 23:40:13 +020014636 /* Avoid exceeding SSIZE_T_MAX */
14637 if (prec > INT_MAX-3) {
14638 PyErr_SetString(PyExc_OverflowError,
14639 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014640 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014641 }
14642
14643 assert(PyLong_Check(val));
14644
14645 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014646 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014647 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014648 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014649 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014650 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014651 /* int and int subclasses should print numerically when a numeric */
14652 /* format code is used (see issue18780) */
14653 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014654 break;
14655 case 'o':
14656 numnondigits = 2;
14657 result = PyNumber_ToBase(val, 8);
14658 break;
14659 case 'x':
14660 case 'X':
14661 numnondigits = 2;
14662 result = PyNumber_ToBase(val, 16);
14663 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014664 }
14665 if (!result)
14666 return NULL;
14667
14668 assert(unicode_modifiable(result));
14669 assert(PyUnicode_IS_READY(result));
14670 assert(PyUnicode_IS_ASCII(result));
14671
14672 /* To modify the string in-place, there can only be one reference. */
14673 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014674 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014675 PyErr_BadInternalCall();
14676 return NULL;
14677 }
14678 buf = PyUnicode_DATA(result);
14679 llen = PyUnicode_GET_LENGTH(result);
14680 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014681 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014682 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014683 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014684 return NULL;
14685 }
14686 len = (int)llen;
14687 sign = buf[0] == '-';
14688 numnondigits += sign;
14689 numdigits = len - numnondigits;
14690 assert(numdigits > 0);
14691
14692 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014693 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014694 (type == 'o' || type == 'x' || type == 'X'))) {
14695 assert(buf[sign] == '0');
14696 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14697 buf[sign+1] == 'o');
14698 numnondigits -= 2;
14699 buf += 2;
14700 len -= 2;
14701 if (sign)
14702 buf[0] = '-';
14703 assert(len == numnondigits + numdigits);
14704 assert(numdigits > 0);
14705 }
14706
14707 /* Fill with leading zeroes to meet minimum width. */
14708 if (prec > numdigits) {
14709 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14710 numnondigits + prec);
14711 char *b1;
14712 if (!r1) {
14713 Py_DECREF(result);
14714 return NULL;
14715 }
14716 b1 = PyBytes_AS_STRING(r1);
14717 for (i = 0; i < numnondigits; ++i)
14718 *b1++ = *buf++;
14719 for (i = 0; i < prec - numdigits; i++)
14720 *b1++ = '0';
14721 for (i = 0; i < numdigits; i++)
14722 *b1++ = *buf++;
14723 *b1 = '\0';
14724 Py_DECREF(result);
14725 result = r1;
14726 buf = PyBytes_AS_STRING(result);
14727 len = numnondigits + prec;
14728 }
14729
14730 /* Fix up case for hex conversions. */
14731 if (type == 'X') {
14732 /* Need to convert all lower case letters to upper case.
14733 and need to convert 0x to 0X (and -0x to -0X). */
14734 for (i = 0; i < len; i++)
14735 if (buf[i] >= 'a' && buf[i] <= 'x')
14736 buf[i] -= 'a'-'A';
14737 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014738 if (!PyUnicode_Check(result)
14739 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014740 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014741 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014742 Py_DECREF(result);
14743 result = unicode;
14744 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014745 else if (len != PyUnicode_GET_LENGTH(result)) {
14746 if (PyUnicode_Resize(&result, len) < 0)
14747 Py_CLEAR(result);
14748 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014749 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014750}
14751
Ethan Furmandf3ed242014-01-05 06:50:30 -080014752/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014753 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014754 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014755 * -1 and raise an exception on error */
14756static int
Victor Stinnera47082312012-10-04 02:19:54 +020014757mainformatlong(PyObject *v,
14758 struct unicode_format_arg_t *arg,
14759 PyObject **p_output,
14760 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014761{
14762 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014763 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014764
14765 if (!PyNumber_Check(v))
14766 goto wrongtype;
14767
Ethan Furman9ab74802014-03-21 06:38:46 -070014768 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014769 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014770 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014771 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014772 }
14773 else {
14774 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014775 }
14776 if (iobj == NULL ) {
14777 if (PyErr_ExceptionMatches(PyExc_TypeError))
14778 goto wrongtype;
14779 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014780 }
14781 assert(PyLong_Check(iobj));
14782 }
14783 else {
14784 iobj = v;
14785 Py_INCREF(iobj);
14786 }
14787
14788 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014789 && arg->width == -1 && arg->prec == -1
14790 && !(arg->flags & (F_SIGN | F_BLANK))
14791 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014792 {
14793 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014794 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014795 int base;
14796
Victor Stinnera47082312012-10-04 02:19:54 +020014797 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014798 {
14799 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014800 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014801 case 'd':
14802 case 'i':
14803 case 'u':
14804 base = 10;
14805 break;
14806 case 'o':
14807 base = 8;
14808 break;
14809 case 'x':
14810 case 'X':
14811 base = 16;
14812 break;
14813 }
14814
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014815 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14816 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014817 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014818 }
14819 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014820 return 1;
14821 }
14822
Ethan Furmanb95b5612015-01-23 20:05:18 -080014823 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014824 Py_DECREF(iobj);
14825 if (res == NULL)
14826 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014827 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014828 return 0;
14829
14830wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014831 switch(type)
14832 {
14833 case 'o':
14834 case 'x':
14835 case 'X':
14836 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014837 "%%%c format: an integer is required, "
14838 "not %.200s",
14839 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014840 break;
14841 default:
14842 PyErr_Format(PyExc_TypeError,
Serhiy Storchakae2ec0b22020-10-09 14:14:37 +030014843 "%%%c format: a real number is required, "
Victor Stinner998b8062018-09-12 00:23:25 +020014844 "not %.200s",
14845 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014846 break;
14847 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014848 return -1;
14849}
14850
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014851static Py_UCS4
14852formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014853{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014854 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014855 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014856 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014857 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014858 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014859 goto onError;
14860 }
14861 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014862 int overflow;
14863 long x = PyLong_AsLongAndOverflow(v, &overflow);
14864 if (x == -1 && PyErr_Occurred()) {
14865 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014866 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014867 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014868 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014869 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014870
Victor Stinner8faf8212011-12-08 22:14:11 +010014871 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014872 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014873 PyErr_SetString(PyExc_OverflowError,
14874 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014875 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014876 }
14877
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014878 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014879 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014880
Benjamin Peterson29060642009-01-31 22:14:21 +000014881 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014882 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014883 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014884 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014885}
14886
Victor Stinnera47082312012-10-04 02:19:54 +020014887/* Parse options of an argument: flags, width, precision.
14888 Handle also "%(name)" syntax.
14889
14890 Return 0 if the argument has been formatted into arg->str.
14891 Return 1 if the argument has been written into ctx->writer,
14892 Raise an exception and return -1 on error. */
14893static int
14894unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14895 struct unicode_format_arg_t *arg)
14896{
14897#define FORMAT_READ(ctx) \
14898 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14899
14900 PyObject *v;
14901
Victor Stinnera47082312012-10-04 02:19:54 +020014902 if (arg->ch == '(') {
14903 /* Get argument value from a dictionary. Example: "%(name)s". */
14904 Py_ssize_t keystart;
14905 Py_ssize_t keylen;
14906 PyObject *key;
14907 int pcount = 1;
14908
14909 if (ctx->dict == NULL) {
14910 PyErr_SetString(PyExc_TypeError,
14911 "format requires a mapping");
14912 return -1;
14913 }
14914 ++ctx->fmtpos;
14915 --ctx->fmtcnt;
14916 keystart = ctx->fmtpos;
14917 /* Skip over balanced parentheses */
14918 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14919 arg->ch = FORMAT_READ(ctx);
14920 if (arg->ch == ')')
14921 --pcount;
14922 else if (arg->ch == '(')
14923 ++pcount;
14924 ctx->fmtpos++;
14925 }
14926 keylen = ctx->fmtpos - keystart - 1;
14927 if (ctx->fmtcnt < 0 || pcount > 0) {
14928 PyErr_SetString(PyExc_ValueError,
14929 "incomplete format key");
14930 return -1;
14931 }
14932 key = PyUnicode_Substring(ctx->fmtstr,
14933 keystart, keystart + keylen);
14934 if (key == NULL)
14935 return -1;
14936 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014937 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014938 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014939 }
14940 ctx->args = PyObject_GetItem(ctx->dict, key);
14941 Py_DECREF(key);
14942 if (ctx->args == NULL)
14943 return -1;
14944 ctx->args_owned = 1;
14945 ctx->arglen = -1;
14946 ctx->argidx = -2;
14947 }
14948
14949 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014950 while (--ctx->fmtcnt >= 0) {
14951 arg->ch = FORMAT_READ(ctx);
14952 ctx->fmtpos++;
14953 switch (arg->ch) {
14954 case '-': arg->flags |= F_LJUST; continue;
14955 case '+': arg->flags |= F_SIGN; continue;
14956 case ' ': arg->flags |= F_BLANK; continue;
14957 case '#': arg->flags |= F_ALT; continue;
14958 case '0': arg->flags |= F_ZERO; continue;
14959 }
14960 break;
14961 }
14962
14963 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014964 if (arg->ch == '*') {
14965 v = unicode_format_getnextarg(ctx);
14966 if (v == NULL)
14967 return -1;
14968 if (!PyLong_Check(v)) {
14969 PyErr_SetString(PyExc_TypeError,
14970 "* wants int");
14971 return -1;
14972 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014973 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014974 if (arg->width == -1 && PyErr_Occurred())
14975 return -1;
14976 if (arg->width < 0) {
14977 arg->flags |= F_LJUST;
14978 arg->width = -arg->width;
14979 }
14980 if (--ctx->fmtcnt >= 0) {
14981 arg->ch = FORMAT_READ(ctx);
14982 ctx->fmtpos++;
14983 }
14984 }
14985 else if (arg->ch >= '0' && arg->ch <= '9') {
14986 arg->width = arg->ch - '0';
14987 while (--ctx->fmtcnt >= 0) {
14988 arg->ch = FORMAT_READ(ctx);
14989 ctx->fmtpos++;
14990 if (arg->ch < '0' || arg->ch > '9')
14991 break;
14992 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14993 mixing signed and unsigned comparison. Since arg->ch is between
14994 '0' and '9', casting to int is safe. */
14995 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14996 PyErr_SetString(PyExc_ValueError,
14997 "width too big");
14998 return -1;
14999 }
15000 arg->width = arg->width*10 + (arg->ch - '0');
15001 }
15002 }
15003
15004 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020015005 if (arg->ch == '.') {
15006 arg->prec = 0;
15007 if (--ctx->fmtcnt >= 0) {
15008 arg->ch = FORMAT_READ(ctx);
15009 ctx->fmtpos++;
15010 }
15011 if (arg->ch == '*') {
15012 v = unicode_format_getnextarg(ctx);
15013 if (v == NULL)
15014 return -1;
15015 if (!PyLong_Check(v)) {
15016 PyErr_SetString(PyExc_TypeError,
15017 "* wants int");
15018 return -1;
15019 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015020 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015021 if (arg->prec == -1 && PyErr_Occurred())
15022 return -1;
15023 if (arg->prec < 0)
15024 arg->prec = 0;
15025 if (--ctx->fmtcnt >= 0) {
15026 arg->ch = FORMAT_READ(ctx);
15027 ctx->fmtpos++;
15028 }
15029 }
15030 else if (arg->ch >= '0' && arg->ch <= '9') {
15031 arg->prec = arg->ch - '0';
15032 while (--ctx->fmtcnt >= 0) {
15033 arg->ch = FORMAT_READ(ctx);
15034 ctx->fmtpos++;
15035 if (arg->ch < '0' || arg->ch > '9')
15036 break;
15037 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15038 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015039 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015040 return -1;
15041 }
15042 arg->prec = arg->prec*10 + (arg->ch - '0');
15043 }
15044 }
15045 }
15046
15047 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15048 if (ctx->fmtcnt >= 0) {
15049 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15050 if (--ctx->fmtcnt >= 0) {
15051 arg->ch = FORMAT_READ(ctx);
15052 ctx->fmtpos++;
15053 }
15054 }
15055 }
15056 if (ctx->fmtcnt < 0) {
15057 PyErr_SetString(PyExc_ValueError,
15058 "incomplete format");
15059 return -1;
15060 }
15061 return 0;
15062
15063#undef FORMAT_READ
15064}
15065
15066/* Format one argument. Supported conversion specifiers:
15067
15068 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015069 - "i", "d", "u": int or float
15070 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015071 - "e", "E", "f", "F", "g", "G": float
15072 - "c": int or str (1 character)
15073
Victor Stinner8dbd4212012-12-04 09:30:24 +010015074 When possible, the output is written directly into the Unicode writer
15075 (ctx->writer). A string is created when padding is required.
15076
Victor Stinnera47082312012-10-04 02:19:54 +020015077 Return 0 if the argument has been formatted into *p_str,
15078 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015079 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015080static int
15081unicode_format_arg_format(struct unicode_formatter_t *ctx,
15082 struct unicode_format_arg_t *arg,
15083 PyObject **p_str)
15084{
15085 PyObject *v;
15086 _PyUnicodeWriter *writer = &ctx->writer;
15087
15088 if (ctx->fmtcnt == 0)
15089 ctx->writer.overallocate = 0;
15090
Victor Stinnera47082312012-10-04 02:19:54 +020015091 v = unicode_format_getnextarg(ctx);
15092 if (v == NULL)
15093 return -1;
15094
Victor Stinnera47082312012-10-04 02:19:54 +020015095
15096 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015097 case 's':
15098 case 'r':
15099 case 'a':
15100 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15101 /* Fast path */
15102 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15103 return -1;
15104 return 1;
15105 }
15106
15107 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15108 *p_str = v;
15109 Py_INCREF(*p_str);
15110 }
15111 else {
15112 if (arg->ch == 's')
15113 *p_str = PyObject_Str(v);
15114 else if (arg->ch == 'r')
15115 *p_str = PyObject_Repr(v);
15116 else
15117 *p_str = PyObject_ASCII(v);
15118 }
15119 break;
15120
15121 case 'i':
15122 case 'd':
15123 case 'u':
15124 case 'o':
15125 case 'x':
15126 case 'X':
15127 {
15128 int ret = mainformatlong(v, arg, p_str, writer);
15129 if (ret != 0)
15130 return ret;
15131 arg->sign = 1;
15132 break;
15133 }
15134
15135 case 'e':
15136 case 'E':
15137 case 'f':
15138 case 'F':
15139 case 'g':
15140 case 'G':
15141 if (arg->width == -1 && arg->prec == -1
15142 && !(arg->flags & (F_SIGN | F_BLANK)))
15143 {
15144 /* Fast path */
15145 if (formatfloat(v, arg, NULL, writer) == -1)
15146 return -1;
15147 return 1;
15148 }
15149
15150 arg->sign = 1;
15151 if (formatfloat(v, arg, p_str, NULL) == -1)
15152 return -1;
15153 break;
15154
15155 case 'c':
15156 {
15157 Py_UCS4 ch = formatchar(v);
15158 if (ch == (Py_UCS4) -1)
15159 return -1;
15160 if (arg->width == -1 && arg->prec == -1) {
15161 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015162 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015163 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015164 return 1;
15165 }
15166 *p_str = PyUnicode_FromOrdinal(ch);
15167 break;
15168 }
15169
15170 default:
15171 PyErr_Format(PyExc_ValueError,
15172 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015173 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015174 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15175 (int)arg->ch,
15176 ctx->fmtpos - 1);
15177 return -1;
15178 }
15179 if (*p_str == NULL)
15180 return -1;
15181 assert (PyUnicode_Check(*p_str));
15182 return 0;
15183}
15184
15185static int
15186unicode_format_arg_output(struct unicode_formatter_t *ctx,
15187 struct unicode_format_arg_t *arg,
15188 PyObject *str)
15189{
15190 Py_ssize_t len;
15191 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015192 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015193 Py_ssize_t pindex;
15194 Py_UCS4 signchar;
15195 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015196 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015197 Py_ssize_t sublen;
15198 _PyUnicodeWriter *writer = &ctx->writer;
15199 Py_UCS4 fill;
15200
15201 fill = ' ';
15202 if (arg->sign && arg->flags & F_ZERO)
15203 fill = '0';
15204
15205 if (PyUnicode_READY(str) == -1)
15206 return -1;
15207
15208 len = PyUnicode_GET_LENGTH(str);
15209 if ((arg->width == -1 || arg->width <= len)
15210 && (arg->prec == -1 || arg->prec >= len)
15211 && !(arg->flags & (F_SIGN | F_BLANK)))
15212 {
15213 /* Fast path */
15214 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15215 return -1;
15216 return 0;
15217 }
15218
15219 /* Truncate the string for "s", "r" and "a" formats
15220 if the precision is set */
15221 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15222 if (arg->prec >= 0 && len > arg->prec)
15223 len = arg->prec;
15224 }
15225
15226 /* Adjust sign and width */
15227 kind = PyUnicode_KIND(str);
15228 pbuf = PyUnicode_DATA(str);
15229 pindex = 0;
15230 signchar = '\0';
15231 if (arg->sign) {
15232 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15233 if (ch == '-' || ch == '+') {
15234 signchar = ch;
15235 len--;
15236 pindex++;
15237 }
15238 else if (arg->flags & F_SIGN)
15239 signchar = '+';
15240 else if (arg->flags & F_BLANK)
15241 signchar = ' ';
15242 else
15243 arg->sign = 0;
15244 }
15245 if (arg->width < len)
15246 arg->width = len;
15247
15248 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015249 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015250 if (!(arg->flags & F_LJUST)) {
15251 if (arg->sign) {
15252 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015253 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015254 }
15255 else {
15256 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015257 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015258 }
15259 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015260 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15261 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015262 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015263 }
15264
Victor Stinnera47082312012-10-04 02:19:54 +020015265 buflen = arg->width;
15266 if (arg->sign && len == arg->width)
15267 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015268 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015269 return -1;
15270
15271 /* Write the sign if needed */
15272 if (arg->sign) {
15273 if (fill != ' ') {
15274 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15275 writer->pos += 1;
15276 }
15277 if (arg->width > len)
15278 arg->width--;
15279 }
15280
15281 /* Write the numeric prefix for "x", "X" and "o" formats
15282 if the alternate form is used.
15283 For example, write "0x" for the "%#x" format. */
15284 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15285 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15286 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15287 if (fill != ' ') {
15288 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15289 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15290 writer->pos += 2;
15291 pindex += 2;
15292 }
15293 arg->width -= 2;
15294 if (arg->width < 0)
15295 arg->width = 0;
15296 len -= 2;
15297 }
15298
15299 /* Pad left with the fill character if needed */
15300 if (arg->width > len && !(arg->flags & F_LJUST)) {
15301 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015302 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015303 writer->pos += sublen;
15304 arg->width = len;
15305 }
15306
15307 /* If padding with spaces: write sign if needed and/or numeric prefix if
15308 the alternate form is used */
15309 if (fill == ' ') {
15310 if (arg->sign) {
15311 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15312 writer->pos += 1;
15313 }
15314 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15315 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15316 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15317 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15318 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15319 writer->pos += 2;
15320 pindex += 2;
15321 }
15322 }
15323
15324 /* Write characters */
15325 if (len) {
15326 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15327 str, pindex, len);
15328 writer->pos += len;
15329 }
15330
15331 /* Pad right with the fill character if needed */
15332 if (arg->width > len) {
15333 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015334 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015335 writer->pos += sublen;
15336 }
15337 return 0;
15338}
15339
15340/* Helper of PyUnicode_Format(): format one arg.
15341 Return 0 on success, raise an exception and return -1 on error. */
15342static int
15343unicode_format_arg(struct unicode_formatter_t *ctx)
15344{
15345 struct unicode_format_arg_t arg;
15346 PyObject *str;
15347 int ret;
15348
Victor Stinner8dbd4212012-12-04 09:30:24 +010015349 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015350 if (arg.ch == '%') {
15351 ctx->fmtpos++;
15352 ctx->fmtcnt--;
15353 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15354 return -1;
15355 return 0;
15356 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015357 arg.flags = 0;
15358 arg.width = -1;
15359 arg.prec = -1;
15360 arg.sign = 0;
15361 str = NULL;
15362
Victor Stinnera47082312012-10-04 02:19:54 +020015363 ret = unicode_format_arg_parse(ctx, &arg);
15364 if (ret == -1)
15365 return -1;
15366
15367 ret = unicode_format_arg_format(ctx, &arg, &str);
15368 if (ret == -1)
15369 return -1;
15370
15371 if (ret != 1) {
15372 ret = unicode_format_arg_output(ctx, &arg, str);
15373 Py_DECREF(str);
15374 if (ret == -1)
15375 return -1;
15376 }
15377
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015378 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015379 PyErr_SetString(PyExc_TypeError,
15380 "not all arguments converted during string formatting");
15381 return -1;
15382 }
15383 return 0;
15384}
15385
Alexander Belopolsky40018472011-02-26 01:02:56 +000015386PyObject *
15387PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015388{
Victor Stinnera47082312012-10-04 02:19:54 +020015389 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015390
Guido van Rossumd57fd912000-03-10 22:53:23 +000015391 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015392 PyErr_BadInternalCall();
15393 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015394 }
Victor Stinnera47082312012-10-04 02:19:54 +020015395
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015396 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015397 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015398
15399 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015400 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15401 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15402 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15403 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015404
Victor Stinner8f674cc2013-04-17 23:02:17 +020015405 _PyUnicodeWriter_Init(&ctx.writer);
15406 ctx.writer.min_length = ctx.fmtcnt + 100;
15407 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015408
Guido van Rossumd57fd912000-03-10 22:53:23 +000015409 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015410 ctx.arglen = PyTuple_Size(args);
15411 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015412 }
15413 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015414 ctx.arglen = -1;
15415 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015416 }
Victor Stinnera47082312012-10-04 02:19:54 +020015417 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015418 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015419 ctx.dict = args;
15420 else
15421 ctx.dict = NULL;
15422 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015423
Victor Stinnera47082312012-10-04 02:19:54 +020015424 while (--ctx.fmtcnt >= 0) {
15425 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015426 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015427
15428 nonfmtpos = ctx.fmtpos++;
15429 while (ctx.fmtcnt >= 0 &&
15430 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15431 ctx.fmtpos++;
15432 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015433 }
Victor Stinnera47082312012-10-04 02:19:54 +020015434 if (ctx.fmtcnt < 0) {
15435 ctx.fmtpos--;
15436 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015437 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015438
Victor Stinnercfc4c132013-04-03 01:48:39 +020015439 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15440 nonfmtpos, ctx.fmtpos) < 0)
15441 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015442 }
15443 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015444 ctx.fmtpos++;
15445 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015446 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015447 }
15448 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015449
Victor Stinnera47082312012-10-04 02:19:54 +020015450 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015451 PyErr_SetString(PyExc_TypeError,
15452 "not all arguments converted during string formatting");
15453 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015454 }
15455
Victor Stinnera47082312012-10-04 02:19:54 +020015456 if (ctx.args_owned) {
15457 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015458 }
Victor Stinnera47082312012-10-04 02:19:54 +020015459 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015460
Benjamin Peterson29060642009-01-31 22:14:21 +000015461 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015462 _PyUnicodeWriter_Dealloc(&ctx.writer);
15463 if (ctx.args_owned) {
15464 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015465 }
15466 return NULL;
15467}
15468
Jeremy Hylton938ace62002-07-17 16:30:39 +000015469static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015470unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15471
15472/*[clinic input]
15473@classmethod
15474str.__new__ as unicode_new
15475
15476 object as x: object = NULL
15477 encoding: str = NULL
15478 errors: str = NULL
15479
15480[clinic start generated code]*/
Guido van Rossume023fe02001-08-30 03:12:59 +000015481
Tim Peters6d6c1a32001-08-02 04:15:00 +000015482static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015483unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15484 const char *errors)
15485/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
Tim Peters6d6c1a32001-08-02 04:15:00 +000015486{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015487 PyObject *unicode;
15488 if (x == NULL) {
15489 unicode = unicode_new_empty();
15490 }
15491 else if (encoding == NULL && errors == NULL) {
15492 unicode = PyObject_Str(x);
15493 }
15494 else {
15495 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15496 }
Tim Peters6d6c1a32001-08-02 04:15:00 +000015497
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015498 if (unicode != NULL && type != &PyUnicode_Type) {
15499 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15500 }
15501 return unicode;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015502}
15503
Guido van Rossume023fe02001-08-30 03:12:59 +000015504static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015505unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
Guido van Rossume023fe02001-08-30 03:12:59 +000015506{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015507 PyObject *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015508 Py_ssize_t length, char_size;
15509 int share_wstr, share_utf8;
15510 unsigned int kind;
15511 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015512
Benjamin Peterson14339b62009-01-31 16:36:08 +000015513 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner910337b2011-10-03 03:20:16 +020015514 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015515 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015516 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015517 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015518
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015519 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015520 if (self == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015521 return NULL;
15522 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015523 kind = PyUnicode_KIND(unicode);
15524 length = PyUnicode_GET_LENGTH(unicode);
15525
15526 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015527#ifdef Py_DEBUG
15528 _PyUnicode_HASH(self) = -1;
15529#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015530 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015531#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015532 _PyUnicode_STATE(self).interned = 0;
15533 _PyUnicode_STATE(self).kind = kind;
15534 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015535 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015536 _PyUnicode_STATE(self).ready = 1;
15537 _PyUnicode_WSTR(self) = NULL;
15538 _PyUnicode_UTF8_LENGTH(self) = 0;
15539 _PyUnicode_UTF8(self) = NULL;
15540 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015541 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015542
15543 share_utf8 = 0;
15544 share_wstr = 0;
15545 if (kind == PyUnicode_1BYTE_KIND) {
15546 char_size = 1;
15547 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15548 share_utf8 = 1;
15549 }
15550 else if (kind == PyUnicode_2BYTE_KIND) {
15551 char_size = 2;
15552 if (sizeof(wchar_t) == 2)
15553 share_wstr = 1;
15554 }
15555 else {
15556 assert(kind == PyUnicode_4BYTE_KIND);
15557 char_size = 4;
15558 if (sizeof(wchar_t) == 4)
15559 share_wstr = 1;
15560 }
15561
15562 /* Ensure we won't overflow the length. */
15563 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15564 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015565 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015566 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015567 data = PyObject_MALLOC((length + 1) * char_size);
15568 if (data == NULL) {
15569 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015570 goto onError;
15571 }
15572
Victor Stinnerc3c74152011-10-02 20:39:55 +020015573 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015574 if (share_utf8) {
15575 _PyUnicode_UTF8_LENGTH(self) = length;
15576 _PyUnicode_UTF8(self) = data;
15577 }
15578 if (share_wstr) {
15579 _PyUnicode_WSTR_LENGTH(self) = length;
15580 _PyUnicode_WSTR(self) = (wchar_t *)data;
15581 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015582
Christian Heimesf051e432016-09-13 20:22:02 +020015583 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015584 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015585 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015586#ifdef Py_DEBUG
15587 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15588#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +010015589 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015590
15591onError:
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015592 Py_DECREF(self);
15593 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015594}
15595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015596PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015597"str(object='') -> str\n\
15598str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015599\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015600Create a new string object from the given object. If encoding or\n\
15601errors is specified, then the object must expose a data buffer\n\
15602that will be decoded using the given encoding and error handler.\n\
15603Otherwise, returns the result of object.__str__() (if defined)\n\
15604or repr(object).\n\
15605encoding defaults to sys.getdefaultencoding().\n\
15606errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015607
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015608static PyObject *unicode_iter(PyObject *seq);
15609
Guido van Rossumd57fd912000-03-10 22:53:23 +000015610PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015611 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015612 "str", /* tp_name */
15613 sizeof(PyUnicodeObject), /* tp_basicsize */
15614 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015615 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015616 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015617 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015618 0, /* tp_getattr */
15619 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015620 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015621 unicode_repr, /* tp_repr */
15622 &unicode_as_number, /* tp_as_number */
15623 &unicode_as_sequence, /* tp_as_sequence */
15624 &unicode_as_mapping, /* tp_as_mapping */
15625 (hashfunc) unicode_hash, /* tp_hash*/
15626 0, /* tp_call*/
15627 (reprfunc) unicode_str, /* tp_str */
15628 PyObject_GenericGetAttr, /* tp_getattro */
15629 0, /* tp_setattro */
15630 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015631 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015632 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15633 unicode_doc, /* tp_doc */
15634 0, /* tp_traverse */
15635 0, /* tp_clear */
15636 PyUnicode_RichCompare, /* tp_richcompare */
15637 0, /* tp_weaklistoffset */
15638 unicode_iter, /* tp_iter */
15639 0, /* tp_iternext */
15640 unicode_methods, /* tp_methods */
15641 0, /* tp_members */
15642 0, /* tp_getset */
15643 &PyBaseObject_Type, /* tp_base */
15644 0, /* tp_dict */
15645 0, /* tp_descr_get */
15646 0, /* tp_descr_set */
15647 0, /* tp_dictoffset */
15648 0, /* tp_init */
15649 0, /* tp_alloc */
15650 unicode_new, /* tp_new */
15651 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015652};
15653
15654/* Initialize the Unicode implementation */
15655
Victor Stinner331a6a52019-05-27 16:39:22 +020015656PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015657_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015658{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015659 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015660 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015661 0x000A, /* LINE FEED */
15662 0x000D, /* CARRIAGE RETURN */
15663 0x001C, /* FILE SEPARATOR */
15664 0x001D, /* GROUP SEPARATOR */
15665 0x001E, /* RECORD SEPARATOR */
15666 0x0085, /* NEXT LINE */
15667 0x2028, /* LINE SEPARATOR */
15668 0x2029, /* PARAGRAPH SEPARATOR */
15669 };
15670
Victor Stinner91698d82020-06-25 14:07:40 +020015671 struct _Py_unicode_state *state = &tstate->interp->unicode;
15672 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015673 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015674 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015675
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015676 if (_Py_IsMainInterpreter(tstate)) {
15677 /* initialize the linebreak bloom filter */
15678 bloom_linebreak = make_bloom_mask(
15679 PyUnicode_2BYTE_KIND, linebreak,
15680 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015681
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015682 if (PyType_Ready(&PyUnicode_Type) < 0) {
15683 return _PyStatus_ERR("Can't initialize unicode type");
15684 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015685
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015686 if (PyType_Ready(&EncodingMapType) < 0) {
15687 return _PyStatus_ERR("Can't initialize encoding map type");
15688 }
15689 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15690 return _PyStatus_ERR("Can't initialize field name iterator type");
15691 }
15692 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15693 return _PyStatus_ERR("Can't initialize formatter iter type");
15694 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015695 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015696 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015697}
15698
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015699
Walter Dörwald16807132007-05-25 13:52:07 +000015700void
15701PyUnicode_InternInPlace(PyObject **p)
15702{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015703 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015704#ifdef Py_DEBUG
15705 assert(s != NULL);
15706 assert(_PyUnicode_CHECK(s));
15707#else
Victor Stinner607b1022020-05-05 18:50:30 +020015708 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015709 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015710 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015711#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015712
Benjamin Peterson14339b62009-01-31 16:36:08 +000015713 /* If it's a subclass, we don't really know what putting
15714 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015715 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015716 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015717 }
15718
15719 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015720 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015721 }
15722
15723#ifdef INTERNED_STRINGS
Victor Stinner666ecfb2020-07-02 01:19:57 +020015724 if (PyUnicode_READY(s) == -1) {
15725 PyErr_Clear();
15726 return;
15727 }
15728
Benjamin Peterson14339b62009-01-31 16:36:08 +000015729 if (interned == NULL) {
15730 interned = PyDict_New();
15731 if (interned == NULL) {
15732 PyErr_Clear(); /* Don't leave an exception */
15733 return;
15734 }
15735 }
Victor Stinner607b1022020-05-05 18:50:30 +020015736
15737 PyObject *t;
Berker Peksagced8d4c2016-07-25 04:40:39 +030015738 t = PyDict_SetDefault(interned, s, s);
Victor Stinner607b1022020-05-05 18:50:30 +020015739
Berker Peksagced8d4c2016-07-25 04:40:39 +030015740 if (t == NULL) {
15741 PyErr_Clear();
15742 return;
15743 }
Victor Stinner607b1022020-05-05 18:50:30 +020015744
Berker Peksagced8d4c2016-07-25 04:40:39 +030015745 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015746 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015747 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015748 return;
15749 }
Victor Stinner607b1022020-05-05 18:50:30 +020015750
Victor Stinner3549ca32020-07-03 16:59:12 +020015751 /* The two references in interned dict (key and value) are not counted by
15752 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15753 this. */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015754 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015755 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner7f413a52020-09-23 14:05:32 +020015756#else
15757 // PyDict expects that interned strings have their hash
15758 // (PyASCIIObject.hash) already computed.
15759 (void)unicode_hash(s);
Victor Stinner607b1022020-05-05 18:50:30 +020015760#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015761}
15762
15763void
15764PyUnicode_InternImmortal(PyObject **p)
15765{
Victor Stinner583ee5a2020-10-02 14:49:00 +020015766 if (PyErr_WarnEx(PyExc_DeprecationWarning,
15767 "PyUnicode_InternImmortal() is deprecated; "
15768 "use PyUnicode_InternInPlace() instead", 1) < 0)
15769 {
15770 // The function has no return value, the exception cannot
15771 // be reported to the caller, so just log it.
15772 PyErr_WriteUnraisable(NULL);
15773 }
15774
Benjamin Peterson14339b62009-01-31 16:36:08 +000015775 PyUnicode_InternInPlace(p);
15776 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015777 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015778 Py_INCREF(*p);
15779 }
Walter Dörwald16807132007-05-25 13:52:07 +000015780}
15781
15782PyObject *
15783PyUnicode_InternFromString(const char *cp)
15784{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015785 PyObject *s = PyUnicode_FromString(cp);
15786 if (s == NULL)
15787 return NULL;
15788 PyUnicode_InternInPlace(&s);
15789 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015790}
15791
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015792
Victor Stinner666ecfb2020-07-02 01:19:57 +020015793void
15794_PyUnicode_ClearInterned(PyThreadState *tstate)
Walter Dörwald16807132007-05-25 13:52:07 +000015795{
Victor Stinner666ecfb2020-07-02 01:19:57 +020015796 if (!_Py_IsMainInterpreter(tstate)) {
15797 // interned dict is shared by all interpreters
Benjamin Peterson14339b62009-01-31 16:36:08 +000015798 return;
15799 }
Walter Dörwald16807132007-05-25 13:52:07 +000015800
Victor Stinner666ecfb2020-07-02 01:19:57 +020015801 if (interned == NULL) {
15802 return;
15803 }
15804 assert(PyDict_CheckExact(interned));
15805
15806 PyObject *keys = PyDict_Keys(interned);
15807 if (keys == NULL) {
15808 PyErr_Clear();
15809 return;
15810 }
15811 assert(PyList_CheckExact(keys));
15812
15813 /* Interned unicode strings are not forcibly deallocated; rather, we give
15814 them their stolen references back, and then clear and DECREF the
15815 interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015816
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015817 Py_ssize_t n = PyList_GET_SIZE(keys);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015818#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015819 fprintf(stderr, "releasing %zd interned strings\n", n);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015820
15821 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015822#endif
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015823 for (Py_ssize_t i = 0; i < n; i++) {
15824 PyObject *s = PyList_GET_ITEM(keys, i);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015825 assert(PyUnicode_IS_READY(s));
15826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015827 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015828 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015829 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015830#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015831 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015832#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015833 break;
15834 case SSTATE_INTERNED_MORTAL:
Victor Stinner3549ca32020-07-03 16:59:12 +020015835 // Restore the two references (key and value) ignored
15836 // by PyUnicode_InternInPlace().
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015837 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015838#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015839 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015840#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015841 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015842 case SSTATE_NOT_INTERNED:
15843 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015844 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015845 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015846 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015847 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015848 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015849#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015850 fprintf(stderr,
15851 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15852 mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015853#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015854 Py_DECREF(keys);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015855
Benjamin Peterson14339b62009-01-31 16:36:08 +000015856 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015857 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015858}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015859
15860
15861/********************* Unicode Iterator **************************/
15862
15863typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015864 PyObject_HEAD
15865 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015866 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015867} unicodeiterobject;
15868
15869static void
15870unicodeiter_dealloc(unicodeiterobject *it)
15871{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015872 _PyObject_GC_UNTRACK(it);
15873 Py_XDECREF(it->it_seq);
15874 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015875}
15876
15877static int
15878unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15879{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015880 Py_VISIT(it->it_seq);
15881 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015882}
15883
15884static PyObject *
15885unicodeiter_next(unicodeiterobject *it)
15886{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015887 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015888
Benjamin Peterson14339b62009-01-31 16:36:08 +000015889 assert(it != NULL);
15890 seq = it->it_seq;
15891 if (seq == NULL)
15892 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015893 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015895 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15896 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015897 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015898 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15899 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015900 if (item != NULL)
15901 ++it->it_index;
15902 return item;
15903 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015904
Benjamin Peterson14339b62009-01-31 16:36:08 +000015905 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015906 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015907 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015908}
15909
15910static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015911unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015912{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015913 Py_ssize_t len = 0;
15914 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015915 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015916 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015917}
15918
15919PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15920
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015921static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015922unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015923{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015924 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015925 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015926 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015927 it->it_seq, it->it_index);
15928 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015929 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015930 if (u == NULL)
15931 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015932 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015933 }
15934}
15935
15936PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15937
15938static PyObject *
15939unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15940{
15941 Py_ssize_t index = PyLong_AsSsize_t(state);
15942 if (index == -1 && PyErr_Occurred())
15943 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015944 if (it->it_seq != NULL) {
15945 if (index < 0)
15946 index = 0;
15947 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15948 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15949 it->it_index = index;
15950 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015951 Py_RETURN_NONE;
15952}
15953
15954PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15955
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015956static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015957 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015958 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015959 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15960 reduce_doc},
15961 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15962 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015963 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015964};
15965
15966PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015967 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15968 "str_iterator", /* tp_name */
15969 sizeof(unicodeiterobject), /* tp_basicsize */
15970 0, /* tp_itemsize */
15971 /* methods */
15972 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015973 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015974 0, /* tp_getattr */
15975 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015976 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015977 0, /* tp_repr */
15978 0, /* tp_as_number */
15979 0, /* tp_as_sequence */
15980 0, /* tp_as_mapping */
15981 0, /* tp_hash */
15982 0, /* tp_call */
15983 0, /* tp_str */
15984 PyObject_GenericGetAttr, /* tp_getattro */
15985 0, /* tp_setattro */
15986 0, /* tp_as_buffer */
15987 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15988 0, /* tp_doc */
15989 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15990 0, /* tp_clear */
15991 0, /* tp_richcompare */
15992 0, /* tp_weaklistoffset */
15993 PyObject_SelfIter, /* tp_iter */
15994 (iternextfunc)unicodeiter_next, /* tp_iternext */
15995 unicodeiter_methods, /* tp_methods */
15996 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015997};
15998
15999static PyObject *
16000unicode_iter(PyObject *seq)
16001{
Benjamin Peterson14339b62009-01-31 16:36:08 +000016002 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016003
Benjamin Peterson14339b62009-01-31 16:36:08 +000016004 if (!PyUnicode_Check(seq)) {
16005 PyErr_BadInternalCall();
16006 return NULL;
16007 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020016008 if (PyUnicode_READY(seq) == -1)
16009 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016010 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16011 if (it == NULL)
16012 return NULL;
16013 it->it_index = 0;
16014 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020016015 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016016 _PyObject_GC_TRACK(it);
16017 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016018}
16019
Victor Stinner709d23d2019-05-02 14:56:30 -040016020static int
16021encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016022{
Victor Stinner709d23d2019-05-02 14:56:30 -040016023 int res;
16024 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16025 if (res == -2) {
16026 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16027 return -1;
16028 }
16029 if (res < 0) {
16030 PyErr_NoMemory();
16031 return -1;
16032 }
16033 return 0;
16034}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016035
Victor Stinner709d23d2019-05-02 14:56:30 -040016036
16037static int
16038config_get_codec_name(wchar_t **config_encoding)
16039{
16040 char *encoding;
16041 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16042 return -1;
16043 }
16044
16045 PyObject *name_obj = NULL;
16046 PyObject *codec = _PyCodec_Lookup(encoding);
16047 PyMem_RawFree(encoding);
16048
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016049 if (!codec)
16050 goto error;
16051
16052 name_obj = PyObject_GetAttrString(codec, "name");
16053 Py_CLEAR(codec);
16054 if (!name_obj) {
16055 goto error;
16056 }
16057
Victor Stinner709d23d2019-05-02 14:56:30 -040016058 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16059 Py_DECREF(name_obj);
16060 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016061 goto error;
16062 }
16063
Victor Stinner709d23d2019-05-02 14:56:30 -040016064 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16065 if (raw_wname == NULL) {
16066 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016067 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016068 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016069 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016070
16071 PyMem_RawFree(*config_encoding);
16072 *config_encoding = raw_wname;
16073
16074 PyMem_Free(wname);
16075 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016076
16077error:
16078 Py_XDECREF(codec);
16079 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016080 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016081}
16082
16083
Victor Stinner331a6a52019-05-27 16:39:22 +020016084static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016085init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016086{
Victor Stinner709d23d2019-05-02 14:56:30 -040016087 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016088 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016089 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016090 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016091 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016092 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016093 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016094}
16095
16096
Victor Stinner709d23d2019-05-02 14:56:30 -040016097static int
16098init_fs_codec(PyInterpreterState *interp)
16099{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016100 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016101
16102 _Py_error_handler error_handler;
16103 error_handler = get_error_handler_wide(config->filesystem_errors);
16104 if (error_handler == _Py_ERROR_UNKNOWN) {
16105 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16106 return -1;
16107 }
16108
16109 char *encoding, *errors;
16110 if (encode_wstr_utf8(config->filesystem_encoding,
16111 &encoding,
16112 "filesystem_encoding") < 0) {
16113 return -1;
16114 }
16115
16116 if (encode_wstr_utf8(config->filesystem_errors,
16117 &errors,
16118 "filesystem_errors") < 0) {
16119 PyMem_RawFree(encoding);
16120 return -1;
16121 }
16122
Victor Stinner3d17c042020-05-14 01:48:38 +020016123 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16124 PyMem_RawFree(fs_codec->encoding);
16125 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016126 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016127 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16128 PyMem_RawFree(fs_codec->errors);
16129 fs_codec->errors = errors;
16130 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016131
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016132#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016133 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016134#endif
16135
Victor Stinner709d23d2019-05-02 14:56:30 -040016136 /* At this point, PyUnicode_EncodeFSDefault() and
16137 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16138 the C implementation of the filesystem encoding. */
16139
16140 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16141 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016142 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16143 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016144 PyErr_NoMemory();
16145 return -1;
16146 }
16147 return 0;
16148}
16149
16150
Victor Stinner331a6a52019-05-27 16:39:22 +020016151static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016152init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016153{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016154 PyInterpreterState *interp = tstate->interp;
16155
Victor Stinner709d23d2019-05-02 14:56:30 -040016156 /* Update the filesystem encoding to the normalized Python codec name.
16157 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16158 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016159 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016160 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016161 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016162 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016163 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016164 }
16165
Victor Stinner709d23d2019-05-02 14:56:30 -040016166 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016167 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016168 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016169 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016170}
16171
16172
Victor Stinner331a6a52019-05-27 16:39:22 +020016173PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016174_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016175{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016176 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016177 if (_PyStatus_EXCEPTION(status)) {
16178 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016179 }
16180
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016181 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016182}
16183
16184
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016185static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016186_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016187{
Victor Stinner3d17c042020-05-14 01:48:38 +020016188 PyMem_RawFree(fs_codec->encoding);
16189 fs_codec->encoding = NULL;
16190 fs_codec->utf8 = 0;
16191 PyMem_RawFree(fs_codec->errors);
16192 fs_codec->errors = NULL;
16193 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016194}
16195
16196
#ifdef MS_WINDOWS
/* Switch the current interpreter to the legacy Windows filesystem encoding
   (mbcs with the "replace" error handler) and re-initialize the filesystem
   codec.  Returns 0 on success, -1 with an exception set on failure; the
   configuration is left untouched if either string cannot be allocated. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        /* Release whichever of the two copies was allocated. */
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16222
16223
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016224void
Victor Stinner3d483342019-11-22 12:27:50 +010016225_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016226{
Victor Stinner666ecfb2020-07-02 01:19:57 +020016227 // _PyUnicode_ClearInterned() must be called before
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016228
Victor Stinner666ecfb2020-07-02 01:19:57 +020016229 struct _Py_unicode_state *state = &tstate->interp->unicode;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016230
Victor Stinner91698d82020-06-25 14:07:40 +020016231 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016232
Victor Stinner2f9ada92020-06-24 02:22:21 +020016233 for (Py_ssize_t i = 0; i < 256; i++) {
16234 Py_CLEAR(state->latin1[i]);
16235 }
16236
Victor Stinner666ecfb2020-07-02 01:19:57 +020016237 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016238 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016239 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016240
Victor Stinner3d17c042020-05-14 01:48:38 +020016241 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016242}
16243
16244
Georg Brandl66c221e2010-10-14 07:04:07 +000016245/* A _string module, to export formatter_parser and formatter_field_name_split
16246 to the string.Formatter class implemented in Python. */
16247
16248static PyMethodDef _string_methods[] = {
16249 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16250 METH_O, PyDoc_STR("split the argument as a field name")},
16251 {"formatter_parser", (PyCFunction) formatter_parser,
16252 METH_O, PyDoc_STR("parse the argument as a format string")},
16253 {NULL, NULL}
16254};
16255
16256static struct PyModuleDef _string_module = {
16257 PyModuleDef_HEAD_INIT,
Victor Stinnerbb083d32020-09-08 15:33:08 +020016258 .m_name = "_string",
16259 .m_doc = PyDoc_STR("string helper module"),
16260 .m_size = 0,
16261 .m_methods = _string_methods,
Georg Brandl66c221e2010-10-14 07:04:07 +000016262};
16263
16264PyMODINIT_FUNC
16265PyInit__string(void)
16266{
Victor Stinnerbb083d32020-09-08 15:33:08 +020016267 return PyModuleDef_Init(&_string_module);
Georg Brandl66c221e2010-10-14 07:04:07 +000016268}
16269
16270
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016271#ifdef __cplusplus
16272}
16273#endif