blob: 82e09ad05fcd13eee8e1f6a233b2a4c67b27afdc [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner91698d82020-06-25 14:07:40 +020044#include "pycore_bytes_methods.h" // _Py_bytes_lower()
45#include "pycore_initconfig.h" // _PyStatus_OK()
Victor Stinnere5014be2020-04-14 17:52:15 +020046#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinner91698d82020-06-25 14:07:40 +020047#include "pycore_object.h" // _PyObject_GC_TRACK()
48#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
49#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
Victor Stinnere5014be2020-04-14 17:52:15 +020050#include "pycore_pystate.h" // _PyInterpreterState_GET()
Victor Stinner91698d82020-06-25 14:07:40 +020051#include "ucnhash.h" // _PyUnicode_Name_CAPI
52#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000054#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000055#include <windows.h>
56#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000057
Victor Stinner666ecfb2020-07-02 01:19:57 +020058/* Uncomment to display statistics on interned strings at exit
59 in _PyUnicode_ClearInterned(). */
Victor Stinnerfecc4f22019-03-19 14:20:29 +010060/* #define INTERNED_STATS 1 */
61
62
Larry Hastings61272b72014-01-07 12:41:53 -080063/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090064class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080065[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090066/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
67
68/*[python input]
69class Py_UCS4_converter(CConverter):
70 type = 'Py_UCS4'
71 converter = 'convert_uc'
72
73 def converter_init(self):
74 if self.default is not unspecified:
75 self.c_default = ascii(self.default)
76 if len(self.c_default) > 4 or self.c_default[0] != "'":
77 self.c_default = hex(ord(self.default))
78
79[python start generated code]*/
80/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080081
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000082/* --- Globals ------------------------------------------------------------
83
Serhiy Storchaka05997252013-01-26 12:14:02 +020084NOTE: In the interpreter's initialization phase, some globals are currently
85 initialized dynamically as needed. In the process Unicode objects may
86 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Victor Stinner8faf8212011-12-08 22:14:11 +010095/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
96#define MAX_UNICODE 0x10ffff
97
/* Check applied by the accessor macros in this file before touching an
   object: a cheap PyUnicode_Check() in release builds, and
   _PyUnicode_CheckConsistency() with check_content=0 in Py_DEBUG builds. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200103
/* Unchecked access to the utf8 field of the PyCompactUnicodeObject layout. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked access to the UTF-8 buffer.  For a compact ASCII object the
   result is the address right after the PyASCIIObject header; otherwise
   it is the stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op)                    \
          ? ((char*)((PyASCIIObject*)(op) + 1))         \
          : _PyUnicode_UTF8(op)))

/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked access to the UTF-8 length.  A compact ASCII object uses the
   length field of PyASCIIObject; other objects use utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op)                    \
          ? ((PyASCIIObject*)(op))->length              \
          : _PyUnicode_UTF8_LENGTH(op)))

/* Unchecked access to the wstr field of PyASCIIObject. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900122
/* Don't use the deprecated macro of unicodeobject.h: replace it with a
   local definition.

   Yields the length field of PyASCIIObject for a compact ASCII object and
   the wstr_length field of PyCompactUnicodeObject otherwise.

   The parameter is parenthesized inside each cast so that an argument which
   is an expression (for example "objs + 1" or "cond ? a : b") is cast as a
   whole instead of having the cast bind to its first operand only. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Unchecked field accessors.  Each expands to the struct member itself. */
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)      (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)       (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)        (((PyASCIIObject *)(op))->hash)

/* Same fields as above, but guarded by an assertion on the object. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of the PyUnicodeObject layout. */
#define _PyUnicode_DATA_ANY(op)    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200145
/* Local replacement for PyUnicode_READY: evaluates to 0 when the object is
   already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ? _PyUnicode_Ready(op) : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200152
/* True if the utf8 pointer of a non-compact-ASCII object is the same
   address as its character data (PyUnicode_DATA). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the same address as its
   character data (PyUnicode_DATA).

   The macro must only reference its own parameter: it previously read
   _PyUnicode_WSTR(unicode), which silently depended on the caller having a
   variable named "unicode" in scope and compared the wrong object whenever
   the argument was anything else. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
160
/* True if the Unicode object has an allocated UTF-8 memory block, i.e. it is
   not compact ASCII, its utf8 pointer is set, and that pointer is not the
   character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object has an allocated wstr memory block, i.e. its
   wstr pointer is set and either the object is not ready yet or the pointer
   is not the character data itself.  PyUnicode_DATA() is only evaluated
   once the object is known to be ready. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
174
/* Generic helper macro that copies a run of characters while converting
   each one from "from_type" to "to_type".

   from_type, to_type: valid type names.
   begin, end: pointers delimiting the source characters ("from_type *").
   to: pointer ("to_type *") to the buffer receiving the converted
       characters.

   The bulk of the input is handled four characters per iteration; the
   remaining zero to three characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _count = (_end) - (_iter);           \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);     \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); ++_iter, ++_to) {        \
            *_to = (to_type) *_iter;                    \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200198
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200199#ifdef MS_WINDOWS
200 /* On Windows, overallocate by 50% is the best factor */
201# define OVERALLOCATE_FACTOR 2
202#else
203 /* On Linux, overallocate by 25% is the best factor */
204# define OVERALLOCATE_FACTOR 4
205#endif
206
Victor Stinner607b1022020-05-05 18:50:30 +0200207/* bpo-40521: Interned strings are shared by all interpreters. */
208#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
209# define INTERNED_STRINGS
210#endif
211
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Victor Stinner607b1022020-05-05 18:50:30 +0200220#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200221static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200222#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000223
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200224static struct _Py_unicode_state*
225get_unicode_state(void)
226{
227 PyInterpreterState *interp = _PyInterpreterState_GET();
228 return &interp->unicode;
229}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200230
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200232// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200233static inline PyObject* unicode_get_empty(void)
234{
235 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200236 // unicode_get_empty() must not be called before _PyUnicode_Init()
237 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200238 assert(state->empty_string != NULL);
239 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200240}
241
Victor Stinner91698d82020-06-25 14:07:40 +0200242
243// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200244static inline PyObject* unicode_new_empty(void)
245{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200246 PyObject *empty = unicode_get_empty();
247 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200248 return empty;
249}
250
251#define _Py_RETURN_UNICODE_EMPTY() \
252 do { \
253 return unicode_new_empty(); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200254 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000255
Victor Stinner59423e32018-11-26 13:40:01 +0100256static inline void
257unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
258 Py_ssize_t start, Py_ssize_t length)
259{
260 assert(0 <= start);
261 assert(kind != PyUnicode_WCHAR_KIND);
262 switch (kind) {
263 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100264 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100265 Py_UCS1 ch = (unsigned char)value;
266 Py_UCS1 *to = (Py_UCS1 *)data + start;
267 memset(to, ch, length);
268 break;
269 }
270 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100271 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100272 Py_UCS2 ch = (Py_UCS2)value;
273 Py_UCS2 *to = (Py_UCS2 *)data + start;
274 const Py_UCS2 *end = to + length;
275 for (; to < end; ++to) *to = ch;
276 break;
277 }
278 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100279 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100280 Py_UCS4 ch = value;
281 Py_UCS4 * to = (Py_UCS4 *)data + start;
282 const Py_UCS4 *end = to + length;
283 for (; to < end; ++to) *to = ch;
284 break;
285 }
286 default: Py_UNREACHABLE();
287 }
288}
289
290
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200291/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700292static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200293_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900294static inline void
295_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400296static PyObject *
297unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
298 const char *errors);
299static PyObject *
300unicode_decode_utf8(const char *s, Py_ssize_t size,
301 _Py_error_handler error_handler, const char *errors,
302 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200303
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200304/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200305static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200306
/* Fast detection of the most frequent whitespace characters.

   Table indexed by code point 0..127: an entry is 1 for the characters
   listed below and 0 for every other index. */
const unsigned char _Py_ascii_whitespace[] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    [0x7F] = 0      /* highest index: keeps the table at 128 entries */
};
337
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200338/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200339static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200340static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100341static int unicode_modifiable(PyObject *unicode);
342
Victor Stinnerfe226c02011-10-03 03:52:20 +0200343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100345_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200346static PyObject *
347_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
348static PyObject *
349_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
350
351static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000353 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100354 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000355 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
356
Alexander Belopolsky40018472011-02-26 01:02:56 +0000357static void
358raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300359 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100360 PyObject *unicode,
361 Py_ssize_t startpos, Py_ssize_t endpos,
362 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000363
/* Same kind of table for line breaks.

   Indexed by code point 0..127: an entry is 1 for the characters listed
   below and 0 for every other index. */
static const unsigned char ascii_linebreak[] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x7F] = 0      /* highest index: keeps the table at 128 entries */
};
391
INADA Naoki3ae20562017-01-16 20:41:20 +0900392static int convert_uc(PyObject *obj, void *addr);
393
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300394#include "clinic/unicodeobject.c.h"
395
Victor Stinner3d4226a2018-08-29 22:21:32 +0200396_Py_error_handler
397_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200398{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200400 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200401 }
402 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200403 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200404 }
405 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200406 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200407 }
408 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200409 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200410 }
411 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200412 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200413 }
414 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200415 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200416 }
417 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200418 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200419 }
Victor Stinner50149202015-09-22 00:26:54 +0200420 return _Py_ERROR_OTHER;
421}
422
Victor Stinner709d23d2019-05-02 14:56:30 -0400423
424static _Py_error_handler
425get_error_handler_wide(const wchar_t *errors)
426{
427 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428 return _Py_ERROR_STRICT;
429 }
430 if (wcscmp(errors, L"surrogateescape") == 0) {
431 return _Py_ERROR_SURROGATEESCAPE;
432 }
433 if (wcscmp(errors, L"replace") == 0) {
434 return _Py_ERROR_REPLACE;
435 }
436 if (wcscmp(errors, L"ignore") == 0) {
437 return _Py_ERROR_IGNORE;
438 }
439 if (wcscmp(errors, L"backslashreplace") == 0) {
440 return _Py_ERROR_BACKSLASHREPLACE;
441 }
442 if (wcscmp(errors, L"surrogatepass") == 0) {
443 return _Py_ERROR_SURROGATEPASS;
444 }
445 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446 return _Py_ERROR_XMLCHARREFREPLACE;
447 }
448 return _Py_ERROR_OTHER;
449}
450
451
Victor Stinner22eb6892019-06-26 00:51:05 +0200452static inline int
453unicode_check_encoding_errors(const char *encoding, const char *errors)
454{
455 if (encoding == NULL && errors == NULL) {
456 return 0;
457 }
458
Victor Stinner81a7be32020-04-14 15:14:01 +0200459 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200460#ifndef Py_DEBUG
461 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200462 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200463 return 0;
464 }
465#else
466 /* Always check in debug mode */
467#endif
468
469 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200471 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200472 return 0;
473 }
474
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200475 /* Disable checks during Python finalization. For example, it allows to
476 call _PyObject_Dump() during finalization for debugging purpose. */
477 if (interp->finalizing) {
478 return 0;
479 }
480
Victor Stinner22eb6892019-06-26 00:51:05 +0200481 if (encoding != NULL) {
482 PyObject *handler = _PyCodec_Lookup(encoding);
483 if (handler == NULL) {
484 return -1;
485 }
486 Py_DECREF(handler);
487 }
488
489 if (errors != NULL) {
490 PyObject *handler = PyCodec_LookupError(errors);
491 if (handler == NULL) {
492 return -1;
493 }
494 Py_DECREF(handler);
495 }
496 return 0;
497}
498
499
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200500int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100501_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200502{
Victor Stinner68762572019-10-07 18:42:01 +0200503#define CHECK(expr) \
504 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
505
Victor Stinner910337b2011-10-03 03:20:16 +0200506 PyASCIIObject *ascii;
507 unsigned int kind;
508
Victor Stinner68762572019-10-07 18:42:01 +0200509 assert(op != NULL);
510 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200511
512 ascii = (PyASCIIObject *)op;
513 kind = ascii->state.kind;
514
Victor Stinnera3b334d2011-10-03 13:53:37 +0200515 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200516 CHECK(kind == PyUnicode_1BYTE_KIND);
517 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200518 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200519 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200520 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200521 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200522
Victor Stinnera41463c2011-10-04 01:05:08 +0200523 if (ascii->state.compact == 1) {
524 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200526 || kind == PyUnicode_2BYTE_KIND
527 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200528 CHECK(ascii->state.ascii == 0);
529 CHECK(ascii->state.ready == 1);
530 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100531 }
532 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
534
535 data = unicode->data.any;
536 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200537 CHECK(ascii->length == 0);
538 CHECK(ascii->hash == -1);
539 CHECK(ascii->state.compact == 0);
540 CHECK(ascii->state.ascii == 0);
541 CHECK(ascii->state.ready == 0);
542 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
543 CHECK(ascii->wstr != NULL);
544 CHECK(data == NULL);
545 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200546 }
547 else {
Victor Stinner68762572019-10-07 18:42:01 +0200548 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200549 || kind == PyUnicode_2BYTE_KIND
550 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200551 CHECK(ascii->state.compact == 0);
552 CHECK(ascii->state.ready == 1);
553 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200554 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200555 CHECK(compact->utf8 == data);
556 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 }
558 else
Victor Stinner68762572019-10-07 18:42:01 +0200559 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 }
561 }
562 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200563 if (
564#if SIZEOF_WCHAR_T == 2
565 kind == PyUnicode_2BYTE_KIND
566#else
567 kind == PyUnicode_4BYTE_KIND
568#endif
569 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200570 {
Victor Stinner68762572019-10-07 18:42:01 +0200571 CHECK(ascii->wstr == data);
572 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200573 } else
Victor Stinner68762572019-10-07 18:42:01 +0200574 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200575 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200576
577 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200578 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200579 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200580 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200581 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200582
583 /* check that the best kind is used: O(n) operation */
584 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200585 Py_ssize_t i;
586 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300587 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200588 Py_UCS4 ch;
589
590 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 for (i=0; i < ascii->length; i++)
592 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200593 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200594 if (ch > maxchar)
595 maxchar = ch;
596 }
597 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100598 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200599 CHECK(maxchar >= 128);
600 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100601 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200602 else
Victor Stinner68762572019-10-07 18:42:01 +0200603 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200604 }
Victor Stinner77faf692011-11-20 18:56:05 +0100605 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200606 CHECK(maxchar >= 0x100);
607 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100608 }
609 else {
Victor Stinner68762572019-10-07 18:42:01 +0200610 CHECK(maxchar >= 0x10000);
611 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100612 }
Victor Stinner68762572019-10-07 18:42:01 +0200613 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200614 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400615 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200616
617#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400618}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200619
Victor Stinner910337b2011-10-03 03:20:16 +0200620
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100621static PyObject*
622unicode_result_wchar(PyObject *unicode)
623{
624#ifndef Py_DEBUG
625 Py_ssize_t len;
626
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100627 len = _PyUnicode_WSTR_LENGTH(unicode);
628 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100629 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200630 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 }
632
633 if (len == 1) {
634 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100635 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200637 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 }
639 }
640
641 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200642 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100643 return NULL;
644 }
645#else
Victor Stinneraa771272012-10-04 02:32:58 +0200646 assert(Py_REFCNT(unicode) == 1);
647
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100648 /* don't make the result ready in debug mode to ensure that the caller
649 makes the string ready before using it */
650 assert(_PyUnicode_CheckConsistency(unicode, 1));
651#endif
652 return unicode;
653}
654
655static PyObject*
656unicode_result_ready(PyObject *unicode)
657{
658 Py_ssize_t length;
659
660 length = PyUnicode_GET_LENGTH(unicode);
661 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200662 PyObject *empty = unicode_get_empty();
663 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100664 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200665 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100666 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200667 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100668 }
669
670 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200671 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200672 if (kind == PyUnicode_1BYTE_KIND) {
673 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
674 Py_UCS1 ch = data[0];
675 struct _Py_unicode_state *state = get_unicode_state();
676 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100677 if (latin1_char != NULL) {
678 if (unicode != latin1_char) {
679 Py_INCREF(latin1_char);
680 Py_DECREF(unicode);
681 }
682 return latin1_char;
683 }
684 else {
685 assert(_PyUnicode_CheckConsistency(unicode, 1));
686 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200687 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100688 return unicode;
689 }
690 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200691 else {
692 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
693 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100694 }
695
696 assert(_PyUnicode_CheckConsistency(unicode, 1));
697 return unicode;
698}
699
700static PyObject*
701unicode_result(PyObject *unicode)
702{
703 assert(_PyUnicode_CHECK(unicode));
704 if (PyUnicode_IS_READY(unicode))
705 return unicode_result_ready(unicode);
706 else
707 return unicode_result_wchar(unicode);
708}
709
Victor Stinnerc4b49542011-12-11 22:44:26 +0100710static PyObject*
711unicode_result_unchanged(PyObject *unicode)
712{
713 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500714 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100715 return NULL;
716 Py_INCREF(unicode);
717 return unicode;
718 }
719 else
720 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100721 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100722}
723
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200724/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
725 ASCII, Latin1, UTF-8, etc. */
726static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200727backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200728 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
729{
Victor Stinnerad771582015-10-09 12:38:53 +0200730 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200731 Py_UCS4 ch;
732 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300733 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200734
735 assert(PyUnicode_IS_READY(unicode));
736 kind = PyUnicode_KIND(unicode);
737 data = PyUnicode_DATA(unicode);
738
739 size = 0;
740 /* determine replacement size */
741 for (i = collstart; i < collend; ++i) {
742 Py_ssize_t incr;
743
744 ch = PyUnicode_READ(kind, data, i);
745 if (ch < 0x100)
746 incr = 2+2;
747 else if (ch < 0x10000)
748 incr = 2+4;
749 else {
750 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200751 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200752 }
753 if (size > PY_SSIZE_T_MAX - incr) {
754 PyErr_SetString(PyExc_OverflowError,
755 "encoded result is too long for a Python string");
756 return NULL;
757 }
758 size += incr;
759 }
760
Victor Stinnerad771582015-10-09 12:38:53 +0200761 str = _PyBytesWriter_Prepare(writer, str, size);
762 if (str == NULL)
763 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200764
765 /* generate replacement */
766 for (i = collstart; i < collend; ++i) {
767 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200768 *str++ = '\\';
769 if (ch >= 0x00010000) {
770 *str++ = 'U';
771 *str++ = Py_hexdigits[(ch>>28)&0xf];
772 *str++ = Py_hexdigits[(ch>>24)&0xf];
773 *str++ = Py_hexdigits[(ch>>20)&0xf];
774 *str++ = Py_hexdigits[(ch>>16)&0xf];
775 *str++ = Py_hexdigits[(ch>>12)&0xf];
776 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777 }
Victor Stinner797485e2015-10-09 03:17:30 +0200778 else if (ch >= 0x100) {
779 *str++ = 'u';
780 *str++ = Py_hexdigits[(ch>>12)&0xf];
781 *str++ = Py_hexdigits[(ch>>8)&0xf];
782 }
783 else
784 *str++ = 'x';
785 *str++ = Py_hexdigits[(ch>>4)&0xf];
786 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200787 }
788 return str;
789}
790
791/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
792 ASCII, Latin1, UTF-8, etc. */
793static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200794xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200795 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
796{
Victor Stinnerad771582015-10-09 12:38:53 +0200797 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200798 Py_UCS4 ch;
799 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300800 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200801
802 assert(PyUnicode_IS_READY(unicode));
803 kind = PyUnicode_KIND(unicode);
804 data = PyUnicode_DATA(unicode);
805
806 size = 0;
807 /* determine replacement size */
808 for (i = collstart; i < collend; ++i) {
809 Py_ssize_t incr;
810
811 ch = PyUnicode_READ(kind, data, i);
812 if (ch < 10)
813 incr = 2+1+1;
814 else if (ch < 100)
815 incr = 2+2+1;
816 else if (ch < 1000)
817 incr = 2+3+1;
818 else if (ch < 10000)
819 incr = 2+4+1;
820 else if (ch < 100000)
821 incr = 2+5+1;
822 else if (ch < 1000000)
823 incr = 2+6+1;
824 else {
825 assert(ch <= MAX_UNICODE);
826 incr = 2+7+1;
827 }
828 if (size > PY_SSIZE_T_MAX - incr) {
829 PyErr_SetString(PyExc_OverflowError,
830 "encoded result is too long for a Python string");
831 return NULL;
832 }
833 size += incr;
834 }
835
Victor Stinnerad771582015-10-09 12:38:53 +0200836 str = _PyBytesWriter_Prepare(writer, str, size);
837 if (str == NULL)
838 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200839
840 /* generate replacement */
841 for (i = collstart; i < collend; ++i) {
842 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
843 }
844 return str;
845}
846
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least
   significant bits of each unicode character as the bit index: the
   character is masked with BLOOM_WIDTH - 1, i.e. 5, 6 or 7 bits depending
   on the width of a long. */

/* the linebreak mask is set up by _PyUnicode_Init() below */

#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by `ch` is set in `mask`.  Both arguments are
   parenthesized so that expressions such as BLOOM(a | b, ch) are tested as a
   whole instead of being torn apart by operator precedence. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup for ch < 128, otherwise the bloom mask is
   consulted first and Py_UNICODE_ISLINEBREAK() only if the mask matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000874
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700875static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300876make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877{
Victor Stinnera85af502013-04-09 21:53:54 +0200878#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
879 do { \
880 TYPE *data = (TYPE *)PTR; \
881 TYPE *end = data + LEN; \
882 Py_UCS4 ch; \
883 for (; data != end; data++) { \
884 ch = *data; \
885 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
886 } \
887 break; \
888 } while (0)
889
Thomas Wouters477c8d52006-05-27 19:21:47 +0000890 /* calculate simple bloom-style bitmask for a given unicode string */
891
Antoine Pitrouf068f942010-01-13 14:19:12 +0000892 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000893
894 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200895 switch (kind) {
896 case PyUnicode_1BYTE_KIND:
897 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
898 break;
899 case PyUnicode_2BYTE_KIND:
900 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
901 break;
902 case PyUnicode_4BYTE_KIND:
903 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
904 break;
905 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700906 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200907 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000908 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200909
910#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000911}
912
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300913static int
914ensure_unicode(PyObject *obj)
915{
916 if (!PyUnicode_Check(obj)) {
917 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200918 "must be str, not %.100s",
919 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300920 return -1;
921 }
922 return PyUnicode_READY(obj);
923}
924
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200925/* Compilation of templated routines */
926
Victor Stinner90ed8a62020-06-24 00:34:07 +0200927#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200928
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200929#include "stringlib/asciilib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
935#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/undef.h"
937
938#include "stringlib/ucs1lib.h"
939#include "stringlib/fastsearch.h"
940#include "stringlib/partition.h"
941#include "stringlib/split.h"
942#include "stringlib/count.h"
943#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300944#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200945#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/undef.h"
947
948#include "stringlib/ucs2lib.h"
949#include "stringlib/fastsearch.h"
950#include "stringlib/partition.h"
951#include "stringlib/split.h"
952#include "stringlib/count.h"
953#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300954#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200955#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200956#include "stringlib/undef.h"
957
958#include "stringlib/ucs4lib.h"
959#include "stringlib/fastsearch.h"
960#include "stringlib/partition.h"
961#include "stringlib/split.h"
962#include "stringlib/count.h"
963#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300964#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200965#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200966#include "stringlib/undef.h"
967
Inada Naoki2c4928d2020-06-17 20:09:44 +0900968_Py_COMP_DIAG_PUSH
969_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970#include "stringlib/unicodedefs.h"
971#include "stringlib/fastsearch.h"
972#include "stringlib/count.h"
973#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100974#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900975_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200976
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200977#undef STRINGLIB_GET_EMPTY
978
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979/* --- Unicode Object ----------------------------------------------------- */
980
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700981static inline Py_ssize_t
982findchar(const void *s, int kind,
983 Py_ssize_t size, Py_UCS4 ch,
984 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 switch (kind) {
987 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200988 if ((Py_UCS1) ch != ch)
989 return -1;
990 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600991 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600993 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200994 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200995 if ((Py_UCS2) ch != ch)
996 return -1;
997 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600998 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001000 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001001 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001004 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001006 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001007 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009}
1010
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters between `old_length` and the current length are
   overwritten (every byte is set to 0xff); nothing happens if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* The kind value is used as the per-character byte width. */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030static PyObject*
1031resize_compact(PyObject *unicode, Py_ssize_t length)
1032{
1033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035 Py_ssize_t new_size;
1036 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001037 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1040#endif
1041
Victor Stinner79891572012-05-03 13:43:07 +02001042 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001044 assert(PyUnicode_IS_COMPACT(unicode));
1045
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001046 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001047 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001048 struct_size = sizeof(PyASCIIObject);
1049 else
1050 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001051 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1054 PyErr_NoMemory();
1055 return NULL;
1056 }
1057 new_size = (struct_size + (length + 1) * char_size);
1058
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001059 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1060 PyObject_DEL(_PyUnicode_UTF8(unicode));
1061 _PyUnicode_UTF8(unicode) = NULL;
1062 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1063 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001064#ifdef Py_REF_DEBUG
1065 _Py_RefTotal--;
1066#endif
1067#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001068 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001069#endif
Victor Stinner84def372011-12-11 20:04:56 +01001070
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001071 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001072 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001073 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 PyErr_NoMemory();
1075 return NULL;
1076 }
Victor Stinner84def372011-12-11 20:04:56 +01001077 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001079
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001081 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001083 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 _PyUnicode_WSTR_LENGTH(unicode) = length;
1085 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001086 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1087 PyObject_DEL(_PyUnicode_WSTR(unicode));
1088 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001089 if (!PyUnicode_IS_ASCII(unicode))
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001091 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001092#ifdef Py_DEBUG
1093 unicode_fill_invalid(unicode, old_length);
1094#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1096 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098 return unicode;
1099}
1100
Alexander Belopolsky40018472011-02-26 01:02:56 +00001101static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103{
Victor Stinner95663112011-10-04 01:03:50 +02001104 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001108
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109 if (PyUnicode_IS_READY(unicode)) {
1110 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001113#ifdef Py_DEBUG
1114 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1115#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116
1117 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001118 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1120 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121
1122 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1123 PyErr_NoMemory();
1124 return -1;
1125 }
1126 new_size = (length + 1) * char_size;
1127
Victor Stinner7a9105a2011-12-12 00:13:42 +01001128 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1129 {
1130 PyObject_DEL(_PyUnicode_UTF8(unicode));
1131 _PyUnicode_UTF8(unicode) = NULL;
1132 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1133 }
1134
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 data = (PyObject *)PyObject_REALLOC(data, new_size);
1136 if (data == NULL) {
1137 PyErr_NoMemory();
1138 return -1;
1139 }
1140 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001141 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001142 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001143 _PyUnicode_WSTR_LENGTH(unicode) = length;
1144 }
1145 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001146 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_UTF8_LENGTH(unicode) = length;
1148 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 _PyUnicode_LENGTH(unicode) = length;
1150 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001151#ifdef Py_DEBUG
1152 unicode_fill_invalid(unicode, old_length);
1153#endif
Victor Stinner95663112011-10-04 01:03:50 +02001154 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001155 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001158 }
Victor Stinner95663112011-10-04 01:03:50 +02001159 assert(_PyUnicode_WSTR(unicode) != NULL);
1160
1161 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001162 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001166 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001167 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001169 if (!wstr) {
1170 PyErr_NoMemory();
1171 return -1;
1172 }
1173 _PyUnicode_WSTR(unicode) = wstr;
1174 _PyUnicode_WSTR(unicode)[length] = 0;
1175 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001176 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return 0;
1178}
1179
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180static PyObject*
1181resize_copy(PyObject *unicode, Py_ssize_t length)
1182{
1183 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001184 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001187 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188
1189 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1190 if (copy == NULL)
1191 return NULL;
1192
1193 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001194 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001196 }
1197 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001198 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001199
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001200 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 if (w == NULL)
1202 return NULL;
1203 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1204 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001205 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001206 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001207 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 }
1209}
1210
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001212 Ux0000 terminated; some code (e.g. new_identifier)
1213 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214
1215 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218*/
1219
Alexander Belopolsky40018472011-02-26 01:02:56 +00001220static PyUnicodeObject *
1221_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001223 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
Thomas Wouters477c8d52006-05-27 19:21:47 +00001226 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001227 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001228 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 }
1230
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001231 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001232 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001233 return (PyUnicodeObject *)PyErr_NoMemory();
1234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 if (length < 0) {
1236 PyErr_SetString(PyExc_SystemError,
1237 "Negative size passed to _PyUnicode_New");
1238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 }
1240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1242 if (unicode == NULL)
1243 return NULL;
1244 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001245
1246 _PyUnicode_WSTR_LENGTH(unicode) = length;
1247 _PyUnicode_HASH(unicode) = -1;
1248 _PyUnicode_STATE(unicode).interned = 0;
1249 _PyUnicode_STATE(unicode).kind = 0;
1250 _PyUnicode_STATE(unicode).compact = 0;
1251 _PyUnicode_STATE(unicode).ready = 0;
1252 _PyUnicode_STATE(unicode).ascii = 0;
1253 _PyUnicode_DATA_ANY(unicode) = NULL;
1254 _PyUnicode_LENGTH(unicode) = 0;
1255 _PyUnicode_UTF8(unicode) = NULL;
1256 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1259 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001260 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001261 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001262 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264
Jeremy Hyltond8082792003-09-16 19:41:39 +00001265 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001266 * the caller fails before initializing str -- unicode_resize()
1267 * reads str[0], and the Keep-Alive optimization can keep memory
1268 * allocated for str alive across a call to unicode_dealloc(unicode).
1269 * We don't want unicode_resize to read uninitialized memory in
1270 * that case.
1271 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 _PyUnicode_WSTR(unicode)[0] = 0;
1273 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001274
Victor Stinner7931d9a2011-11-04 00:22:48 +01001275 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 return unicode;
1277}
1278
Victor Stinnerf42dc442011-10-02 23:33:16 +02001279static const char*
1280unicode_kind_name(PyObject *unicode)
1281{
Victor Stinner42dfd712011-10-03 14:41:45 +02001282 /* don't check consistency: unicode_kind_name() is called from
1283 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 if (!PyUnicode_IS_COMPACT(unicode))
1285 {
1286 if (!PyUnicode_IS_READY(unicode))
1287 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001288 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 {
1290 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001291 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001292 return "legacy ascii";
1293 else
1294 return "legacy latin1";
1295 case PyUnicode_2BYTE_KIND:
1296 return "legacy UCS2";
1297 case PyUnicode_4BYTE_KIND:
1298 return "legacy UCS4";
1299 default:
1300 return "<legacy invalid kind>";
1301 }
1302 }
1303 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001304 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001305 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001306 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001307 return "ascii";
1308 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001309 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001310 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001311 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001312 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001313 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001314 default:
1315 return "<invalid compact kind>";
1316 }
1317}
1318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001321const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001322 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001323 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324}
1325
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001326const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001327 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 return _PyUnicode_COMPACT_DATA(unicode);
1329}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001330const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001331 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001332 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1334 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1335 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1336 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1337 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1338 return PyUnicode_DATA(unicode);
1339}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001340
1341void
1342_PyUnicode_Dump(PyObject *op)
1343{
1344 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001345 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1346 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001347 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001348
Victor Stinnera849a4b2011-10-03 12:12:11 +02001349 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001350 {
1351 if (ascii->state.ascii)
1352 data = (ascii + 1);
1353 else
1354 data = (compact + 1);
1355 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001356 else
1357 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001358 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001359
Victor Stinnera849a4b2011-10-03 12:12:11 +02001360 if (ascii->wstr == data)
1361 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001362 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001363
Victor Stinnera3b334d2011-10-03 13:53:37 +02001364 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001365 printf(" (%zu), ", compact->wstr_length);
1366 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001367 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001368 }
1369 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001370 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001371 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001372}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373#endif
1374
Victor Stinner91698d82020-06-25 14:07:40 +02001375static int
1376unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1377{
1378 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1379 // optimized to always use state->empty_string without having to check if
1380 // it is NULL or not.
1381 PyObject *empty = PyUnicode_New(1, 0);
1382 if (empty == NULL) {
1383 return -1;
1384 }
1385 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1386 _PyUnicode_LENGTH(empty) = 0;
1387 assert(_PyUnicode_CheckConsistency(empty, 1));
1388
1389 assert(state->empty_string == NULL);
1390 state->empty_string = empty;
1391 return 0;
1392}
1393
1394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395PyObject *
1396PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1397{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001398 /* Optimization for empty strings */
1399 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001400 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001401 }
1402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 PyObject *obj;
1404 PyCompactUnicodeObject *unicode;
1405 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001406 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001407 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 Py_ssize_t char_size;
1409 Py_ssize_t struct_size;
1410
Victor Stinner9e9d6892011-10-04 01:02:02 +02001411 is_ascii = 0;
1412 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 struct_size = sizeof(PyCompactUnicodeObject);
1414 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001415 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 char_size = 1;
1417 is_ascii = 1;
1418 struct_size = sizeof(PyASCIIObject);
1419 }
1420 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001421 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 char_size = 1;
1423 }
1424 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001425 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 char_size = 2;
1427 if (sizeof(wchar_t) == 2)
1428 is_sharing = 1;
1429 }
1430 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001431 if (maxchar > MAX_UNICODE) {
1432 PyErr_SetString(PyExc_SystemError,
1433 "invalid maximum character passed to PyUnicode_New");
1434 return NULL;
1435 }
Victor Stinner8f825062012-04-27 13:55:39 +02001436 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 char_size = 4;
1438 if (sizeof(wchar_t) == 4)
1439 is_sharing = 1;
1440 }
1441
1442 /* Ensure we won't overflow the size. */
1443 if (size < 0) {
1444 PyErr_SetString(PyExc_SystemError,
1445 "Negative size passed to PyUnicode_New");
1446 return NULL;
1447 }
1448 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1449 return PyErr_NoMemory();
1450
1451 /* Duplicated allocation code from _PyObject_New() instead of a call to
1452 * PyObject_New() so we are able to allocate space for the object and
1453 * it's data buffer.
1454 */
1455 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001456 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001458 }
1459 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460
1461 unicode = (PyCompactUnicodeObject *)obj;
1462 if (is_ascii)
1463 data = ((PyASCIIObject*)obj) + 1;
1464 else
1465 data = unicode + 1;
1466 _PyUnicode_LENGTH(unicode) = size;
1467 _PyUnicode_HASH(unicode) = -1;
1468 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001469 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 _PyUnicode_STATE(unicode).compact = 1;
1471 _PyUnicode_STATE(unicode).ready = 1;
1472 _PyUnicode_STATE(unicode).ascii = is_ascii;
1473 if (is_ascii) {
1474 ((char*)data)[size] = 0;
1475 _PyUnicode_WSTR(unicode) = NULL;
1476 }
Victor Stinner8f825062012-04-27 13:55:39 +02001477 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 ((char*)data)[size] = 0;
1479 _PyUnicode_WSTR(unicode) = NULL;
1480 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001482 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 else {
1485 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001486 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001487 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001489 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 ((Py_UCS4*)data)[size] = 0;
1491 if (is_sharing) {
1492 _PyUnicode_WSTR_LENGTH(unicode) = size;
1493 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1494 }
1495 else {
1496 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1497 _PyUnicode_WSTR(unicode) = NULL;
1498 }
1499 }
Victor Stinner8f825062012-04-27 13:55:39 +02001500#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001501 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001502#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001503 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 return obj;
1505}
1506
1507#if SIZEOF_WCHAR_T == 2
1508/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1509 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001510 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511
1512 This function assumes that unicode can hold one more code point than wstr
1513 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001514static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001516 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517{
1518 const wchar_t *iter;
1519 Py_UCS4 *ucs4_out;
1520
Victor Stinner910337b2011-10-03 03:20:16 +02001521 assert(unicode != NULL);
1522 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1524 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1525
1526 for (iter = begin; iter < end; ) {
1527 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1528 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001529 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1530 && (iter+1) < end
1531 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532 {
Victor Stinner551ac952011-11-29 22:58:13 +01001533 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 iter += 2;
1535 }
1536 else {
1537 *ucs4_out++ = *iter;
1538 iter++;
1539 }
1540 }
1541 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1542 _PyUnicode_GET_LENGTH(unicode)));
1543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544}
1545#endif
1546
Victor Stinnercd9950f2011-10-02 00:34:53 +02001547static int
Victor Stinner488fa492011-12-12 00:01:39 +01001548unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001549{
Victor Stinner488fa492011-12-12 00:01:39 +01001550 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001551 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001552 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001553 return -1;
1554 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001555 return 0;
1556}
1557
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558static int
1559_copy_characters(PyObject *to, Py_ssize_t to_start,
1560 PyObject *from, Py_ssize_t from_start,
1561 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001563 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001564 const void *from_data;
1565 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566
Victor Stinneree4544c2012-05-09 22:24:08 +02001567 assert(0 <= how_many);
1568 assert(0 <= from_start);
1569 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001571 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001572 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573
Victor Stinnerd3f08822012-05-29 12:57:52 +02001574 assert(PyUnicode_Check(to));
1575 assert(PyUnicode_IS_READY(to));
1576 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1577
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001578 if (how_many == 0)
1579 return 0;
1580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001582 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001584 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585
Victor Stinnerf1852262012-06-16 16:38:26 +02001586#ifdef Py_DEBUG
1587 if (!check_maxchar
1588 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1589 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001590 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001591 Py_UCS4 ch;
1592 Py_ssize_t i;
1593 for (i=0; i < how_many; i++) {
1594 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1595 assert(ch <= to_maxchar);
1596 }
1597 }
1598#endif
1599
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001600 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001601 if (check_maxchar
1602 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1603 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001604 /* Writing Latin-1 characters into an ASCII string requires to
1605 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001606 Py_UCS4 max_char;
1607 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001608 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001609 if (max_char >= 128)
1610 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001611 }
Christian Heimesf051e432016-09-13 20:22:02 +02001612 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001613 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001614 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001615 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001616 else if (from_kind == PyUnicode_1BYTE_KIND
1617 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001618 {
1619 _PyUnicode_CONVERT_BYTES(
1620 Py_UCS1, Py_UCS2,
1621 PyUnicode_1BYTE_DATA(from) + from_start,
1622 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1623 PyUnicode_2BYTE_DATA(to) + to_start
1624 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001625 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001626 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001627 && to_kind == PyUnicode_4BYTE_KIND)
1628 {
1629 _PyUnicode_CONVERT_BYTES(
1630 Py_UCS1, Py_UCS4,
1631 PyUnicode_1BYTE_DATA(from) + from_start,
1632 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1633 PyUnicode_4BYTE_DATA(to) + to_start
1634 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001635 }
1636 else if (from_kind == PyUnicode_2BYTE_KIND
1637 && to_kind == PyUnicode_4BYTE_KIND)
1638 {
1639 _PyUnicode_CONVERT_BYTES(
1640 Py_UCS2, Py_UCS4,
1641 PyUnicode_2BYTE_DATA(from) + from_start,
1642 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1643 PyUnicode_4BYTE_DATA(to) + to_start
1644 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001645 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001646 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001647 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1648
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001649 if (!check_maxchar) {
1650 if (from_kind == PyUnicode_2BYTE_KIND
1651 && to_kind == PyUnicode_1BYTE_KIND)
1652 {
1653 _PyUnicode_CONVERT_BYTES(
1654 Py_UCS2, Py_UCS1,
1655 PyUnicode_2BYTE_DATA(from) + from_start,
1656 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1657 PyUnicode_1BYTE_DATA(to) + to_start
1658 );
1659 }
1660 else if (from_kind == PyUnicode_4BYTE_KIND
1661 && to_kind == PyUnicode_1BYTE_KIND)
1662 {
1663 _PyUnicode_CONVERT_BYTES(
1664 Py_UCS4, Py_UCS1,
1665 PyUnicode_4BYTE_DATA(from) + from_start,
1666 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1667 PyUnicode_1BYTE_DATA(to) + to_start
1668 );
1669 }
1670 else if (from_kind == PyUnicode_4BYTE_KIND
1671 && to_kind == PyUnicode_2BYTE_KIND)
1672 {
1673 _PyUnicode_CONVERT_BYTES(
1674 Py_UCS4, Py_UCS2,
1675 PyUnicode_4BYTE_DATA(from) + from_start,
1676 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1677 PyUnicode_2BYTE_DATA(to) + to_start
1678 );
1679 }
1680 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001681 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001682 }
1683 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001684 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001685 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001686 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001687 Py_ssize_t i;
1688
Victor Stinnera0702ab2011-09-29 14:14:38 +02001689 for (i=0; i < how_many; i++) {
1690 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001691 if (ch > to_maxchar)
1692 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001693 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1694 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001695 }
1696 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 return 0;
1698}
1699
Victor Stinnerd3f08822012-05-29 12:57:52 +02001700void
1701_PyUnicode_FastCopyCharacters(
1702 PyObject *to, Py_ssize_t to_start,
1703 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001704{
1705 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1706}
1707
1708Py_ssize_t
1709PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1710 PyObject *from, Py_ssize_t from_start,
1711 Py_ssize_t how_many)
1712{
1713 int err;
1714
1715 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1716 PyErr_BadInternalCall();
1717 return -1;
1718 }
1719
Benjamin Petersonbac79492012-01-14 13:34:47 -05001720 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001721 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001722 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001723 return -1;
1724
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001725 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001726 PyErr_SetString(PyExc_IndexError, "string index out of range");
1727 return -1;
1728 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001729 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001730 PyErr_SetString(PyExc_IndexError, "string index out of range");
1731 return -1;
1732 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001733 if (how_many < 0) {
1734 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1735 return -1;
1736 }
1737 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001738 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1739 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001740 "Cannot write %zi characters at %zi "
1741 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001742 how_many, to_start, PyUnicode_GET_LENGTH(to));
1743 return -1;
1744 }
1745
1746 if (how_many == 0)
1747 return 0;
1748
Victor Stinner488fa492011-12-12 00:01:39 +01001749 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001750 return -1;
1751
1752 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1753 if (err) {
1754 PyErr_Format(PyExc_SystemError,
1755 "Cannot copy %s characters "
1756 "into a string of %s characters",
1757 unicode_kind_name(from),
1758 unicode_kind_name(to));
1759 return -1;
1760 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001761 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762}
1763
Victor Stinner17222162011-09-28 22:15:37 +02001764/* Find the maximum code point and count the number of surrogate pairs so a
1765 correct string length can be computed before converting a string to UCS4.
1766 This function counts single surrogates as a character and not as a pair.
1767
1768 Return 0 on success, or -1 on error. */
1769static int
1770find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1771 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772{
1773 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001774 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775
Victor Stinnerc53be962011-10-02 21:33:54 +02001776 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 *num_surrogates = 0;
1778 *maxchar = 0;
1779
1780 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001782 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1783 && (iter+1) < end
1784 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1785 {
1786 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1787 ++(*num_surrogates);
1788 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 }
1790 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001792 {
1793 ch = *iter;
1794 iter++;
1795 }
1796 if (ch > *maxchar) {
1797 *maxchar = ch;
1798 if (*maxchar > MAX_UNICODE) {
1799 PyErr_Format(PyExc_ValueError,
1800 "character U+%x is not in range [U+0000; U+10ffff]",
1801 ch);
1802 return -1;
1803 }
1804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 }
1806 return 0;
1807}
1808
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001809int
1810_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811{
1812 wchar_t *end;
1813 Py_UCS4 maxchar = 0;
1814 Py_ssize_t num_surrogates;
1815#if SIZEOF_WCHAR_T == 2
1816 Py_ssize_t length_wo_surrogates;
1817#endif
1818
Georg Brandl7597add2011-10-05 16:36:47 +02001819 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 strings were created using _PyObject_New() and where no canonical
1821 representation (the str field) has been set yet aka strings
1822 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001823 assert(_PyUnicode_CHECK(unicode));
1824 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001826 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001827 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001828 /* Actually, it should neither be interned nor be anything else: */
1829 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001832 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001833 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835
1836 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1838 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 PyErr_NoMemory();
1840 return -1;
1841 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001842 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843 _PyUnicode_WSTR(unicode), end,
1844 PyUnicode_1BYTE_DATA(unicode));
1845 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1846 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1847 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1848 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001849 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001850 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001851 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001852 }
1853 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001854 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001855 _PyUnicode_UTF8(unicode) = NULL;
1856 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 }
1858 PyObject_FREE(_PyUnicode_WSTR(unicode));
1859 _PyUnicode_WSTR(unicode) = NULL;
1860 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1861 }
1862 /* In this case we might have to convert down from 4-byte native
1863 wchar_t to 2-byte unicode. */
1864 else if (maxchar < 65536) {
1865 assert(num_surrogates == 0 &&
1866 "FindMaxCharAndNumSurrogatePairs() messed up");
1867
Victor Stinner506f5922011-09-28 22:34:18 +02001868#if SIZEOF_WCHAR_T == 2
1869 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001870 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001871 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1872 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1873 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001874 _PyUnicode_UTF8(unicode) = NULL;
1875 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001876#else
1877 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001878 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001879 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001880 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001881 PyErr_NoMemory();
1882 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 }
Victor Stinner506f5922011-09-28 22:34:18 +02001884 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1885 _PyUnicode_WSTR(unicode), end,
1886 PyUnicode_2BYTE_DATA(unicode));
1887 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1888 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1889 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001890 _PyUnicode_UTF8(unicode) = NULL;
1891 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001892 PyObject_FREE(_PyUnicode_WSTR(unicode));
1893 _PyUnicode_WSTR(unicode) = NULL;
1894 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1895#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 }
1897 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1898 else {
1899#if SIZEOF_WCHAR_T == 2
1900 /* in case the native representation is 2-bytes, we need to allocate a
1901 new normalized 4-byte version. */
1902 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001903 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1904 PyErr_NoMemory();
1905 return -1;
1906 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001907 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1908 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 PyErr_NoMemory();
1910 return -1;
1911 }
1912 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1913 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001914 _PyUnicode_UTF8(unicode) = NULL;
1915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001916 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1917 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001918 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 PyObject_FREE(_PyUnicode_WSTR(unicode));
1920 _PyUnicode_WSTR(unicode) = NULL;
1921 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1922#else
1923 assert(num_surrogates == 0);
1924
Victor Stinnerc3c74152011-10-02 20:39:55 +02001925 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001927 _PyUnicode_UTF8(unicode) = NULL;
1928 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1930#endif
1931 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1932 }
1933 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001934 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 return 0;
1936}
1937
Alexander Belopolsky40018472011-02-26 01:02:56 +00001938static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001939unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940{
Walter Dörwald16807132007-05-25 13:52:07 +00001941 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 case SSTATE_NOT_INTERNED:
1943 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001944
Benjamin Peterson29060642009-01-31 22:14:21 +00001945 case SSTATE_INTERNED_MORTAL:
Victor Stinner607b1022020-05-05 18:50:30 +02001946#ifdef INTERNED_STRINGS
Victor Stinner3549ca32020-07-03 16:59:12 +02001947 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1948 references (key and value) which were ignored by
1949 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1950 to prevent calling unicode_dealloc() again. Adjust refcnt after
1951 PyDict_DelItem(). */
1952 assert(Py_REFCNT(unicode) == 0);
1953 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001954 if (PyDict_DelItem(interned, unicode) != 0) {
1955 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1956 NULL);
1957 }
Victor Stinner3549ca32020-07-03 16:59:12 +02001958 assert(Py_REFCNT(unicode) == 1);
1959 Py_SET_REFCNT(unicode, 0);
Victor Stinner607b1022020-05-05 18:50:30 +02001960#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001961 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001962
Benjamin Peterson29060642009-01-31 22:14:21 +00001963 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1965 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001966
Benjamin Peterson29060642009-01-31 22:14:21 +00001967 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001968 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001969 }
1970
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001971 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001973 }
1974 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001975 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001976 }
1977 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001978 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001979 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001981 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982}
1983
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects kept
   in the unicode state (the cached empty string, or the cached object for a
   single character below 256), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;

    if (unicode == state->empty_string) {
        return 1;
    }

    /* Only one-character strings whose kind is not PyUnicode_WCHAR_KIND are
       looked up in the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && state->latin1[ch] == unicode) ? 1 : 0;
}
#endif
2003
Alexander Belopolsky40018472011-02-26 01:02:56 +00002004static int
Victor Stinner488fa492011-12-12 00:01:39 +01002005unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002006{
Victor Stinner488fa492011-12-12 00:01:39 +01002007 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002008 if (Py_REFCNT(unicode) != 1)
2009 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002010 if (_PyUnicode_HASH(unicode) != -1)
2011 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002012 if (PyUnicode_CHECK_INTERNED(unicode))
2013 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002014 if (!PyUnicode_CheckExact(unicode))
2015 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002016#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002017 /* singleton refcount is greater than 1 */
2018 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002019#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002020 return 1;
2021}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002022
Victor Stinnerfe226c02011-10-03 03:52:20 +02002023static int
2024unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2025{
2026 PyObject *unicode;
2027 Py_ssize_t old_length;
2028
2029 assert(p_unicode != NULL);
2030 unicode = *p_unicode;
2031
2032 assert(unicode != NULL);
2033 assert(PyUnicode_Check(unicode));
2034 assert(0 <= length);
2035
Victor Stinner910337b2011-10-03 03:20:16 +02002036 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002037 old_length = PyUnicode_WSTR_LENGTH(unicode);
2038 else
2039 old_length = PyUnicode_GET_LENGTH(unicode);
2040 if (old_length == length)
2041 return 0;
2042
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002043 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002044 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002045 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002046 return 0;
2047 }
2048
Victor Stinner488fa492011-12-12 00:01:39 +01002049 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002050 PyObject *copy = resize_copy(unicode, length);
2051 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002052 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002053 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002054 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002055 }
2056
Victor Stinnerfe226c02011-10-03 03:52:20 +02002057 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002058 PyObject *new_unicode = resize_compact(unicode, length);
2059 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002060 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002061 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002062 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002063 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002064 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002065}
2066
Alexander Belopolsky40018472011-02-26 01:02:56 +00002067int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002068PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002069{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002070 PyObject *unicode;
2071 if (p_unicode == NULL) {
2072 PyErr_BadInternalCall();
2073 return -1;
2074 }
2075 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002076 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002077 {
2078 PyErr_BadInternalCall();
2079 return -1;
2080 }
2081 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002082}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002083
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002084/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002085
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002086 WARNING: The function doesn't copy the terminating null character and
2087 doesn't check the maximum character (may write a latin1 character in an
2088 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002089static void
2090unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2091 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002092{
2093 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002094 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002095 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002096
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002097 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002098 switch (kind) {
2099 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002100#ifdef Py_DEBUG
2101 if (PyUnicode_IS_ASCII(unicode)) {
2102 Py_UCS4 maxchar = ucs1lib_find_max_char(
2103 (const Py_UCS1*)str,
2104 (const Py_UCS1*)str + len);
2105 assert(maxchar < 128);
2106 }
2107#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002108 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002109 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002110 }
2111 case PyUnicode_2BYTE_KIND: {
2112 Py_UCS2 *start = (Py_UCS2 *)data + index;
2113 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002114
Victor Stinner184252a2012-06-16 02:57:41 +02002115 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002116 *ucs2 = (Py_UCS2)*str;
2117
2118 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002119 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002120 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002121 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002122 Py_UCS4 *start = (Py_UCS4 *)data + index;
2123 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002124
Victor Stinner184252a2012-06-16 02:57:41 +02002125 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002126 *ucs4 = (Py_UCS4)*str;
2127
2128 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002129 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002130 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002131 default:
2132 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002133 }
2134}
2135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002137get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002139 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002140
Victor Stinner2f9ada92020-06-24 02:22:21 +02002141 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002142 if (unicode) {
2143 Py_INCREF(unicode);
2144 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 }
Victor Stinner607b1022020-05-05 18:50:30 +02002146
2147 unicode = PyUnicode_New(1, ch);
2148 if (!unicode) {
2149 return NULL;
2150 }
2151
2152 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2153 assert(_PyUnicode_CheckConsistency(unicode, 1));
2154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002156 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002157 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158}
2159
Victor Stinner985a82a2014-01-03 12:53:47 +01002160static PyObject*
2161unicode_char(Py_UCS4 ch)
2162{
2163 PyObject *unicode;
2164
2165 assert(ch <= MAX_UNICODE);
2166
Victor Stinner2f9ada92020-06-24 02:22:21 +02002167 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002168 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002169 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002170
Victor Stinner985a82a2014-01-03 12:53:47 +01002171 unicode = PyUnicode_New(1, ch);
2172 if (unicode == NULL)
2173 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002174
2175 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2176 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002177 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002178 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002179 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2180 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2181 }
2182 assert(_PyUnicode_CheckConsistency(unicode, 1));
2183 return unicode;
2184}
2185
Alexander Belopolsky40018472011-02-26 01:02:56 +00002186PyObject *
2187PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002189 if (u == NULL) {
2190 if (size > 0) {
2191 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2192 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2193 "use PyUnicode_New() instead", 1) < 0) {
2194 return NULL;
2195 }
2196 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002197 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002198 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002199
2200 if (size < 0) {
2201 PyErr_BadInternalCall();
2202 return NULL;
2203 }
2204
2205 return PyUnicode_FromWideChar(u, size);
2206}
2207
2208PyObject *
2209PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2210{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002211 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 Py_UCS4 maxchar = 0;
2213 Py_ssize_t num_surrogates;
2214
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002215 if (u == NULL && size != 0) {
2216 PyErr_BadInternalCall();
2217 return NULL;
2218 }
2219
2220 if (size == -1) {
2221 size = wcslen(u);
2222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002224 /* If the Unicode data is known at construction time, we can apply
2225 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002228 if (size == 0)
2229 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 /* Single character Unicode objects in the Latin-1 range are
2232 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002233 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 return get_latin1_char((unsigned char)*u);
2235
2236 /* If not empty and not single character, copy the Unicode data
2237 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002238 if (find_maxchar_surrogates(u, u + size,
2239 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 return NULL;
2241
Victor Stinner8faf8212011-12-08 22:14:11 +01002242 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243 if (!unicode)
2244 return NULL;
2245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 switch (PyUnicode_KIND(unicode)) {
2247 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002248 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2250 break;
2251 case PyUnicode_2BYTE_KIND:
2252#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002253 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002255 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2257#endif
2258 break;
2259 case PyUnicode_4BYTE_KIND:
2260#if SIZEOF_WCHAR_T == 2
2261 /* This is the only case which has to process surrogates, thus
2262 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002263 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264#else
2265 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002266 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267#endif
2268 break;
2269 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002270 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002272
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002273 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002274}
2275
Alexander Belopolsky40018472011-02-26 01:02:56 +00002276PyObject *
2277PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002278{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002279 if (size < 0) {
2280 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002281 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 return NULL;
2283 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002284 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002285 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002286 }
2287 else {
2288 if (size > 0) {
2289 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2290 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2291 "use PyUnicode_New() instead", 1) < 0) {
2292 return NULL;
2293 }
2294 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002295 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002296 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002297}
2298
Alexander Belopolsky40018472011-02-26 01:02:56 +00002299PyObject *
2300PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002301{
2302 size_t size = strlen(u);
2303 if (size > PY_SSIZE_T_MAX) {
2304 PyErr_SetString(PyExc_OverflowError, "input too long");
2305 return NULL;
2306 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002307 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002308}
2309
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002310PyObject *
2311_PyUnicode_FromId(_Py_Identifier *id)
2312{
Victor Stinner297257f2020-06-02 14:39:45 +02002313 if (id->object) {
2314 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002315 }
Victor Stinner297257f2020-06-02 14:39:45 +02002316
2317 PyObject *obj;
2318 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2319 strlen(id->string),
2320 NULL, NULL);
2321 if (!obj) {
2322 return NULL;
2323 }
2324 PyUnicode_InternInPlace(&obj);
2325
2326 assert(!id->next);
2327 id->object = obj;
2328 id->next = static_strings;
2329 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002330 return id->object;
2331}
2332
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002333static void
2334unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002335{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002336 _Py_Identifier *tmp, *s = static_strings;
2337 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002338 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002339 tmp = s->next;
2340 s->next = NULL;
2341 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002342 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002343 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002344}
2345
Benjamin Peterson0df54292012-03-26 14:50:32 -04002346/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002347
Victor Stinnerd3f08822012-05-29 12:57:52 +02002348PyObject*
2349_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002350{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002351 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002352 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002353 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002354#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002355 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002356#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002357 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002358 }
Victor Stinner785938e2011-12-11 20:09:03 +01002359 unicode = PyUnicode_New(size, 127);
2360 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002361 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002362 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2363 assert(_PyUnicode_CheckConsistency(unicode, 1));
2364 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002365}
2366
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002367static Py_UCS4
2368kind_maxchar_limit(unsigned int kind)
2369{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002370 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002371 case PyUnicode_1BYTE_KIND:
2372 return 0x80;
2373 case PyUnicode_2BYTE_KIND:
2374 return 0x100;
2375 case PyUnicode_4BYTE_KIND:
2376 return 0x10000;
2377 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002378 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002379 }
2380}
2381
Victor Stinner702c7342011-10-05 13:50:52 +02002382static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002383_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002386 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002387
Victor Stinner2f9ada92020-06-24 02:22:21 +02002388 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002389 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002390 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002391 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002392 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002393 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002394 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002395
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002396 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002397 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 if (!res)
2399 return NULL;
2400 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002401 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002402 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002403}
2404
Victor Stinnere57b1c02011-09-28 22:20:48 +02002405static PyObject*
2406_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407{
2408 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002409 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002410
Serhiy Storchaka678db842013-01-26 12:16:36 +02002411 if (size == 0)
2412 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002413 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002414 if (size == 1)
2415 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002416
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002417 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002418 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002419 if (!res)
2420 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002421 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002422 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002423 else {
2424 _PyUnicode_CONVERT_BYTES(
2425 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2426 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002427 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 return res;
2429}
2430
Victor Stinnere57b1c02011-09-28 22:20:48 +02002431static PyObject*
2432_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433{
2434 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002435 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002436
Serhiy Storchaka678db842013-01-26 12:16:36 +02002437 if (size == 0)
2438 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002439 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002440 if (size == 1)
2441 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002442
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002443 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002444 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002445 if (!res)
2446 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002447 if (max_char < 256)
2448 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2449 PyUnicode_1BYTE_DATA(res));
2450 else if (max_char < 0x10000)
2451 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2452 PyUnicode_2BYTE_DATA(res));
2453 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002455 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 return res;
2457}
2458
2459PyObject*
2460PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2461{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002462 if (size < 0) {
2463 PyErr_SetString(PyExc_ValueError, "size must be positive");
2464 return NULL;
2465 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002466 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002468 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002470 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002472 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002473 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002474 PyErr_SetString(PyExc_SystemError, "invalid kind");
2475 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477}
2478
Victor Stinnerece58de2012-04-23 23:36:38 +02002479Py_UCS4
2480_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2481{
2482 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002483 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002484
2485 assert(PyUnicode_IS_READY(unicode));
2486 assert(0 <= start);
2487 assert(end <= PyUnicode_GET_LENGTH(unicode));
2488 assert(start <= end);
2489
2490 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2491 return PyUnicode_MAX_CHAR_VALUE(unicode);
2492
2493 if (start == end)
2494 return 127;
2495
Victor Stinner94d558b2012-04-27 22:26:58 +02002496 if (PyUnicode_IS_ASCII(unicode))
2497 return 127;
2498
Victor Stinnerece58de2012-04-23 23:36:38 +02002499 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002500 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002501 endptr = (char *)startptr + end * kind;
2502 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002503 switch(kind) {
2504 case PyUnicode_1BYTE_KIND:
2505 return ucs1lib_find_max_char(startptr, endptr);
2506 case PyUnicode_2BYTE_KIND:
2507 return ucs2lib_find_max_char(startptr, endptr);
2508 case PyUnicode_4BYTE_KIND:
2509 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002510 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002511 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002512 }
2513}
2514
Victor Stinner25a4b292011-10-06 12:31:55 +02002515/* Ensure that a string uses the most efficient storage, if it is not the
2516 case: create a new string with of the right kind. Write NULL into *p_unicode
2517 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002518static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002519unicode_adjust_maxchar(PyObject **p_unicode)
2520{
2521 PyObject *unicode, *copy;
2522 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002523 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002524 unsigned int kind;
2525
2526 assert(p_unicode != NULL);
2527 unicode = *p_unicode;
2528 assert(PyUnicode_IS_READY(unicode));
2529 if (PyUnicode_IS_ASCII(unicode))
2530 return;
2531
2532 len = PyUnicode_GET_LENGTH(unicode);
2533 kind = PyUnicode_KIND(unicode);
2534 if (kind == PyUnicode_1BYTE_KIND) {
2535 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002536 max_char = ucs1lib_find_max_char(u, u + len);
2537 if (max_char >= 128)
2538 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002539 }
2540 else if (kind == PyUnicode_2BYTE_KIND) {
2541 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002542 max_char = ucs2lib_find_max_char(u, u + len);
2543 if (max_char >= 256)
2544 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002545 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002546 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002547 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002548 max_char = ucs4lib_find_max_char(u, u + len);
2549 if (max_char >= 0x10000)
2550 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002551 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002552 else
2553 Py_UNREACHABLE();
2554
Victor Stinner25a4b292011-10-06 12:31:55 +02002555 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002556 if (copy != NULL)
2557 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002558 Py_DECREF(unicode);
2559 *p_unicode = copy;
2560}
2561
Victor Stinner034f6cf2011-09-30 02:26:44 +02002562PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002563_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002564{
Victor Stinner87af4f22011-11-21 23:03:47 +01002565 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002566 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002567
Victor Stinner034f6cf2011-09-30 02:26:44 +02002568 if (!PyUnicode_Check(unicode)) {
2569 PyErr_BadInternalCall();
2570 return NULL;
2571 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002572 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002573 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002574
Victor Stinner87af4f22011-11-21 23:03:47 +01002575 length = PyUnicode_GET_LENGTH(unicode);
2576 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002577 if (!copy)
2578 return NULL;
2579 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2580
Christian Heimesf051e432016-09-13 20:22:02 +02002581 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002582 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002583 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002584 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002585}
2586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587
Victor Stinnerbc603d12011-10-02 01:00:40 +02002588/* Widen Unicode objects to larger buffers. Don't write terminating null
2589 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002591static void*
2592unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002594 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002595
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002596 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002597 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002598 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002599 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002600 if (!result)
2601 return PyErr_NoMemory();
2602 assert(skind == PyUnicode_1BYTE_KIND);
2603 _PyUnicode_CONVERT_BYTES(
2604 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002605 (const Py_UCS1 *)data,
2606 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002607 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002609 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002610 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002611 if (!result)
2612 return PyErr_NoMemory();
2613 if (skind == PyUnicode_2BYTE_KIND) {
2614 _PyUnicode_CONVERT_BYTES(
2615 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002616 (const Py_UCS2 *)data,
2617 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002618 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002620 else {
2621 assert(skind == PyUnicode_1BYTE_KIND);
2622 _PyUnicode_CONVERT_BYTES(
2623 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002624 (const Py_UCS1 *)data,
2625 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002626 result);
2627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002629 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002630 Py_UNREACHABLE();
2631 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633}
2634
2635static Py_UCS4*
2636as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2637 int copy_null)
2638{
2639 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002640 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641 Py_ssize_t len, targetlen;
2642 if (PyUnicode_READY(string) == -1)
2643 return NULL;
2644 kind = PyUnicode_KIND(string);
2645 data = PyUnicode_DATA(string);
2646 len = PyUnicode_GET_LENGTH(string);
2647 targetlen = len;
2648 if (copy_null)
2649 targetlen++;
2650 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002651 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 if (!target) {
2653 PyErr_NoMemory();
2654 return NULL;
2655 }
2656 }
2657 else {
2658 if (targetsize < targetlen) {
2659 PyErr_Format(PyExc_SystemError,
2660 "string is longer than the buffer");
2661 if (copy_null && 0 < targetsize)
2662 target[0] = 0;
2663 return NULL;
2664 }
2665 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002666 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002667 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002668 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002670 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002671 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002672 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2673 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002674 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002675 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002676 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002677 else {
2678 Py_UNREACHABLE();
2679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 if (copy_null)
2681 target[len] = 0;
2682 return target;
2683}
2684
2685Py_UCS4*
2686PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2687 int copy_null)
2688{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002689 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 PyErr_BadInternalCall();
2691 return NULL;
2692 }
2693 return as_ucs4(string, target, targetsize, copy_null);
2694}
2695
2696Py_UCS4*
2697PyUnicode_AsUCS4Copy(PyObject *string)
2698{
2699 return as_ucs4(string, NULL, 0, 1);
2700}
2701
/* Size of the scratch buffers used to format %lld or %p output.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, and 53/22 is an upper bound for log10(256).  The constant 2
   leaves room for the sign and, presumably, sprintf's terminating NUL. */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG) * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002706
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002707static int
2708unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2709 Py_ssize_t width, Py_ssize_t precision)
2710{
2711 Py_ssize_t length, fill, arglen;
2712 Py_UCS4 maxchar;
2713
2714 if (PyUnicode_READY(str) == -1)
2715 return -1;
2716
2717 length = PyUnicode_GET_LENGTH(str);
2718 if ((precision == -1 || precision >= length)
2719 && width <= length)
2720 return _PyUnicodeWriter_WriteStr(writer, str);
2721
2722 if (precision != -1)
2723 length = Py_MIN(precision, length);
2724
2725 arglen = Py_MAX(length, width);
2726 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2727 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2728 else
2729 maxchar = writer->maxchar;
2730
2731 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2732 return -1;
2733
2734 if (width > length) {
2735 fill = width - length;
2736 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2737 return -1;
2738 writer->pos += fill;
2739 }
2740
2741 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2742 str, 0, length);
2743 writer->pos += length;
2744 return 0;
2745}
2746
2747static int
Victor Stinner998b8062018-09-12 00:23:25 +02002748unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002749 Py_ssize_t width, Py_ssize_t precision)
2750{
2751 /* UTF-8 */
2752 Py_ssize_t length;
2753 PyObject *unicode;
2754 int res;
2755
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002756 if (precision == -1) {
2757 length = strlen(str);
2758 }
2759 else {
2760 length = 0;
2761 while (length < precision && str[length]) {
2762 length++;
2763 }
2764 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002765 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2766 if (unicode == NULL)
2767 return -1;
2768
2769 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2770 Py_DECREF(unicode);
2771 return res;
2772}
2773
Victor Stinner96865452011-03-01 23:44:09 +00002774static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002775unicode_fromformat_arg(_PyUnicodeWriter *writer,
2776 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002777{
Victor Stinnere215d962012-10-06 23:03:36 +02002778 const char *p;
2779 Py_ssize_t len;
2780 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002781 Py_ssize_t width;
2782 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002783 int longflag;
2784 int longlongflag;
2785 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002786 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002787
2788 p = f;
2789 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002790 zeropad = 0;
2791 if (*f == '0') {
2792 zeropad = 1;
2793 f++;
2794 }
Victor Stinner96865452011-03-01 23:44:09 +00002795
2796 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002797 width = -1;
2798 if (Py_ISDIGIT((unsigned)*f)) {
2799 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002800 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002801 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002802 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002803 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002804 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002805 return NULL;
2806 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002807 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002808 f++;
2809 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002810 }
2811 precision = -1;
2812 if (*f == '.') {
2813 f++;
2814 if (Py_ISDIGIT((unsigned)*f)) {
2815 precision = (*f - '0');
2816 f++;
2817 while (Py_ISDIGIT((unsigned)*f)) {
2818 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2819 PyErr_SetString(PyExc_ValueError,
2820 "precision too big");
2821 return NULL;
2822 }
2823 precision = (precision * 10) + (*f - '0');
2824 f++;
2825 }
2826 }
Victor Stinner96865452011-03-01 23:44:09 +00002827 if (*f == '%') {
2828 /* "%.3%s" => f points to "3" */
2829 f--;
2830 }
2831 }
2832 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002833 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002834 f--;
2835 }
Victor Stinner96865452011-03-01 23:44:09 +00002836
2837 /* Handle %ld, %lu, %lld and %llu. */
2838 longflag = 0;
2839 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002840 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002841 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002842 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002843 longflag = 1;
2844 ++f;
2845 }
Victor Stinner96865452011-03-01 23:44:09 +00002846 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002847 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002848 longlongflag = 1;
2849 f += 2;
2850 }
Victor Stinner96865452011-03-01 23:44:09 +00002851 }
2852 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002853 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002854 size_tflag = 1;
2855 ++f;
2856 }
Victor Stinnere215d962012-10-06 23:03:36 +02002857
2858 if (f[1] == '\0')
2859 writer->overallocate = 0;
2860
2861 switch (*f) {
2862 case 'c':
2863 {
2864 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002865 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002866 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002867 "character argument not in range(0x110000)");
2868 return NULL;
2869 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002870 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002871 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002872 break;
2873 }
2874
2875 case 'i':
2876 case 'd':
2877 case 'u':
2878 case 'x':
2879 {
2880 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002881 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002882 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002883
2884 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002885 if (longflag) {
2886 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2887 }
2888 else if (longlongflag) {
2889 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2890 }
2891 else if (size_tflag) {
2892 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2893 }
2894 else {
2895 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2896 }
Victor Stinnere215d962012-10-06 23:03:36 +02002897 }
2898 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002899 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002900 }
2901 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002902 if (longflag) {
2903 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2904 }
2905 else if (longlongflag) {
2906 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2907 }
2908 else if (size_tflag) {
2909 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2910 }
2911 else {
2912 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2913 }
Victor Stinnere215d962012-10-06 23:03:36 +02002914 }
2915 assert(len >= 0);
2916
Victor Stinnere215d962012-10-06 23:03:36 +02002917 if (precision < len)
2918 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002919
2920 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002921 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2922 return NULL;
2923
Victor Stinnere215d962012-10-06 23:03:36 +02002924 if (width > precision) {
2925 Py_UCS4 fillchar;
2926 fill = width - precision;
2927 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002928 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2929 return NULL;
2930 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002931 }
Victor Stinner15a11362012-10-06 23:48:20 +02002932 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002933 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002934 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2935 return NULL;
2936 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002937 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002938
Victor Stinner4a587072013-11-19 12:54:53 +01002939 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2940 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002941 break;
2942 }
2943
2944 case 'p':
2945 {
2946 char number[MAX_LONG_LONG_CHARS];
2947
2948 len = sprintf(number, "%p", va_arg(*vargs, void*));
2949 assert(len >= 0);
2950
2951 /* %p is ill-defined: ensure leading 0x. */
2952 if (number[1] == 'X')
2953 number[1] = 'x';
2954 else if (number[1] != 'x') {
2955 memmove(number + 2, number,
2956 strlen(number) + 1);
2957 number[0] = '0';
2958 number[1] = 'x';
2959 len += 2;
2960 }
2961
Victor Stinner4a587072013-11-19 12:54:53 +01002962 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002963 return NULL;
2964 break;
2965 }
2966
2967 case 's':
2968 {
2969 /* UTF-8 */
2970 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002971 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002972 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002973 break;
2974 }
2975
2976 case 'U':
2977 {
2978 PyObject *obj = va_arg(*vargs, PyObject *);
2979 assert(obj && _PyUnicode_CHECK(obj));
2980
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002981 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002982 return NULL;
2983 break;
2984 }
2985
2986 case 'V':
2987 {
2988 PyObject *obj = va_arg(*vargs, PyObject *);
2989 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002990 if (obj) {
2991 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002992 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002993 return NULL;
2994 }
2995 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002996 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002997 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002998 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002999 }
3000 break;
3001 }
3002
3003 case 'S':
3004 {
3005 PyObject *obj = va_arg(*vargs, PyObject *);
3006 PyObject *str;
3007 assert(obj);
3008 str = PyObject_Str(obj);
3009 if (!str)
3010 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003011 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003012 Py_DECREF(str);
3013 return NULL;
3014 }
3015 Py_DECREF(str);
3016 break;
3017 }
3018
3019 case 'R':
3020 {
3021 PyObject *obj = va_arg(*vargs, PyObject *);
3022 PyObject *repr;
3023 assert(obj);
3024 repr = PyObject_Repr(obj);
3025 if (!repr)
3026 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003027 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003028 Py_DECREF(repr);
3029 return NULL;
3030 }
3031 Py_DECREF(repr);
3032 break;
3033 }
3034
3035 case 'A':
3036 {
3037 PyObject *obj = va_arg(*vargs, PyObject *);
3038 PyObject *ascii;
3039 assert(obj);
3040 ascii = PyObject_ASCII(obj);
3041 if (!ascii)
3042 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003043 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003044 Py_DECREF(ascii);
3045 return NULL;
3046 }
3047 Py_DECREF(ascii);
3048 break;
3049 }
3050
3051 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003052 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003053 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003054 break;
3055
3056 default:
3057 /* if we stumble upon an unknown formatting code, copy the rest
3058 of the format string to the output string. (we cannot just
3059 skip the code, since there's no way to know what's in the
3060 argument list) */
3061 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003062 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003063 return NULL;
3064 f = p+len;
3065 return f;
3066 }
3067
3068 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003069 return f;
3070}
3071
Walter Dörwaldd2034312007-05-18 16:29:38 +00003072PyObject *
3073PyUnicode_FromFormatV(const char *format, va_list vargs)
3074{
Victor Stinnere215d962012-10-06 23:03:36 +02003075 va_list vargs2;
3076 const char *f;
3077 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003078
Victor Stinner8f674cc2013-04-17 23:02:17 +02003079 _PyUnicodeWriter_Init(&writer);
3080 writer.min_length = strlen(format) + 100;
3081 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003082
Benjamin Peterson0c212142016-09-20 20:39:33 -07003083 // Copy varags to be able to pass a reference to a subfunction.
3084 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003085
3086 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003087 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003088 f = unicode_fromformat_arg(&writer, f, &vargs2);
3089 if (f == NULL)
3090 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003092 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003093 const char *p;
3094 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003095
Victor Stinnere215d962012-10-06 23:03:36 +02003096 p = f;
3097 do
3098 {
3099 if ((unsigned char)*p > 127) {
3100 PyErr_Format(PyExc_ValueError,
3101 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3102 "string, got a non-ASCII byte: 0x%02x",
3103 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003104 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003105 }
3106 p++;
3107 }
3108 while (*p != '\0' && *p != '%');
3109 len = p - f;
3110
3111 if (*p == '\0')
3112 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003113
3114 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003115 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003116
3117 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003118 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003119 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003120 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003121 return _PyUnicodeWriter_Finish(&writer);
3122
3123 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003124 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003125 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003126 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003127}
3128
Walter Dörwaldd2034312007-05-18 16:29:38 +00003129PyObject *
3130PyUnicode_FromFormat(const char *format, ...)
3131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003132 PyObject* ret;
3133 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003134
3135#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003136 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003137#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003138 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003139#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003140 ret = PyUnicode_FromFormatV(format, vargs);
3141 va_end(vargs);
3142 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003143}
3144
Serhiy Storchakac46db922018-10-23 22:58:24 +03003145static Py_ssize_t
3146unicode_get_widechar_size(PyObject *unicode)
3147{
3148 Py_ssize_t res;
3149
3150 assert(unicode != NULL);
3151 assert(_PyUnicode_CHECK(unicode));
3152
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003153#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchakac46db922018-10-23 22:58:24 +03003154 if (_PyUnicode_WSTR(unicode) != NULL) {
3155 return PyUnicode_WSTR_LENGTH(unicode);
3156 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003157#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003158 assert(PyUnicode_IS_READY(unicode));
3159
3160 res = _PyUnicode_LENGTH(unicode);
3161#if SIZEOF_WCHAR_T == 2
3162 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3163 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3164 const Py_UCS4 *end = s + res;
3165 for (; s < end; ++s) {
3166 if (*s > 0xFFFF) {
3167 ++res;
3168 }
3169 }
3170 }
3171#endif
3172 return res;
3173}
3174
3175static void
3176unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3177{
Serhiy Storchakac46db922018-10-23 22:58:24 +03003178 assert(unicode != NULL);
3179 assert(_PyUnicode_CHECK(unicode));
3180
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003181#if USE_UNICODE_WCHAR_CACHE
3182 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003183 if (wstr != NULL) {
3184 memcpy(w, wstr, size * sizeof(wchar_t));
3185 return;
3186 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003187#else /* USE_UNICODE_WCHAR_CACHE */
3188 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3189 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3190 return;
3191 }
3192#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003193 assert(PyUnicode_IS_READY(unicode));
3194
3195 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3196 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3197 for (; size--; ++s, ++w) {
3198 *w = *s;
3199 }
3200 }
3201 else {
3202#if SIZEOF_WCHAR_T == 4
3203 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3204 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3205 for (; size--; ++s, ++w) {
3206 *w = *s;
3207 }
3208#else
3209 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3210 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3211 for (; size--; ++s, ++w) {
3212 Py_UCS4 ch = *s;
3213 if (ch > 0xFFFF) {
3214 assert(ch <= MAX_UNICODE);
3215 /* encode surrogate pair in this case */
3216 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3217 if (!size--)
3218 break;
3219 *w = Py_UNICODE_LOW_SURROGATE(ch);
3220 }
3221 else {
3222 *w = ch;
3223 }
3224 }
3225#endif
3226 }
3227}
3228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003229#ifdef HAVE_WCHAR_H
3230
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003231/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003232
Victor Stinnerd88d9832011-09-06 02:00:05 +02003233 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003234 character) required to convert the unicode object. Ignore size argument.
3235
Victor Stinnerd88d9832011-09-06 02:00:05 +02003236 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003237 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003238 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003239Py_ssize_t
3240PyUnicode_AsWideChar(PyObject *unicode,
3241 wchar_t *w,
3242 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003243{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003244 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003245
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003246 if (unicode == NULL) {
3247 PyErr_BadInternalCall();
3248 return -1;
3249 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003250 if (!PyUnicode_Check(unicode)) {
3251 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003252 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003253 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003254
3255 res = unicode_get_widechar_size(unicode);
3256 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003257 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003258 }
3259
3260 if (size > res) {
3261 size = res + 1;
3262 }
3263 else {
3264 res = size;
3265 }
3266 unicode_copy_as_widechar(unicode, w, size);
3267 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003268}
3269
Victor Stinner137c34c2010-09-29 10:25:54 +00003270wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003271PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003272 Py_ssize_t *size)
3273{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003274 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003275 Py_ssize_t buflen;
3276
3277 if (unicode == NULL) {
3278 PyErr_BadInternalCall();
3279 return NULL;
3280 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003281 if (!PyUnicode_Check(unicode)) {
3282 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003283 return NULL;
3284 }
3285
Serhiy Storchakac46db922018-10-23 22:58:24 +03003286 buflen = unicode_get_widechar_size(unicode);
3287 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003288 if (buffer == NULL) {
3289 PyErr_NoMemory();
3290 return NULL;
3291 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003292 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3293 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003294 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003295 }
3296 else if (wcslen(buffer) != (size_t)buflen) {
3297 PyMem_FREE(buffer);
3298 PyErr_SetString(PyExc_ValueError,
3299 "embedded null character");
3300 return NULL;
3301 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003302 return buffer;
3303}
3304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003305#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003307int
3308_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3309{
3310 wchar_t **p = (wchar_t **)ptr;
3311 if (obj == NULL) {
3312#if !USE_UNICODE_WCHAR_CACHE
3313 PyMem_Free(*p);
3314#endif /* USE_UNICODE_WCHAR_CACHE */
3315 *p = NULL;
3316 return 1;
3317 }
3318 if (PyUnicode_Check(obj)) {
3319#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003320 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3321 if (*p == NULL) {
3322 return 0;
3323 }
3324 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003325#else /* USE_UNICODE_WCHAR_CACHE */
3326 *p = PyUnicode_AsWideCharString(obj, NULL);
3327 if (*p == NULL) {
3328 return 0;
3329 }
3330 return Py_CLEANUP_SUPPORTED;
3331#endif /* USE_UNICODE_WCHAR_CACHE */
3332 }
3333 PyErr_Format(PyExc_TypeError,
3334 "argument must be str, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003335 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003336 return 0;
3337}
3338
3339int
3340_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3341{
3342 wchar_t **p = (wchar_t **)ptr;
3343 if (obj == NULL) {
3344#if !USE_UNICODE_WCHAR_CACHE
3345 PyMem_Free(*p);
3346#endif /* USE_UNICODE_WCHAR_CACHE */
3347 *p = NULL;
3348 return 1;
3349 }
3350 if (obj == Py_None) {
3351 *p = NULL;
3352 return 1;
3353 }
3354 if (PyUnicode_Check(obj)) {
3355#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003356 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3357 if (*p == NULL) {
3358 return 0;
3359 }
3360 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003361#else /* USE_UNICODE_WCHAR_CACHE */
3362 *p = PyUnicode_AsWideCharString(obj, NULL);
3363 if (*p == NULL) {
3364 return 0;
3365 }
3366 return Py_CLEANUP_SUPPORTED;
3367#endif /* USE_UNICODE_WCHAR_CACHE */
3368 }
3369 PyErr_Format(PyExc_TypeError,
3370 "argument must be str or None, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003371 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003372 return 0;
3373}
3374
Alexander Belopolsky40018472011-02-26 01:02:56 +00003375PyObject *
3376PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003377{
Victor Stinner8faf8212011-12-08 22:14:11 +01003378 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003379 PyErr_SetString(PyExc_ValueError,
3380 "chr() arg not in range(0x110000)");
3381 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003382 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003383
Victor Stinner985a82a2014-01-03 12:53:47 +01003384 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003385}
3386
Alexander Belopolsky40018472011-02-26 01:02:56 +00003387PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003388PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003390 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003392 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003393 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003394 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 Py_INCREF(obj);
3396 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003397 }
3398 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003399 /* For a Unicode subtype that's not a Unicode object,
3400 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003401 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003402 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003403 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003404 "Can't convert '%.100s' object to str implicitly",
3405 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003406 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003407}
3408
Alexander Belopolsky40018472011-02-26 01:02:56 +00003409PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003410PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003411 const char *encoding,
3412 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003413{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003414 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003415 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003416
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003418 PyErr_BadInternalCall();
3419 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003421
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003422 /* Decoding bytes objects is the most common case and should be fast */
3423 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003424 if (PyBytes_GET_SIZE(obj) == 0) {
3425 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3426 return NULL;
3427 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003428 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003429 }
3430 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003431 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3432 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003433 }
3434
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003435 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003436 PyErr_SetString(PyExc_TypeError,
3437 "decoding str is not supported");
3438 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003440
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003441 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3442 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3443 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003444 "decoding to str: need a bytes-like object, %.80s found",
3445 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003446 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003447 }
Tim Petersced69f82003-09-16 20:30:58 +00003448
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003449 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003450 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003451 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3452 return NULL;
3453 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003454 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003456
Serhiy Storchaka05997252013-01-26 12:14:02 +02003457 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003458 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003459 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460}
3461
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) into `lower`.
   Every run of other characters is collapsed into a single '_' that is only
   emitted between two copied characters, so leading and trailing
   punctuation is dropped.  The result is always NUL-terminated on success.

   `lower_len` is the capacity of `lower` in bytes, including the NUL.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0 so that not even the terminator fits). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* With a zero-capacity buffer, lower_len - 1 would wrap around and the
       final NUL store would land outside the buffer: report failure. */
    if (lower_len == 0) {
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last writable slot: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit the pending separator, except at the very start. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            /* Remember that a separator is pending; runs collapse to one. */
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3510
Alexander Belopolsky40018472011-02-26 01:02:56 +00003511PyObject *
3512PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003513 Py_ssize_t size,
3514 const char *encoding,
3515 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003516{
3517 PyObject *buffer = NULL, *unicode;
3518 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003519 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3520
Victor Stinner22eb6892019-06-26 00:51:05 +02003521 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3522 return NULL;
3523 }
3524
Victor Stinnered076ed2019-06-26 01:49:32 +02003525 if (size == 0) {
3526 _Py_RETURN_UNICODE_EMPTY();
3527 }
3528
Victor Stinner942889a2016-09-05 15:40:10 -07003529 if (encoding == NULL) {
3530 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3531 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003532
Fred Drakee4315f52000-05-09 19:53:39 +00003533 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003534 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3535 char *lower = buflower;
3536
3537 /* Fast paths */
3538 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3539 lower += 3;
3540 if (*lower == '_') {
3541 /* Match "utf8" and "utf_8" */
3542 lower++;
3543 }
3544
3545 if (lower[0] == '8' && lower[1] == 0) {
3546 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3547 }
3548 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3549 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3550 }
3551 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3552 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3553 }
3554 }
3555 else {
3556 if (strcmp(lower, "ascii") == 0
3557 || strcmp(lower, "us_ascii") == 0) {
3558 return PyUnicode_DecodeASCII(s, size, errors);
3559 }
Steve Dowercc16be82016-09-08 10:35:16 -07003560 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003561 else if (strcmp(lower, "mbcs") == 0) {
3562 return PyUnicode_DecodeMBCS(s, size, errors);
3563 }
3564 #endif
3565 else if (strcmp(lower, "latin1") == 0
3566 || strcmp(lower, "latin_1") == 0
3567 || strcmp(lower, "iso_8859_1") == 0
3568 || strcmp(lower, "iso8859_1") == 0) {
3569 return PyUnicode_DecodeLatin1(s, size, errors);
3570 }
3571 }
Victor Stinner37296e82010-06-10 13:36:23 +00003572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573
3574 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003575 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003576 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003577 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003578 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003579 if (buffer == NULL)
3580 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003581 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582 if (unicode == NULL)
3583 goto onError;
3584 if (!PyUnicode_Check(unicode)) {
3585 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003586 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003587 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003588 encoding,
3589 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 Py_DECREF(unicode);
3591 goto onError;
3592 }
3593 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003594 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003595
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597 Py_XDECREF(buffer);
3598 return NULL;
3599}
3600
Alexander Belopolsky40018472011-02-26 01:02:56 +00003601PyObject *
3602PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003603 const char *encoding,
3604 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003605{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003606 if (!PyUnicode_Check(unicode)) {
3607 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003608 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003609 }
3610
Serhiy Storchaka00939072016-10-27 21:05:49 +03003611 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3612 "PyUnicode_AsDecodedObject() is deprecated; "
3613 "use PyCodec_Decode() to decode from str", 1) < 0)
3614 return NULL;
3615
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003616 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003618
3619 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003620 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003621}
3622
Alexander Belopolsky40018472011-02-26 01:02:56 +00003623PyObject *
3624PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003625 const char *encoding,
3626 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003627{
3628 PyObject *v;
3629
3630 if (!PyUnicode_Check(unicode)) {
3631 PyErr_BadArgument();
3632 goto onError;
3633 }
3634
Serhiy Storchaka00939072016-10-27 21:05:49 +03003635 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3636 "PyUnicode_AsDecodedUnicode() is deprecated; "
3637 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3638 return NULL;
3639
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003640 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003642
3643 /* Decode via the codec registry */
3644 v = PyCodec_Decode(unicode, encoding, errors);
3645 if (v == NULL)
3646 goto onError;
3647 if (!PyUnicode_Check(v)) {
3648 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003649 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003650 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003651 encoding,
3652 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003653 Py_DECREF(v);
3654 goto onError;
3655 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003656 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003657
Benjamin Peterson29060642009-01-31 22:14:21 +00003658 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003659 return NULL;
3660}
3661
Alexander Belopolsky40018472011-02-26 01:02:56 +00003662PyObject *
3663PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003664 Py_ssize_t size,
3665 const char *encoding,
3666 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667{
3668 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003669
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003670 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3674 Py_DECREF(unicode);
3675 return v;
3676}
3677
Alexander Belopolsky40018472011-02-26 01:02:56 +00003678PyObject *
3679PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003680 const char *encoding,
3681 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003682{
3683 PyObject *v;
3684
3685 if (!PyUnicode_Check(unicode)) {
3686 PyErr_BadArgument();
3687 goto onError;
3688 }
3689
Serhiy Storchaka00939072016-10-27 21:05:49 +03003690 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3691 "PyUnicode_AsEncodedObject() is deprecated; "
3692 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3693 "or PyCodec_Encode() for generic encoding", 1) < 0)
3694 return NULL;
3695
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003696 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003698
3699 /* Encode via the codec registry */
3700 v = PyCodec_Encode(unicode, encoding, errors);
3701 if (v == NULL)
3702 goto onError;
3703 return v;
3704
Benjamin Peterson29060642009-01-31 22:14:21 +00003705 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003706 return NULL;
3707}
3708
Victor Stinner1b579672011-12-17 05:47:23 +01003709
Victor Stinner2cba6b82018-01-10 22:46:15 +01003710static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003711unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003712 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003713{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003714 Py_ssize_t wlen;
3715 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3716 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003717 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003718 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003719
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003720 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003721 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003722 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003723 return NULL;
3724 }
3725
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003726 char *str;
3727 size_t error_pos;
3728 const char *reason;
3729 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003730 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003731 PyMem_Free(wstr);
3732
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003733 if (res != 0) {
3734 if (res == -2) {
3735 PyObject *exc;
3736 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3737 "locale", unicode,
3738 (Py_ssize_t)error_pos,
3739 (Py_ssize_t)(error_pos+1),
3740 reason);
3741 if (exc != NULL) {
3742 PyCodec_StrictErrors(exc);
3743 Py_DECREF(exc);
3744 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003745 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003746 else if (res == -3) {
3747 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3748 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003749 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003750 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003751 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003752 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003753 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003754
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003755 PyObject *bytes = PyBytes_FromString(str);
3756 PyMem_RawFree(str);
3757 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003758}
3759
Victor Stinnerad158722010-10-27 00:25:46 +00003760PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003761PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3762{
Victor Stinner709d23d2019-05-02 14:56:30 -04003763 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3764 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003765}
3766
3767PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003768PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003769{
Victor Stinner81a7be32020-04-14 15:14:01 +02003770 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003771 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3772 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003773 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003774 fs_codec->error_handler,
3775 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003776 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003777#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003778 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003779 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003780 fs_codec->encoding,
3781 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003782 }
Victor Stinnerad158722010-10-27 00:25:46 +00003783#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003784 else {
3785 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3786 machinery is not ready and so cannot be used:
3787 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003788 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3789 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003790 assert(filesystem_errors != NULL);
3791 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3792 assert(errors != _Py_ERROR_UNKNOWN);
3793#ifdef _Py_FORCE_UTF8_FS_ENCODING
3794 return unicode_encode_utf8(unicode, errors, NULL);
3795#else
3796 return unicode_encode_locale(unicode, errors, 0);
3797#endif
3798 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003799}
3800
Alexander Belopolsky40018472011-02-26 01:02:56 +00003801PyObject *
3802PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003803 const char *encoding,
3804 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805{
3806 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003807 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003808
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 if (!PyUnicode_Check(unicode)) {
3810 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003811 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812 }
Fred Drakee4315f52000-05-09 19:53:39 +00003813
Victor Stinner22eb6892019-06-26 00:51:05 +02003814 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3815 return NULL;
3816 }
3817
Victor Stinner942889a2016-09-05 15:40:10 -07003818 if (encoding == NULL) {
3819 return _PyUnicode_AsUTF8String(unicode, errors);
3820 }
3821
Fred Drakee4315f52000-05-09 19:53:39 +00003822 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003823 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3824 char *lower = buflower;
3825
3826 /* Fast paths */
3827 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3828 lower += 3;
3829 if (*lower == '_') {
3830 /* Match "utf8" and "utf_8" */
3831 lower++;
3832 }
3833
3834 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003836 }
3837 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3838 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3839 }
3840 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3841 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3842 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003843 }
Victor Stinner942889a2016-09-05 15:40:10 -07003844 else {
3845 if (strcmp(lower, "ascii") == 0
3846 || strcmp(lower, "us_ascii") == 0) {
3847 return _PyUnicode_AsASCIIString(unicode, errors);
3848 }
Steve Dowercc16be82016-09-08 10:35:16 -07003849#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003850 else if (strcmp(lower, "mbcs") == 0) {
3851 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3852 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003853#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003854 else if (strcmp(lower, "latin1") == 0 ||
3855 strcmp(lower, "latin_1") == 0 ||
3856 strcmp(lower, "iso_8859_1") == 0 ||
3857 strcmp(lower, "iso8859_1") == 0) {
3858 return _PyUnicode_AsLatin1String(unicode, errors);
3859 }
3860 }
Victor Stinner37296e82010-06-10 13:36:23 +00003861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003862
3863 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003864 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003866 return NULL;
3867
3868 /* The normal path */
3869 if (PyBytes_Check(v))
3870 return v;
3871
3872 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003873 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003874 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003875 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003876
3877 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003878 "encoder %s returned bytearray instead of bytes; "
3879 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003880 encoding);
3881 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003882 Py_DECREF(v);
3883 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003884 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003885
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003886 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3887 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003888 Py_DECREF(v);
3889 return b;
3890 }
3891
3892 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003893 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003894 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003895 encoding,
3896 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003897 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003898 return NULL;
3899}
3900
Alexander Belopolsky40018472011-02-26 01:02:56 +00003901PyObject *
3902PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003903 const char *encoding,
3904 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003905{
3906 PyObject *v;
3907
3908 if (!PyUnicode_Check(unicode)) {
3909 PyErr_BadArgument();
3910 goto onError;
3911 }
3912
Serhiy Storchaka00939072016-10-27 21:05:49 +03003913 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3914 "PyUnicode_AsEncodedUnicode() is deprecated; "
3915 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3916 return NULL;
3917
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003918 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003919 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003920
3921 /* Encode via the codec registry */
3922 v = PyCodec_Encode(unicode, encoding, errors);
3923 if (v == NULL)
3924 goto onError;
3925 if (!PyUnicode_Check(v)) {
3926 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003927 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003928 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003929 encoding,
3930 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003931 Py_DECREF(v);
3932 goto onError;
3933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003935
Benjamin Peterson29060642009-01-31 22:14:21 +00003936 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937 return NULL;
3938}
3939
Victor Stinner2cba6b82018-01-10 22:46:15 +01003940static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003941unicode_decode_locale(const char *str, Py_ssize_t len,
3942 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003943{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003944 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3945 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003946 return NULL;
3947 }
3948
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003949 wchar_t *wstr;
3950 size_t wlen;
3951 const char *reason;
3952 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003953 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003954 if (res != 0) {
3955 if (res == -2) {
3956 PyObject *exc;
3957 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3958 "locale", str, len,
3959 (Py_ssize_t)wlen,
3960 (Py_ssize_t)(wlen + 1),
3961 reason);
3962 if (exc != NULL) {
3963 PyCodec_StrictErrors(exc);
3964 Py_DECREF(exc);
3965 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003966 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003967 else if (res == -3) {
3968 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3969 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003970 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003971 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003972 }
Victor Stinner2f197072011-12-17 07:08:30 +01003973 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003974 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003975
3976 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3977 PyMem_RawFree(wstr);
3978 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003979}
3980
3981PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003982PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3983 const char *errors)
3984{
Victor Stinner709d23d2019-05-02 14:56:30 -04003985 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3986 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003987}
3988
3989PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003990PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003991{
3992 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003993 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3994 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003995}
3996
3997
3998PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003999PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004000 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00004001 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4002}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004003
Christian Heimes5894ba72007-11-04 11:43:14 +00004004PyObject*
4005PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4006{
Victor Stinner81a7be32020-04-14 15:14:01 +02004007 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004008 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4009 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004010 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004011 fs_codec->error_handler,
4012 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004013 NULL);
4014 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004015#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004016 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004017 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004018 fs_codec->encoding,
4019 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004020 }
Victor Stinnerad158722010-10-27 00:25:46 +00004021#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004022 else {
4023 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4024 machinery is not ready and so cannot be used:
4025 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004026 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4027 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004028 assert(filesystem_errors != NULL);
4029 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4030 assert(errors != _Py_ERROR_UNKNOWN);
4031#ifdef _Py_FORCE_UTF8_FS_ENCODING
4032 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4033#else
4034 return unicode_decode_locale(s, size, errors, 0);
4035#endif
4036 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004037}
4038
Martin v. Löwis011e8422009-05-05 04:43:17 +00004039
4040int
4041PyUnicode_FSConverter(PyObject* arg, void* addr)
4042{
Brett Cannonec6ce872016-09-06 15:50:29 -07004043 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004044 PyObject *output = NULL;
4045 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004046 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004047 if (arg == NULL) {
4048 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004049 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004050 return 1;
4051 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004052 path = PyOS_FSPath(arg);
4053 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004054 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004055 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004056 if (PyBytes_Check(path)) {
4057 output = path;
4058 }
4059 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4060 output = PyUnicode_EncodeFSDefault(path);
4061 Py_DECREF(path);
4062 if (!output) {
4063 return 0;
4064 }
4065 assert(PyBytes_Check(output));
4066 }
4067
Victor Stinner0ea2a462010-04-30 00:22:08 +00004068 size = PyBytes_GET_SIZE(output);
4069 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004070 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004071 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004072 Py_DECREF(output);
4073 return 0;
4074 }
4075 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004076 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004077}
4078
4079
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004080int
4081PyUnicode_FSDecoder(PyObject* arg, void* addr)
4082{
Brett Cannona5711202016-09-06 19:36:01 -07004083 int is_buffer = 0;
4084 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004085 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004086 if (arg == NULL) {
4087 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004088 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004089 return 1;
4090 }
Brett Cannona5711202016-09-06 19:36:01 -07004091
4092 is_buffer = PyObject_CheckBuffer(arg);
4093 if (!is_buffer) {
4094 path = PyOS_FSPath(arg);
4095 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004096 return 0;
4097 }
Brett Cannona5711202016-09-06 19:36:01 -07004098 }
4099 else {
4100 path = arg;
4101 Py_INCREF(arg);
4102 }
4103
4104 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004105 output = path;
4106 }
4107 else if (PyBytes_Check(path) || is_buffer) {
4108 PyObject *path_bytes = NULL;
4109
4110 if (!PyBytes_Check(path) &&
4111 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004112 "path should be string, bytes, or os.PathLike, not %.200s",
4113 Py_TYPE(arg)->tp_name)) {
4114 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004115 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004116 }
4117 path_bytes = PyBytes_FromObject(path);
4118 Py_DECREF(path);
4119 if (!path_bytes) {
4120 return 0;
4121 }
4122 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4123 PyBytes_GET_SIZE(path_bytes));
4124 Py_DECREF(path_bytes);
4125 if (!output) {
4126 return 0;
4127 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004128 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004129 else {
4130 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004131 "path should be string, bytes, or os.PathLike, not %.200s",
4132 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004133 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004134 return 0;
4135 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004136 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004137 Py_DECREF(output);
4138 return 0;
4139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004140 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004141 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004142 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004143 Py_DECREF(output);
4144 return 0;
4145 }
4146 *(PyObject**)addr = output;
4147 return Py_CLEANUP_SUPPORTED;
4148}
4149
4150
Inada Naoki02a4d572020-02-27 13:48:59 +09004151static int unicode_fill_utf8(PyObject *unicode);
4152
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004153const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004154PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004155{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004156 if (!PyUnicode_Check(unicode)) {
4157 PyErr_BadArgument();
4158 return NULL;
4159 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004160 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004161 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004162
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004163 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004164 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004165 return NULL;
4166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167 }
4168
4169 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004170 *psize = PyUnicode_UTF8_LENGTH(unicode);
4171 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004172}
4173
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004174const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004175PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004176{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004177 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4178}
4179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004180Py_UNICODE *
4181PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4182{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004183 if (!PyUnicode_Check(unicode)) {
4184 PyErr_BadArgument();
4185 return NULL;
4186 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004187 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4188 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004189 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004190 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004191 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004192
Serhiy Storchakac46db922018-10-23 22:58:24 +03004193 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4194 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4195 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004196 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004197 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004198 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4199 if (w == NULL) {
4200 PyErr_NoMemory();
4201 return NULL;
4202 }
4203 unicode_copy_as_widechar(unicode, w, wlen + 1);
4204 _PyUnicode_WSTR(unicode) = w;
4205 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4206 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004207 }
4208 }
4209 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004210 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004211 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004212}
4213
Inada Naoki2c4928d2020-06-17 20:09:44 +09004214/* Deprecated APIs */
4215
4216_Py_COMP_DIAG_PUSH
4217_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4218
Alexander Belopolsky40018472011-02-26 01:02:56 +00004219Py_UNICODE *
4220PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004221{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004222 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004223}
4224
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004225const Py_UNICODE *
4226_PyUnicode_AsUnicode(PyObject *unicode)
4227{
4228 Py_ssize_t size;
4229 const Py_UNICODE *wstr;
4230
4231 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4232 if (wstr && wcslen(wstr) != (size_t)size) {
4233 PyErr_SetString(PyExc_ValueError, "embedded null character");
4234 return NULL;
4235 }
4236 return wstr;
4237}
4238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004239
Alexander Belopolsky40018472011-02-26 01:02:56 +00004240Py_ssize_t
4241PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004242{
4243 if (!PyUnicode_Check(unicode)) {
4244 PyErr_BadArgument();
4245 goto onError;
4246 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004247 if (_PyUnicode_WSTR(unicode) == NULL) {
4248 if (PyUnicode_AsUnicode(unicode) == NULL)
4249 goto onError;
4250 }
4251 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004252
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 return -1;
4255}
4256
Inada Naoki2c4928d2020-06-17 20:09:44 +09004257_Py_COMP_DIAG_POP
4258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004259Py_ssize_t
4260PyUnicode_GetLength(PyObject *unicode)
4261{
Victor Stinner07621332012-06-16 04:53:46 +02004262 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263 PyErr_BadArgument();
4264 return -1;
4265 }
Victor Stinner07621332012-06-16 04:53:46 +02004266 if (PyUnicode_READY(unicode) == -1)
4267 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 return PyUnicode_GET_LENGTH(unicode);
4269}
4270
4271Py_UCS4
4272PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4273{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004274 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004275 int kind;
4276
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004277 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004278 PyErr_BadArgument();
4279 return (Py_UCS4)-1;
4280 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004281 if (PyUnicode_READY(unicode) == -1) {
4282 return (Py_UCS4)-1;
4283 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004284 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004285 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004286 return (Py_UCS4)-1;
4287 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004288 data = PyUnicode_DATA(unicode);
4289 kind = PyUnicode_KIND(unicode);
4290 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004291}
4292
4293int
4294PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4295{
4296 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004297 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004298 return -1;
4299 }
Victor Stinner488fa492011-12-12 00:01:39 +01004300 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004301 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004302 PyErr_SetString(PyExc_IndexError, "string index out of range");
4303 return -1;
4304 }
Victor Stinner488fa492011-12-12 00:01:39 +01004305 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004306 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004307 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4308 PyErr_SetString(PyExc_ValueError, "character out of range");
4309 return -1;
4310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004311 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4312 index, ch);
4313 return 0;
4314}
4315
/* Return the name of the default encoding, which is always the constant
   string "utf-8".  The returned pointer refers to a string literal. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    return "utf-8";
}
4321
Victor Stinner554f3f02010-06-16 23:33:54 +00004322/* create or adjust a UnicodeDecodeError */
4323static void
4324make_decode_exception(PyObject **exceptionObject,
4325 const char *encoding,
4326 const char *input, Py_ssize_t length,
4327 Py_ssize_t startpos, Py_ssize_t endpos,
4328 const char *reason)
4329{
4330 if (*exceptionObject == NULL) {
4331 *exceptionObject = PyUnicodeDecodeError_Create(
4332 encoding, input, length, startpos, endpos, reason);
4333 }
4334 else {
4335 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4336 goto onError;
4337 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4338 goto onError;
4339 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4340 goto onError;
4341 }
4342 return;
4343
4344onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004345 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004346}
4347
#ifdef MS_WINDOWS
/* Make sure the wchar_t buffer *buf can hold `newsize` elements.

   The buffer is reallocated only when `newsize` is larger than the current
   *size; *size is set to `newsize` in every successful case (so a smaller
   `newsize` shrinks the recorded size without reallocating).

   Returns 0 on success, -1 with MemoryError set if the resize fails; in
   that case *buf and *size are left unchanged. */
static int
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
{
    if (newsize > *size) {
        /* Resize through a temporary so *buf is untouched on failure. */
        wchar_t *newbuf = *buf;
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        *buf = newbuf;
    }
    *size = newsize;
    return 0;
}

/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   On success the replacement string has been copied into *buf at *outpos,
   *outpos has been advanced past it, *endinpos holds the position returned
   by the handler and *inptr points at that position in the (possibly
   replaced) input.  *input and *inend are refreshed from the exception's
   object because the handler may have substituted another bytes object.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    Py_ssize_t repwlen;

    /* Look the handler up once and cache it in *errorHandler. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "Un;" format prefix, leaving only the
           human-readable message. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

#if USE_UNICODE_WCHAR_CACHE
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
    repwlen = PyUnicode_GetSize(repunicode);
    if (repwlen < 0)
        goto onError;
_Py_COMP_DIAG_POP
#else /* USE_UNICODE_WCHAR_CACHE */
    /* With a NULL buffer the call returns the required size including the
       trailing null, hence the decrement. */
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
    if (repwlen < 0)
        goto onError;
    repwlen--;
#endif /* USE_UNICODE_WCHAR_CACHE */
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    /* Do not ignore a failed copy: otherwise *outpos would be advanced
       over wchar_t elements that were never written. */
    if (PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen) < 0)
        goto onError;
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004483
4484static int
4485unicode_decode_call_errorhandler_writer(
4486 const char *errors, PyObject **errorHandler,
4487 const char *encoding, const char *reason,
4488 const char **input, const char **inend, Py_ssize_t *startinpos,
4489 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4490 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4491{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004492 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004493
4494 PyObject *restuple = NULL;
4495 PyObject *repunicode = NULL;
4496 Py_ssize_t insize;
4497 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004498 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004499 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004500 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004501 int need_to_grow = 0;
4502 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004503
4504 if (*errorHandler == NULL) {
4505 *errorHandler = PyCodec_LookupError(errors);
4506 if (*errorHandler == NULL)
4507 goto onError;
4508 }
4509
4510 make_decode_exception(exceptionObject,
4511 encoding,
4512 *input, *inend - *input,
4513 *startinpos, *endinpos,
4514 reason);
4515 if (*exceptionObject == NULL)
4516 goto onError;
4517
Petr Viktorinffd97532020-02-11 17:46:57 +01004518 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004519 if (restuple == NULL)
4520 goto onError;
4521 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004522 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004523 goto onError;
4524 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004525 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004526 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004527
4528 /* Copy back the bytes variables, which might have been modified by the
4529 callback */
4530 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4531 if (!inputobj)
4532 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004533 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004534 *input = PyBytes_AS_STRING(inputobj);
4535 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004536 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004537 /* we can DECREF safely, as the exception has another reference,
4538 so the object won't go away. */
4539 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004540
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004543 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004544 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547
Victor Stinner170ca6f2013-04-18 00:25:28 +02004548 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004549 if (replen > 1) {
4550 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004551 need_to_grow = 1;
4552 }
4553 new_inptr = *input + newpos;
4554 if (*inend - new_inptr > remain) {
4555 /* We don't know the decoding algorithm here so we make the worst
4556 assumption that one byte decodes to one unicode character.
4557 If unfortunately one byte could decode to more unicode characters,
4558 the decoder may write out-of-bound then. Is it possible for the
4559 algorithms using this function? */
4560 writer->min_length += *inend - new_inptr - remain;
4561 need_to_grow = 1;
4562 }
4563 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004564 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004565 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004566 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4567 goto onError;
4568 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004569 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004570 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004573 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004574
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004575 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004576 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004577 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004581 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582}
4583
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
   Note that each macro evaluates its argument more than once, so the
   argument must not have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4618
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * so it is declared const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is never indexed
 * outside 1..127.  The argument is evaluated more than once and must not
 * have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004664
Alexander Belopolsky40018472011-02-26 01:02:56 +00004665PyObject *
4666PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004667 Py_ssize_t size,
4668 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004669{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004670 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4671}
4672
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate). */
4679
Alexander Belopolsky40018472011-02-26 01:02:56 +00004680PyObject *
4681PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004682 Py_ssize_t size,
4683 const char *errors,
4684 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004685{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004686 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004687 Py_ssize_t startinpos;
4688 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004689 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004690 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004691 const char *errmsg = "";
4692 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004693 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004694 unsigned int base64bits = 0;
4695 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004696 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 PyObject *errorHandler = NULL;
4698 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004699
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004700 if (size == 0) {
4701 if (consumed)
4702 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004703 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004704 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004705
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004706 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004707 _PyUnicodeWriter_Init(&writer);
4708 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004709
4710 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711 e = s + size;
4712
4713 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004714 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004715 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004716 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004717
Antoine Pitrou244651a2009-05-04 18:56:13 +00004718 if (inShift) { /* in a base-64 section */
4719 if (IS_BASE64(ch)) { /* consume a base-64 character */
4720 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4721 base64bits += 6;
4722 s++;
4723 if (base64bits >= 16) {
4724 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004725 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 base64bits -= 16;
4727 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004728 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004729 if (surrogate) {
4730 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004731 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4732 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004733 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004734 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004735 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004736 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004737 }
4738 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004739 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004740 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004741 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004742 }
4743 }
Victor Stinner551ac952011-11-29 22:58:13 +01004744 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004745 /* first surrogate */
4746 surrogate = outCh;
4747 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004748 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004749 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004750 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004751 }
4752 }
4753 }
4754 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004755 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004756 if (base64bits > 0) { /* left-over bits */
4757 if (base64bits >= 6) {
4758 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004759 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004760 errmsg = "partial character in shift sequence";
4761 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004762 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004763 else {
4764 /* Some bits remain; they should be zero */
4765 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004766 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004767 errmsg = "non-zero padding bits in shift sequence";
4768 goto utf7Error;
4769 }
4770 }
4771 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004772 if (surrogate && DECODE_DIRECT(ch)) {
4773 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4774 goto onError;
4775 }
4776 surrogate = 0;
4777 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004778 /* '-' is absorbed; other terminating
4779 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004780 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004781 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004782 }
4783 }
4784 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004785 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004786 s++; /* consume '+' */
4787 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004788 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004789 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004790 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004792 else if (s < e && !IS_BASE64(*s)) {
4793 s++;
4794 errmsg = "ill-formed sequence";
4795 goto utf7Error;
4796 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004797 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004798 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004799 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004800 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004802 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004803 }
4804 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004805 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004806 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004807 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004808 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004809 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004810 else {
4811 startinpos = s-starts;
4812 s++;
4813 errmsg = "unexpected special character";
4814 goto utf7Error;
4815 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004816 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004817utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004819 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004820 errors, &errorHandler,
4821 "utf7", errmsg,
4822 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004823 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004824 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004825 }
4826
Antoine Pitrou244651a2009-05-04 18:56:13 +00004827 /* end of string */
4828
4829 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4830 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004831 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004832 if (surrogate ||
4833 (base64bits >= 6) ||
4834 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004835 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004836 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004837 errors, &errorHandler,
4838 "utf7", "unterminated shift sequence",
4839 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004840 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004841 goto onError;
4842 if (s < e)
4843 goto restart;
4844 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004845 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004846
4847 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004848 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004849 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004850 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004851 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004852 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004853 writer.kind, writer.data, shiftOutStart);
4854 Py_XDECREF(errorHandler);
4855 Py_XDECREF(exc);
4856 _PyUnicodeWriter_Dealloc(&writer);
4857 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004858 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004859 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004860 }
4861 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004862 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004863 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004864 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004865
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004866 Py_XDECREF(errorHandler);
4867 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004868 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004869
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 Py_XDECREF(errorHandler);
4872 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004873 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004874 return NULL;
4875}
4876
4877
Alexander Belopolsky40018472011-02-26 01:02:56 +00004878PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004879_PyUnicode_EncodeUTF7(PyObject *str,
4880 int base64SetO,
4881 int base64WhiteSpace,
4882 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004883{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004884 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004885 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004886 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004887 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004888 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004889 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004890 unsigned int base64bits = 0;
4891 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004892 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004893 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004894
Benjamin Petersonbac79492012-01-14 13:34:47 -05004895 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004896 return NULL;
4897 kind = PyUnicode_KIND(str);
4898 data = PyUnicode_DATA(str);
4899 len = PyUnicode_GET_LENGTH(str);
4900
4901 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004902 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004903
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004904 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004905 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004906 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004907 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004908 if (v == NULL)
4909 return NULL;
4910
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004911 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004912 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004913 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004914
Antoine Pitrou244651a2009-05-04 18:56:13 +00004915 if (inShift) {
4916 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4917 /* shifting out */
4918 if (base64bits) { /* output remaining bits */
4919 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4920 base64buffer = 0;
4921 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004922 }
4923 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004924 /* Characters not in the BASE64 set implicitly unshift the sequence
4925 so no '-' is required, except if the character is itself a '-' */
4926 if (IS_BASE64(ch) || ch == '-') {
4927 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004928 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004929 *out++ = (char) ch;
4930 }
4931 else {
4932 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004933 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004934 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004935 else { /* not in a shift sequence */
4936 if (ch == '+') {
4937 *out++ = '+';
4938 *out++ = '-';
4939 }
4940 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4941 *out++ = (char) ch;
4942 }
4943 else {
4944 *out++ = '+';
4945 inShift = 1;
4946 goto encode_char;
4947 }
4948 }
4949 continue;
4950encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004951 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004952 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004953
Antoine Pitrou244651a2009-05-04 18:56:13 +00004954 /* code first surrogate */
4955 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004956 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004957 while (base64bits >= 6) {
4958 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4959 base64bits -= 6;
4960 }
4961 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004962 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004963 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004964 base64bits += 16;
4965 base64buffer = (base64buffer << 16) | ch;
4966 while (base64bits >= 6) {
4967 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4968 base64bits -= 6;
4969 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004970 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004971 if (base64bits)
4972 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4973 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004974 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004975 if (_PyBytes_Resize(&v, out - start) < 0)
4976 return NULL;
4977 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004978}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004979PyObject *
4980PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4981 Py_ssize_t size,
4982 int base64SetO,
4983 int base64WhiteSpace,
4984 const char *errors)
4985{
4986 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004987 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004988 if (tmp == NULL)
4989 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004990 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004991 base64WhiteSpace, errors);
4992 Py_DECREF(tmp);
4993 return result;
4994}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004995
Antoine Pitrou244651a2009-05-04 18:56:13 +00004996#undef IS_BASE64
4997#undef FROM_BASE64
4998#undef TO_BASE64
4999#undef DECODE_DIRECT
5000#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005001
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002/* --- UTF-8 Codec -------------------------------------------------------- */
5003
Alexander Belopolsky40018472011-02-26 01:02:56 +00005004PyObject *
5005PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005006 Py_ssize_t size,
5007 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008{
Walter Dörwald69652032004-09-07 20:24:22 +00005009 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5010}
5011
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012#include "stringlib/asciilib.h"
5013#include "stringlib/codecs.h"
5014#include "stringlib/undef.h"
5015
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005016#include "stringlib/ucs1lib.h"
5017#include "stringlib/codecs.h"
5018#include "stringlib/undef.h"
5019
5020#include "stringlib/ucs2lib.h"
5021#include "stringlib/codecs.h"
5022#include "stringlib/undef.h"
5023
5024#include "stringlib/ucs4lib.h"
5025#include "stringlib/codecs.h"
5026#include "stringlib/undef.h"
5027
Antoine Pitrouab868312009-01-10 15:40:25 +00005028/* Mask to quickly check whether a C 'long' contains a
5029 non-ASCII, UTF8-encoded char. */
5030#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02005031# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00005032#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02005033# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00005034#else
5035# error C 'long' size should be either 4 or 8!
5036#endif
5037
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005038static Py_ssize_t
5039ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005040{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005041 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005042 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005043
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005044 /*
5045 * Issue #17237: m68k is a bit different from most architectures in
5046 * that objects do not use "natural alignment" - for example, int and
5047 * long are only aligned at 2-byte boundaries. Therefore the assert()
5048 * won't work; also, tests have shown that skipping the "optimised
5049 * version" will even speed up m68k.
5050 */
5051#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005052#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005053 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
5054 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005055 /* Fast path, see in STRINGLIB(utf8_decode) for
5056 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005057 /* Help allocation */
5058 const char *_p = p;
5059 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005060 while (_p < aligned_end) {
5061 unsigned long value = *(const unsigned long *) _p;
5062 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005064 *((unsigned long *)q) = value;
5065 _p += SIZEOF_LONG;
5066 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005067 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005068 p = _p;
5069 while (p < end) {
5070 if ((unsigned char)*p & 0x80)
5071 break;
5072 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005074 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005077#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005078 while (p < end) {
5079 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5080 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005081 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005082 /* Help allocation */
5083 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06005085 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005086 if (value & ASCII_CHAR_MASK)
5087 break;
5088 _p += SIZEOF_LONG;
5089 }
5090 p = _p;
5091 if (_p == end)
5092 break;
5093 }
5094 if ((unsigned char)*p & 0x80)
5095 break;
5096 ++p;
5097 }
5098 memcpy(dest, start, p - start);
5099 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100}
Antoine Pitrouab868312009-01-10 15:40:25 +00005101
Victor Stinner709d23d2019-05-02 14:56:30 -04005102static PyObject *
5103unicode_decode_utf8(const char *s, Py_ssize_t size,
5104 _Py_error_handler error_handler, const char *errors,
5105 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005106{
Victor Stinner785938e2011-12-11 20:09:03 +01005107 if (size == 0) {
5108 if (consumed)
5109 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005110 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005111 }
5112
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005113 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5114 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005115 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005116 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005117 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005118 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005119 }
5120
Inada Naoki770847a2019-06-24 12:30:24 +09005121 const char *starts = s;
5122 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005123
Inada Naoki770847a2019-06-24 12:30:24 +09005124 // fast path: try ASCII string.
5125 PyObject *u = PyUnicode_New(size, 127);
5126 if (u == NULL) {
5127 return NULL;
5128 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005129 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005130 if (s == end) {
5131 return u;
5132 }
5133
5134 // Use _PyUnicodeWriter after fast path is failed.
5135 _PyUnicodeWriter writer;
5136 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5137 writer.pos = s - starts;
5138
5139 Py_ssize_t startinpos, endinpos;
5140 const char *errmsg = "";
5141 PyObject *error_handler_obj = NULL;
5142 PyObject *exc = NULL;
5143
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005144 while (s < end) {
5145 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005146 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005147
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005148 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005149 if (PyUnicode_IS_ASCII(writer.buffer))
5150 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005151 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005152 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005153 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005154 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005155 } else {
5156 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005157 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005158 }
5159
5160 switch (ch) {
5161 case 0:
5162 if (s == end || consumed)
5163 goto End;
5164 errmsg = "unexpected end of data";
5165 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005166 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005167 break;
5168 case 1:
5169 errmsg = "invalid start byte";
5170 startinpos = s - starts;
5171 endinpos = startinpos + 1;
5172 break;
5173 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005174 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5175 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5176 {
5177 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005178 goto End;
5179 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005180 /* fall through */
5181 case 3:
5182 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005183 errmsg = "invalid continuation byte";
5184 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005185 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005186 break;
5187 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005188 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005189 goto onError;
5190 continue;
5191 }
5192
Victor Stinner1d65d912015-10-05 13:43:50 +02005193 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005194 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005195
5196 switch (error_handler) {
5197 case _Py_ERROR_IGNORE:
5198 s += (endinpos - startinpos);
5199 break;
5200
5201 case _Py_ERROR_REPLACE:
5202 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5203 goto onError;
5204 s += (endinpos - startinpos);
5205 break;
5206
5207 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005208 {
5209 Py_ssize_t i;
5210
Victor Stinner1d65d912015-10-05 13:43:50 +02005211 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5212 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005213 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005214 ch = (Py_UCS4)(unsigned char)(starts[i]);
5215 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5216 ch + 0xdc00);
5217 writer.pos++;
5218 }
5219 s += (endinpos - startinpos);
5220 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005221 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005222
5223 default:
5224 if (unicode_decode_call_errorhandler_writer(
5225 errors, &error_handler_obj,
5226 "utf-8", errmsg,
5227 &starts, &end, &startinpos, &endinpos, &exc, &s,
5228 &writer))
5229 goto onError;
5230 }
Victor Stinner785938e2011-12-11 20:09:03 +01005231 }
5232
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005233End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005234 if (consumed)
5235 *consumed = s - starts;
5236
Victor Stinner1d65d912015-10-05 13:43:50 +02005237 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005238 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005239 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005240
5241onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005242 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005243 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005244 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005245 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005246}
5247
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005248
Victor Stinner709d23d2019-05-02 14:56:30 -04005249PyObject *
5250PyUnicode_DecodeUTF8Stateful(const char *s,
5251 Py_ssize_t size,
5252 const char *errors,
5253 Py_ssize_t *consumed)
5254{
5255 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5256}
5257
5258
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005259/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5260 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005261
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005262 On success, write a pointer to a newly allocated wide character string into
5263 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5264 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005265
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005266 On memory allocation failure, return -1.
5267
5268 On decoding error (if surrogateescape is zero), return -2. If wlen is
5269 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5270 is not NULL, write the decoding error message into *reason. */
5271int
5272_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005273 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005274{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005275 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005276 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005277 wchar_t *unicode;
5278 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005279
Victor Stinner3d4226a2018-08-29 22:21:32 +02005280 int surrogateescape = 0;
5281 int surrogatepass = 0;
5282 switch (errors)
5283 {
5284 case _Py_ERROR_STRICT:
5285 break;
5286 case _Py_ERROR_SURROGATEESCAPE:
5287 surrogateescape = 1;
5288 break;
5289 case _Py_ERROR_SURROGATEPASS:
5290 surrogatepass = 1;
5291 break;
5292 default:
5293 return -3;
5294 }
5295
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005296 /* Note: size will always be longer than the resulting Unicode
5297 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005298 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005299 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005300 }
5301
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005302 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005303 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005304 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005305 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005306
5307 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005308 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005309 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005310 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005311 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005312#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005313 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005314#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005315 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005316#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005317 if (ch > 0xFF) {
5318#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005319 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005320#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005321 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005322 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005323 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5324 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5325#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005326 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005327 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005328 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005329 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005330 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005331
5332 if (surrogateescape) {
5333 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5334 }
5335 else {
5336 /* Is it a valid three-byte code? */
5337 if (surrogatepass
5338 && (e - s) >= 3
5339 && (s[0] & 0xf0) == 0xe0
5340 && (s[1] & 0xc0) == 0x80
5341 && (s[2] & 0xc0) == 0x80)
5342 {
5343 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5344 s += 3;
5345 unicode[outpos++] = ch;
5346 }
5347 else {
5348 PyMem_RawFree(unicode );
5349 if (reason != NULL) {
5350 switch (ch) {
5351 case 0:
5352 *reason = "unexpected end of data";
5353 break;
5354 case 1:
5355 *reason = "invalid start byte";
5356 break;
5357 /* 2, 3, 4 */
5358 default:
5359 *reason = "invalid continuation byte";
5360 break;
5361 }
5362 }
5363 if (wlen != NULL) {
5364 *wlen = s - orig_s;
5365 }
5366 return -2;
5367 }
5368 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005369 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005370 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005371 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005372 if (wlen) {
5373 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005374 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005375 *wstr = unicode;
5376 return 0;
5377}
5378
Victor Stinner5f9cf232019-03-19 01:46:25 +01005379
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005380wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005381_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5382 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005383{
5384 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005385 int res = _Py_DecodeUTF8Ex(arg, arglen,
5386 &wstr, wlen,
5387 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005388 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005389 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5390 assert(res != -3);
5391 if (wlen) {
5392 *wlen = (size_t)res;
5393 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005394 return NULL;
5395 }
5396 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005397}
5398
Antoine Pitrouab868312009-01-10 15:40:25 +00005399
Victor Stinnere47e6982017-12-21 15:45:16 +01005400/* UTF-8 encoder using the surrogateescape error handler .
5401
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005402 On success, return 0 and write the newly allocated character string (use
5403 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005404
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005405 On encoding failure, return -2 and write the position of the invalid
5406 surrogate character into *error_pos (if error_pos is set) and the decoding
5407 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005408
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005409 On memory allocation failure, return -1. */
5410int
5411_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005412 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005413{
5414 const Py_ssize_t max_char_size = 4;
5415 Py_ssize_t len = wcslen(text);
5416
5417 assert(len >= 0);
5418
Victor Stinner3d4226a2018-08-29 22:21:32 +02005419 int surrogateescape = 0;
5420 int surrogatepass = 0;
5421 switch (errors)
5422 {
5423 case _Py_ERROR_STRICT:
5424 break;
5425 case _Py_ERROR_SURROGATEESCAPE:
5426 surrogateescape = 1;
5427 break;
5428 case _Py_ERROR_SURROGATEPASS:
5429 surrogatepass = 1;
5430 break;
5431 default:
5432 return -3;
5433 }
5434
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005435 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5436 return -1;
5437 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005438 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005439 if (raw_malloc) {
5440 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005441 }
5442 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005443 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005444 }
5445 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005446 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005447 }
5448
5449 char *p = bytes;
5450 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005451 for (i = 0; i < len; ) {
5452 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005453 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005454 i++;
5455#if Py_UNICODE_SIZE == 2
5456 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5457 && i < len
5458 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5459 {
5460 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5461 i++;
5462 }
5463#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005464
5465 if (ch < 0x80) {
5466 /* Encode ASCII */
5467 *p++ = (char) ch;
5468
5469 }
5470 else if (ch < 0x0800) {
5471 /* Encode Latin-1 */
5472 *p++ = (char)(0xc0 | (ch >> 6));
5473 *p++ = (char)(0x80 | (ch & 0x3f));
5474 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005475 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005476 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005477 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005478 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005479 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005480 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005481 if (reason != NULL) {
5482 *reason = "encoding error";
5483 }
5484 if (raw_malloc) {
5485 PyMem_RawFree(bytes);
5486 }
5487 else {
5488 PyMem_Free(bytes);
5489 }
5490 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005491 }
5492 *p++ = (char)(ch & 0xff);
5493 }
5494 else if (ch < 0x10000) {
5495 *p++ = (char)(0xe0 | (ch >> 12));
5496 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5497 *p++ = (char)(0x80 | (ch & 0x3f));
5498 }
5499 else { /* ch >= 0x10000 */
5500 assert(ch <= MAX_UNICODE);
5501 /* Encode UCS4 Unicode ordinals */
5502 *p++ = (char)(0xf0 | (ch >> 18));
5503 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5504 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5505 *p++ = (char)(0x80 | (ch & 0x3f));
5506 }
5507 }
5508 *p++ = '\0';
5509
5510 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005511 char *bytes2;
5512 if (raw_malloc) {
5513 bytes2 = PyMem_RawRealloc(bytes, final_size);
5514 }
5515 else {
5516 bytes2 = PyMem_Realloc(bytes, final_size);
5517 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005518 if (bytes2 == NULL) {
5519 if (error_pos != NULL) {
5520 *error_pos = (size_t)-1;
5521 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005522 if (raw_malloc) {
5523 PyMem_RawFree(bytes);
5524 }
5525 else {
5526 PyMem_Free(bytes);
5527 }
5528 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005529 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005530 *str = bytes2;
5531 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005532}
5533
5534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005535/* Primary internal function which creates utf8 encoded bytes objects.
5536
5537 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005538 and allocate exactly as much space needed at the end. Else allocate the
5539 maximum possible needed (4 result bytes per Unicode character), and return
5540 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005541*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005542static PyObject *
5543unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5544 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005546 if (!PyUnicode_Check(unicode)) {
5547 PyErr_BadArgument();
5548 return NULL;
5549 }
5550
5551 if (PyUnicode_READY(unicode) == -1)
5552 return NULL;
5553
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005554 if (PyUnicode_UTF8(unicode))
5555 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5556 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557
Inada Naoki02a4d572020-02-27 13:48:59 +09005558 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005559 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005560 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5561
5562 _PyBytesWriter writer;
5563 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005564
Benjamin Petersonead6b532011-12-20 17:23:42 -06005565 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005566 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005567 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005568 case PyUnicode_1BYTE_KIND:
5569 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5570 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005571 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5572 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005573 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005574 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5575 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005576 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005577 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5578 break;
Tim Peters602f7402002-04-27 18:03:26 +00005579 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005580
5581 if (end == NULL) {
5582 _PyBytesWriter_Dealloc(&writer);
5583 return NULL;
5584 }
5585 return _PyBytesWriter_Finish(&writer, end);
5586}
5587
5588static int
5589unicode_fill_utf8(PyObject *unicode)
5590{
5591 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5592 assert(!PyUnicode_IS_ASCII(unicode));
5593
5594 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005595 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005596 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5597
5598 _PyBytesWriter writer;
5599 char *end;
5600
5601 switch (kind) {
5602 default:
5603 Py_UNREACHABLE();
5604 case PyUnicode_1BYTE_KIND:
5605 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5606 _Py_ERROR_STRICT, NULL);
5607 break;
5608 case PyUnicode_2BYTE_KIND:
5609 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5610 _Py_ERROR_STRICT, NULL);
5611 break;
5612 case PyUnicode_4BYTE_KIND:
5613 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5614 _Py_ERROR_STRICT, NULL);
5615 break;
5616 }
5617 if (end == NULL) {
5618 _PyBytesWriter_Dealloc(&writer);
5619 return -1;
5620 }
5621
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005622 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005623 PyBytes_AS_STRING(writer.buffer);
5624 Py_ssize_t len = end - start;
5625
5626 char *cache = PyObject_MALLOC(len + 1);
5627 if (cache == NULL) {
5628 _PyBytesWriter_Dealloc(&writer);
5629 PyErr_NoMemory();
5630 return -1;
5631 }
5632 _PyUnicode_UTF8(unicode) = cache;
5633 _PyUnicode_UTF8_LENGTH(unicode) = len;
5634 memcpy(cache, start, len);
5635 cache[len] = '\0';
5636 _PyBytesWriter_Dealloc(&writer);
5637 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638}
5639
Alexander Belopolsky40018472011-02-26 01:02:56 +00005640PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005641_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5642{
5643 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5644}
5645
5646
5647PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005648PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5649 Py_ssize_t size,
5650 const char *errors)
5651{
5652 PyObject *v, *unicode;
5653
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005654 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005655 if (unicode == NULL)
5656 return NULL;
5657 v = _PyUnicode_AsUTF8String(unicode, errors);
5658 Py_DECREF(unicode);
5659 return v;
5660}
5661
5662PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005663PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005665 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666}
5667
Walter Dörwald41980ca2007-08-16 21:55:45 +00005668/* --- UTF-32 Codec ------------------------------------------------------- */
5669
5670PyObject *
5671PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 Py_ssize_t size,
5673 const char *errors,
5674 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005675{
5676 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5677}
5678
5679PyObject *
5680PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 Py_ssize_t size,
5682 const char *errors,
5683 int *byteorder,
5684 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005685{
5686 const char *starts = s;
5687 Py_ssize_t startinpos;
5688 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005690 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005691 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005692 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005693 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005694 PyObject *errorHandler = NULL;
5695 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005696
Andy Lestere6be9b52020-02-11 20:28:35 -06005697 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005698 e = q + size;
5699
5700 if (byteorder)
5701 bo = *byteorder;
5702
5703 /* Check for BOM marks (U+FEFF) in the input and adjust current
5704 byte order setting accordingly. In native mode, the leading BOM
5705 mark is skipped, in all other modes, it is copied to the output
5706 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005707 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005708 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005709 if (bom == 0x0000FEFF) {
5710 bo = -1;
5711 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005713 else if (bom == 0xFFFE0000) {
5714 bo = 1;
5715 q += 4;
5716 }
5717 if (byteorder)
5718 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005719 }
5720
Victor Stinnere64322e2012-10-30 23:12:47 +01005721 if (q == e) {
5722 if (consumed)
5723 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005724 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005725 }
5726
Victor Stinnere64322e2012-10-30 23:12:47 +01005727#ifdef WORDS_BIGENDIAN
5728 le = bo < 0;
5729#else
5730 le = bo <= 0;
5731#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005732 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005733
Victor Stinner8f674cc2013-04-17 23:02:17 +02005734 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005735 writer.min_length = (e - q + 3) / 4;
5736 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005737 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005738
Victor Stinnere64322e2012-10-30 23:12:47 +01005739 while (1) {
5740 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005741 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005742
Victor Stinnere64322e2012-10-30 23:12:47 +01005743 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005744 enum PyUnicode_Kind kind = writer.kind;
5745 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005746 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005747 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005748 if (le) {
5749 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005750 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005751 if (ch > maxch)
5752 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005753 if (kind != PyUnicode_1BYTE_KIND &&
5754 Py_UNICODE_IS_SURROGATE(ch))
5755 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005756 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005757 q += 4;
5758 } while (q <= last);
5759 }
5760 else {
5761 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005762 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005763 if (ch > maxch)
5764 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005765 if (kind != PyUnicode_1BYTE_KIND &&
5766 Py_UNICODE_IS_SURROGATE(ch))
5767 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005768 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005769 q += 4;
5770 } while (q <= last);
5771 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005772 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005773 }
5774
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005775 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005776 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005777 startinpos = ((const char *)q) - starts;
5778 endinpos = startinpos + 4;
5779 }
5780 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005781 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005783 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005785 startinpos = ((const char *)q) - starts;
5786 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005788 else {
5789 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005790 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005791 goto onError;
5792 q += 4;
5793 continue;
5794 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005795 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005796 startinpos = ((const char *)q) - starts;
5797 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005799
5800 /* The remaining input chars are ignored if the callback
5801 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005802 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005804 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005806 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005808 }
5809
Walter Dörwald41980ca2007-08-16 21:55:45 +00005810 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005812
Walter Dörwald41980ca2007-08-16 21:55:45 +00005813 Py_XDECREF(errorHandler);
5814 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005815 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005816
Benjamin Peterson29060642009-01-31 22:14:21 +00005817 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005818 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005819 Py_XDECREF(errorHandler);
5820 Py_XDECREF(exc);
5821 return NULL;
5822}
5823
5824PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005825_PyUnicode_EncodeUTF32(PyObject *str,
5826 const char *errors,
5827 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005828{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005829 enum PyUnicode_Kind kind;
5830 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005831 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005832 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005833 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005834#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005835 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005836#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005837 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005838#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005839 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005840 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005841 PyObject *errorHandler = NULL;
5842 PyObject *exc = NULL;
5843 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005844
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005845 if (!PyUnicode_Check(str)) {
5846 PyErr_BadArgument();
5847 return NULL;
5848 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005849 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005850 return NULL;
5851 kind = PyUnicode_KIND(str);
5852 data = PyUnicode_DATA(str);
5853 len = PyUnicode_GET_LENGTH(str);
5854
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005855 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005856 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005857 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005858 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005859 if (v == NULL)
5860 return NULL;
5861
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005862 /* output buffer is 4-bytes aligned */
5863 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005864 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005865 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005866 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005867 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005868 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005869
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005870 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005871 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005872 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005873 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005874 else
5875 encoding = "utf-32";
5876
5877 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005878 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5879 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005880 }
5881
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005882 pos = 0;
5883 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005884 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005885
5886 if (kind == PyUnicode_2BYTE_KIND) {
5887 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5888 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005889 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005890 else {
5891 assert(kind == PyUnicode_4BYTE_KIND);
5892 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5893 &out, native_ordering);
5894 }
5895 if (pos == len)
5896 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005897
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005898 rep = unicode_encode_call_errorhandler(
5899 errors, &errorHandler,
5900 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005901 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005902 if (!rep)
5903 goto error;
5904
5905 if (PyBytes_Check(rep)) {
5906 repsize = PyBytes_GET_SIZE(rep);
5907 if (repsize & 3) {
5908 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005909 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005910 "surrogates not allowed");
5911 goto error;
5912 }
5913 moreunits = repsize / 4;
5914 }
5915 else {
5916 assert(PyUnicode_Check(rep));
5917 if (PyUnicode_READY(rep) < 0)
5918 goto error;
5919 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5920 if (!PyUnicode_IS_ASCII(rep)) {
5921 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005922 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005923 "surrogates not allowed");
5924 goto error;
5925 }
5926 }
5927
5928 /* four bytes are reserved for each surrogate */
5929 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005930 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005931 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005932 /* integer overflow */
5933 PyErr_NoMemory();
5934 goto error;
5935 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005936 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005937 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005938 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005939 }
5940
5941 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005942 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005943 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005944 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005945 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005946 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5947 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005948 }
5949
5950 Py_CLEAR(rep);
5951 }
5952
5953 /* Cut back to size actually needed. This is necessary for, for example,
5954 encoding of a string containing isolated surrogates and the 'ignore'
5955 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005956 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005957 if (nsize != PyBytes_GET_SIZE(v))
5958 _PyBytes_Resize(&v, nsize);
5959 Py_XDECREF(errorHandler);
5960 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005961 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005962 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005963 error:
5964 Py_XDECREF(rep);
5965 Py_XDECREF(errorHandler);
5966 Py_XDECREF(exc);
5967 Py_XDECREF(v);
5968 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005969}
5970
Alexander Belopolsky40018472011-02-26 01:02:56 +00005971PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005972PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5973 Py_ssize_t size,
5974 const char *errors,
5975 int byteorder)
5976{
5977 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005978 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005979 if (tmp == NULL)
5980 return NULL;
5981 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5982 Py_DECREF(tmp);
5983 return result;
5984}
5985
5986PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005987PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005988{
Victor Stinnerb960b342011-11-20 19:12:52 +01005989 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005990}
5991
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992/* --- UTF-16 Codec ------------------------------------------------------- */
5993
Tim Peters772747b2001-08-09 22:21:55 +00005994PyObject *
5995PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 Py_ssize_t size,
5997 const char *errors,
5998 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999{
Walter Dörwald69652032004-09-07 20:24:22 +00006000 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6001}
6002
6003PyObject *
6004PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 Py_ssize_t size,
6006 const char *errors,
6007 int *byteorder,
6008 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00006009{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006011 Py_ssize_t startinpos;
6012 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006013 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006014 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00006015 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006016 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00006017 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018 PyObject *errorHandler = NULL;
6019 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006020 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
Andy Lestere6be9b52020-02-11 20:28:35 -06006022 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006023 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024
6025 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00006026 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006028 /* Check for BOM marks (U+FEFF) in the input and adjust current
6029 byte order setting accordingly. In native mode, the leading BOM
6030 mark is skipped, in all other modes, it is copied to the output
6031 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006032 if (bo == 0 && size >= 2) {
6033 const Py_UCS4 bom = (q[1] << 8) | q[0];
6034 if (bom == 0xFEFF) {
6035 q += 2;
6036 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006038 else if (bom == 0xFFFE) {
6039 q += 2;
6040 bo = 1;
6041 }
6042 if (byteorder)
6043 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045
Antoine Pitrou63065d72012-05-15 23:48:04 +02006046 if (q == e) {
6047 if (consumed)
6048 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006049 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006050 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006051
Christian Heimes743e0cd2012-10-17 23:52:17 +02006052#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006053 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006054 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006055#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006056 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006057 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006058#endif
Tim Peters772747b2001-08-09 22:21:55 +00006059
Antoine Pitrou63065d72012-05-15 23:48:04 +02006060 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006061 character count normally. Error handler will take care of
6062 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006063 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006064 writer.min_length = (e - q + 1) / 2;
6065 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006066 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006067
Antoine Pitrou63065d72012-05-15 23:48:04 +02006068 while (1) {
6069 Py_UCS4 ch = 0;
6070 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006071 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006072 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006073 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006074 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006075 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006076 native_ordering);
6077 else
6078 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006079 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006080 native_ordering);
6081 } else if (kind == PyUnicode_2BYTE_KIND) {
6082 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006083 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006084 native_ordering);
6085 } else {
6086 assert(kind == PyUnicode_4BYTE_KIND);
6087 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006088 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006089 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006090 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006091 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092
Antoine Pitrou63065d72012-05-15 23:48:04 +02006093 switch (ch)
6094 {
6095 case 0:
6096 /* remaining byte at the end? (size should be even) */
6097 if (q == e || consumed)
6098 goto End;
6099 errmsg = "truncated data";
6100 startinpos = ((const char *)q) - starts;
6101 endinpos = ((const char *)e) - starts;
6102 break;
6103 /* The remaining input chars are ignored if the callback
6104 chooses to skip the input */
6105 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006106 q -= 2;
6107 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006108 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006109 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006110 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006111 endinpos = ((const char *)e) - starts;
6112 break;
6113 case 2:
6114 errmsg = "illegal encoding";
6115 startinpos = ((const char *)q) - 2 - starts;
6116 endinpos = startinpos + 2;
6117 break;
6118 case 3:
6119 errmsg = "illegal UTF-16 surrogate";
6120 startinpos = ((const char *)q) - 4 - starts;
6121 endinpos = startinpos + 2;
6122 break;
6123 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006124 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006125 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 continue;
6127 }
6128
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006129 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006130 errors,
6131 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006132 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006133 &starts,
6134 (const char **)&e,
6135 &startinpos,
6136 &endinpos,
6137 &exc,
6138 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006139 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 }
6142
Antoine Pitrou63065d72012-05-15 23:48:04 +02006143End:
Walter Dörwald69652032004-09-07 20:24:22 +00006144 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006146
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 Py_XDECREF(errorHandler);
6148 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006149 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006152 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006153 Py_XDECREF(errorHandler);
6154 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 return NULL;
6156}
6157
Tim Peters772747b2001-08-09 22:21:55 +00006158PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159_PyUnicode_EncodeUTF16(PyObject *str,
6160 const char *errors,
6161 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006163 enum PyUnicode_Kind kind;
6164 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006166 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006167 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006168 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006169#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006170 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006171#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006172 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006173#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006174 const char *encoding;
6175 Py_ssize_t nsize, pos;
6176 PyObject *errorHandler = NULL;
6177 PyObject *exc = NULL;
6178 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 if (!PyUnicode_Check(str)) {
6181 PyErr_BadArgument();
6182 return NULL;
6183 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006184 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 return NULL;
6186 kind = PyUnicode_KIND(str);
6187 data = PyUnicode_DATA(str);
6188 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006189
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006190 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006191 if (kind == PyUnicode_4BYTE_KIND) {
6192 const Py_UCS4 *in = (const Py_UCS4 *)data;
6193 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006194 while (in < end) {
6195 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006196 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006197 }
6198 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006199 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006200 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006202 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006203 nsize = len + pairs + (byteorder == 0);
6204 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006205 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006207 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006209 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006210 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006211 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006212 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006213 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006214 }
6215 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006216 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006217 }
Tim Peters772747b2001-08-09 22:21:55 +00006218
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006219 if (kind == PyUnicode_1BYTE_KIND) {
6220 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6221 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006222 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006223
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006224 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006225 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006226 }
6227 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006228 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006229 }
6230 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006231 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006232 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006233
6234 pos = 0;
6235 while (pos < len) {
6236 Py_ssize_t repsize, moreunits;
6237
6238 if (kind == PyUnicode_2BYTE_KIND) {
6239 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6240 &out, native_ordering);
6241 }
6242 else {
6243 assert(kind == PyUnicode_4BYTE_KIND);
6244 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6245 &out, native_ordering);
6246 }
6247 if (pos == len)
6248 break;
6249
6250 rep = unicode_encode_call_errorhandler(
6251 errors, &errorHandler,
6252 encoding, "surrogates not allowed",
6253 str, &exc, pos, pos + 1, &pos);
6254 if (!rep)
6255 goto error;
6256
6257 if (PyBytes_Check(rep)) {
6258 repsize = PyBytes_GET_SIZE(rep);
6259 if (repsize & 1) {
6260 raise_encode_exception(&exc, encoding,
6261 str, pos - 1, pos,
6262 "surrogates not allowed");
6263 goto error;
6264 }
6265 moreunits = repsize / 2;
6266 }
6267 else {
6268 assert(PyUnicode_Check(rep));
6269 if (PyUnicode_READY(rep) < 0)
6270 goto error;
6271 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6272 if (!PyUnicode_IS_ASCII(rep)) {
6273 raise_encode_exception(&exc, encoding,
6274 str, pos - 1, pos,
6275 "surrogates not allowed");
6276 goto error;
6277 }
6278 }
6279
6280 /* two bytes are reserved for each surrogate */
6281 if (moreunits > 1) {
6282 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006283 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006284 /* integer overflow */
6285 PyErr_NoMemory();
6286 goto error;
6287 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006288 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006289 goto error;
6290 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6291 }
6292
6293 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006294 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006295 out += moreunits;
6296 } else /* rep is unicode */ {
6297 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6298 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6299 &out, native_ordering);
6300 }
6301
6302 Py_CLEAR(rep);
6303 }
6304
6305 /* Cut back to size actually needed. This is necessary for, for example,
6306 encoding of a string containing isolated surrogates and the 'ignore' handler
6307 is used. */
6308 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6309 if (nsize != PyBytes_GET_SIZE(v))
6310 _PyBytes_Resize(&v, nsize);
6311 Py_XDECREF(errorHandler);
6312 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006313 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006314 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006315 error:
6316 Py_XDECREF(rep);
6317 Py_XDECREF(errorHandler);
6318 Py_XDECREF(exc);
6319 Py_XDECREF(v);
6320 return NULL;
6321#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322}
6323
Alexander Belopolsky40018472011-02-26 01:02:56 +00006324PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006325PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6326 Py_ssize_t size,
6327 const char *errors,
6328 int byteorder)
6329{
6330 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006331 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006332 if (tmp == NULL)
6333 return NULL;
6334 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6335 Py_DECREF(tmp);
6336 return result;
6337}
6338
6339PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006340PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006342 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343}
6344
6345/* --- Unicode Escape Codec ----------------------------------------------- */
6346
Fredrik Lundh06d12682001-01-24 07:59:11 +00006347static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006348
Alexander Belopolsky40018472011-02-26 01:02:56 +00006349PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006350_PyUnicode_DecodeUnicodeEscape(const char *s,
6351 Py_ssize_t size,
6352 const char *errors,
6353 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006355 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006356 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006358 PyObject *errorHandler = NULL;
6359 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006360
Eric V. Smith42454af2016-10-31 09:22:08 -04006361 // so we can remember if we've seen an invalid escape char or not
6362 *first_invalid_escape = NULL;
6363
Victor Stinner62ec3312016-09-06 17:04:34 -07006364 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006365 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006366 }
6367 /* Escaped strings will always be longer than the resulting
6368 Unicode string, so we start with size here and then reduce the
6369 length after conversion to the true value.
6370 (but if the error callback returns a long replacement string
6371 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006372 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006373 writer.min_length = size;
6374 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6375 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006376 }
6377
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 end = s + size;
6379 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006380 unsigned char c = (unsigned char) *s++;
6381 Py_UCS4 ch;
6382 int count;
6383 Py_ssize_t startinpos;
6384 Py_ssize_t endinpos;
6385 const char *message;
6386
6387#define WRITE_ASCII_CHAR(ch) \
6388 do { \
6389 assert(ch <= 127); \
6390 assert(writer.pos < writer.size); \
6391 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6392 } while(0)
6393
6394#define WRITE_CHAR(ch) \
6395 do { \
6396 if (ch <= writer.maxchar) { \
6397 assert(writer.pos < writer.size); \
6398 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6399 } \
6400 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6401 goto onError; \
6402 } \
6403 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404
6405 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006406 if (c != '\\') {
6407 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 continue;
6409 }
6410
Victor Stinner62ec3312016-09-06 17:04:34 -07006411 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006413 if (s >= end) {
6414 message = "\\ at end of string";
6415 goto error;
6416 }
6417 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006418
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006420 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 case '\n': continue;
6424 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6425 case '\'': WRITE_ASCII_CHAR('\''); continue;
6426 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6427 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006428 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6430 case 't': WRITE_ASCII_CHAR('\t'); continue;
6431 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6432 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006433 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006434 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006435 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006436 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 case '0': case '1': case '2': case '3':
6440 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006441 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006442 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006443 ch = (ch<<3) + *s++ - '0';
6444 if (s < end && '0' <= *s && *s <= '7') {
6445 ch = (ch<<3) + *s++ - '0';
6446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006448 WRITE_CHAR(ch);
6449 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 /* hex escapes */
6452 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006454 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006455 message = "truncated \\xXX escape";
6456 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006460 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006461 message = "truncated \\uXXXX escape";
6462 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006465 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006466 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006467 message = "truncated \\UXXXXXXXX escape";
6468 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006469 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006470 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006471 ch <<= 4;
6472 if (c >= '0' && c <= '9') {
6473 ch += c - '0';
6474 }
6475 else if (c >= 'a' && c <= 'f') {
6476 ch += c - ('a' - 10);
6477 }
6478 else if (c >= 'A' && c <= 'F') {
6479 ch += c - ('A' - 10);
6480 }
6481 else {
6482 break;
6483 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006484 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006485 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006486 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006487 }
6488
6489 /* when we get here, ch is a 32-bit unicode character */
6490 if (ch > MAX_UNICODE) {
6491 message = "illegal Unicode character";
6492 goto error;
6493 }
6494
6495 WRITE_CHAR(ch);
6496 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006497
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006499 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006500 if (ucnhash_CAPI == NULL) {
6501 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006502 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6503 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006504 if (ucnhash_CAPI == NULL) {
6505 PyErr_SetString(
6506 PyExc_UnicodeError,
6507 "\\N escapes not supported (can't load unicodedata module)"
6508 );
6509 goto onError;
6510 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006511 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006512
6513 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006514 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006515 const char *start = ++s;
6516 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006517 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006518 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006519 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006520 namelen = s - start;
6521 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006522 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006523 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006524 ch = 0xffffffff; /* in case 'getcode' messes up */
6525 if (namelen <= INT_MAX &&
6526 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6527 &ch, 0)) {
6528 assert(ch <= MAX_UNICODE);
6529 WRITE_CHAR(ch);
6530 continue;
6531 }
6532 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006533 }
6534 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006535 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006536
6537 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006538 if (*first_invalid_escape == NULL) {
6539 *first_invalid_escape = s-1; /* Back up one char, since we've
6540 already incremented s. */
6541 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006542 WRITE_ASCII_CHAR('\\');
6543 WRITE_CHAR(c);
6544 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006546
6547 error:
6548 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006549 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006550 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006551 errors, &errorHandler,
6552 "unicodeescape", message,
6553 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006554 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006555 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006556 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006557 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006558
6559#undef WRITE_ASCII_CHAR
6560#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006562
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006563 Py_XDECREF(errorHandler);
6564 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006565 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006566
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006568 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006569 Py_XDECREF(errorHandler);
6570 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 return NULL;
6572}
6573
Eric V. Smith42454af2016-10-31 09:22:08 -04006574PyObject *
6575PyUnicode_DecodeUnicodeEscape(const char *s,
6576 Py_ssize_t size,
6577 const char *errors)
6578{
6579 const char *first_invalid_escape;
6580 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6581 &first_invalid_escape);
6582 if (result == NULL)
6583 return NULL;
6584 if (first_invalid_escape != NULL) {
6585 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6586 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006587 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006588 Py_DECREF(result);
6589 return NULL;
6590 }
6591 }
6592 return result;
6593}
6594
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006595/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
Alexander Belopolsky40018472011-02-26 01:02:56 +00006597PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006598PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006600 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006601 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006603 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006604 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006605 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606
Ezio Melottie7f90372012-10-05 03:33:31 +03006607 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006608 escape.
6609
Ezio Melottie7f90372012-10-05 03:33:31 +03006610 For UCS1 strings it's '\xxx', 4 bytes per source character.
6611 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6612 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006613 */
6614
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006615 if (!PyUnicode_Check(unicode)) {
6616 PyErr_BadArgument();
6617 return NULL;
6618 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006619 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006620 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006621 }
Victor Stinner358af132015-10-12 22:36:57 +02006622
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006623 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006624 if (len == 0) {
6625 return PyBytes_FromStringAndSize(NULL, 0);
6626 }
6627
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006628 kind = PyUnicode_KIND(unicode);
6629 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006630 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6631 bytes, and 1 byte characters 4. */
6632 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006633 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006634 return PyErr_NoMemory();
6635 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006636 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006637 if (repr == NULL) {
6638 return NULL;
6639 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006640
Victor Stinner62ec3312016-09-06 17:04:34 -07006641 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006642 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006643 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006644
Victor Stinner62ec3312016-09-06 17:04:34 -07006645 /* U+0000-U+00ff range */
6646 if (ch < 0x100) {
6647 if (ch >= ' ' && ch < 127) {
6648 if (ch != '\\') {
6649 /* Copy printable US ASCII as-is */
6650 *p++ = (char) ch;
6651 }
6652 /* Escape backslashes */
6653 else {
6654 *p++ = '\\';
6655 *p++ = '\\';
6656 }
6657 }
Victor Stinner358af132015-10-12 22:36:57 +02006658
Victor Stinner62ec3312016-09-06 17:04:34 -07006659 /* Map special whitespace to '\t', \n', '\r' */
6660 else if (ch == '\t') {
6661 *p++ = '\\';
6662 *p++ = 't';
6663 }
6664 else if (ch == '\n') {
6665 *p++ = '\\';
6666 *p++ = 'n';
6667 }
6668 else if (ch == '\r') {
6669 *p++ = '\\';
6670 *p++ = 'r';
6671 }
6672
6673 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6674 else {
6675 *p++ = '\\';
6676 *p++ = 'x';
6677 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6678 *p++ = Py_hexdigits[ch & 0x000F];
6679 }
Tim Petersced69f82003-09-16 20:30:58 +00006680 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006681 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006682 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 *p++ = '\\';
6684 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006685 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6686 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6687 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6688 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006690 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6691 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006692
Victor Stinner62ec3312016-09-06 17:04:34 -07006693 /* Make sure that the first two digits are zero */
6694 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006695 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006696 *p++ = 'U';
6697 *p++ = '0';
6698 *p++ = '0';
6699 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6700 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6701 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6702 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6703 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6704 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707
Victor Stinner62ec3312016-09-06 17:04:34 -07006708 assert(p - PyBytes_AS_STRING(repr) > 0);
6709 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6710 return NULL;
6711 }
6712 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713}
6714
Alexander Belopolsky40018472011-02-26 01:02:56 +00006715PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006716PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6717 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006719 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006720 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006721 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006723 }
6724
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006725 result = PyUnicode_AsUnicodeEscapeString(tmp);
6726 Py_DECREF(tmp);
6727 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728}
6729
6730/* --- Raw Unicode Escape Codec ------------------------------------------- */
6731
Alexander Belopolsky40018472011-02-26 01:02:56 +00006732PyObject *
6733PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006734 Py_ssize_t size,
6735 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006737 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006738 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740 PyObject *errorHandler = NULL;
6741 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006742
Victor Stinner62ec3312016-09-06 17:04:34 -07006743 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006744 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006745 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006746
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 /* Escaped strings will always be longer than the resulting
6748 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006749 length after conversion to the true value. (But decoding error
6750 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006751 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006752 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006753 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6754 goto onError;
6755 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006756
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 end = s + size;
6758 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006759 unsigned char c = (unsigned char) *s++;
6760 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006761 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006762 Py_ssize_t startinpos;
6763 Py_ssize_t endinpos;
6764 const char *message;
6765
6766#define WRITE_CHAR(ch) \
6767 do { \
6768 if (ch <= writer.maxchar) { \
6769 assert(writer.pos < writer.size); \
6770 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6771 } \
6772 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6773 goto onError; \
6774 } \
6775 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006778 if (c != '\\' || s >= end) {
6779 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006781 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006782
Victor Stinner62ec3312016-09-06 17:04:34 -07006783 c = (unsigned char) *s++;
6784 if (c == 'u') {
6785 count = 4;
6786 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006788 else if (c == 'U') {
6789 count = 8;
6790 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006791 }
6792 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006793 assert(writer.pos < writer.size);
6794 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6795 WRITE_CHAR(c);
6796 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006797 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006798 startinpos = s - starts - 2;
6799
6800 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6801 for (ch = 0; count && s < end; ++s, --count) {
6802 c = (unsigned char)*s;
6803 ch <<= 4;
6804 if (c >= '0' && c <= '9') {
6805 ch += c - '0';
6806 }
6807 else if (c >= 'a' && c <= 'f') {
6808 ch += c - ('a' - 10);
6809 }
6810 else if (c >= 'A' && c <= 'F') {
6811 ch += c - ('A' - 10);
6812 }
6813 else {
6814 break;
6815 }
6816 }
6817 if (!count) {
6818 if (ch <= MAX_UNICODE) {
6819 WRITE_CHAR(ch);
6820 continue;
6821 }
6822 message = "\\Uxxxxxxxx out of range";
6823 }
6824
6825 endinpos = s-starts;
6826 writer.min_length = end - s + writer.pos;
6827 if (unicode_decode_call_errorhandler_writer(
6828 errors, &errorHandler,
6829 "rawunicodeescape", message,
6830 &starts, &end, &startinpos, &endinpos, &exc, &s,
6831 &writer)) {
6832 goto onError;
6833 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006834 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006835
6836#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006838 Py_XDECREF(errorHandler);
6839 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006840 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006841
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006843 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844 Py_XDECREF(errorHandler);
6845 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006847
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848}
6849
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006850
Alexander Belopolsky40018472011-02-26 01:02:56 +00006851PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006852PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853{
Victor Stinner62ec3312016-09-06 17:04:34 -07006854 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006856 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006857 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006858 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006859 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006861 if (!PyUnicode_Check(unicode)) {
6862 PyErr_BadArgument();
6863 return NULL;
6864 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006865 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006866 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006867 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006868 kind = PyUnicode_KIND(unicode);
6869 data = PyUnicode_DATA(unicode);
6870 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006871 if (kind == PyUnicode_1BYTE_KIND) {
6872 return PyBytes_FromStringAndSize(data, len);
6873 }
Victor Stinner0e368262011-11-10 20:12:49 +01006874
Victor Stinner62ec3312016-09-06 17:04:34 -07006875 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6876 bytes, and 1 byte characters 4. */
6877 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006878
Victor Stinner62ec3312016-09-06 17:04:34 -07006879 if (len > PY_SSIZE_T_MAX / expandsize) {
6880 return PyErr_NoMemory();
6881 }
6882 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6883 if (repr == NULL) {
6884 return NULL;
6885 }
6886 if (len == 0) {
6887 return repr;
6888 }
6889
6890 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006891 for (pos = 0; pos < len; pos++) {
6892 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006893
Victor Stinner62ec3312016-09-06 17:04:34 -07006894 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6895 if (ch < 0x100) {
6896 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006897 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006898 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006899 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900 *p++ = '\\';
6901 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006902 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6903 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6904 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6905 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006907 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6908 else {
6909 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6910 *p++ = '\\';
6911 *p++ = 'U';
6912 *p++ = '0';
6913 *p++ = '0';
6914 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6915 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6916 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6917 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6918 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6919 *p++ = Py_hexdigits[ch & 15];
6920 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006922
Victor Stinner62ec3312016-09-06 17:04:34 -07006923 assert(p > PyBytes_AS_STRING(repr));
6924 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6925 return NULL;
6926 }
6927 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928}
6929
Alexander Belopolsky40018472011-02-26 01:02:56 +00006930PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006931PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6932 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006934 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006935 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006936 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006937 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006938 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6939 Py_DECREF(tmp);
6940 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941}
6942
6943/* --- Latin-1 Codec ------------------------------------------------------ */
6944
Alexander Belopolsky40018472011-02-26 01:02:56 +00006945PyObject *
6946PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006947 Py_ssize_t size,
6948 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006951 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952}
6953
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006954/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006955static void
6956make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006957 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006958 PyObject *unicode,
6959 Py_ssize_t startpos, Py_ssize_t endpos,
6960 const char *reason)
6961{
6962 if (*exceptionObject == NULL) {
6963 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006964 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006965 encoding, unicode, startpos, endpos, reason);
6966 }
6967 else {
6968 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6969 goto onError;
6970 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6971 goto onError;
6972 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6973 goto onError;
6974 return;
6975 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006976 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006977 }
6978}
6979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006980/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006981static void
6982raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006983 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006984 PyObject *unicode,
6985 Py_ssize_t startpos, Py_ssize_t endpos,
6986 const char *reason)
6987{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006988 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006989 encoding, unicode, startpos, endpos, reason);
6990 if (*exceptionObject != NULL)
6991 PyCodec_StrictErrors(*exceptionObject);
6992}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006993
6994/* error handling callback helper:
6995 build arguments, call the callback and check the arguments,
6996 put the result into newpos and return the replacement string, which
6997 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006998static PyObject *
6999unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007000 PyObject **errorHandler,
7001 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007002 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007003 Py_ssize_t startpos, Py_ssize_t endpos,
7004 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007005{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02007006 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007007 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007008 PyObject *restuple;
7009 PyObject *resunicode;
7010
7011 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007015 }
7016
Benjamin Petersonbac79492012-01-14 13:34:47 -05007017 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007018 return NULL;
7019 len = PyUnicode_GET_LENGTH(unicode);
7020
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007021 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007022 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007023 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007025
Petr Viktorinffd97532020-02-11 17:46:57 +01007026 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007027 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007029 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007030 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007031 Py_DECREF(restuple);
7032 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007033 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007034 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 &resunicode, newpos)) {
7036 Py_DECREF(restuple);
7037 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007038 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007039 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7040 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7041 Py_DECREF(restuple);
7042 return NULL;
7043 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007044 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007045 *newpos = len + *newpos;
7046 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007047 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 Py_DECREF(restuple);
7049 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007050 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007051 Py_INCREF(resunicode);
7052 Py_DECREF(restuple);
7053 return resunicode;
7054}
7055
Alexander Belopolsky40018472011-02-26 01:02:56 +00007056static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007057unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007058 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007059 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007060{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007061 /* input state */
7062 Py_ssize_t pos=0, size;
7063 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007064 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007065 /* pointer into the output */
7066 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007067 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7068 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007069 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007070 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007071 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007072 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007073 /* output object */
7074 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007075
Benjamin Petersonbac79492012-01-14 13:34:47 -05007076 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007077 return NULL;
7078 size = PyUnicode_GET_LENGTH(unicode);
7079 kind = PyUnicode_KIND(unicode);
7080 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007081 /* allocate enough for a simple encoding without
7082 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007083 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007084 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007085
7086 _PyBytesWriter_Init(&writer);
7087 str = _PyBytesWriter_Alloc(&writer, size);
7088 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007089 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007090
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007091 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007092 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007093
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007095 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007096 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007097 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007098 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007099 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007101 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007102 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007103 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007104 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007106
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007107 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007108 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007109
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007110 /* Only overallocate the buffer if it's not the last write */
7111 writer.overallocate = (collend < size);
7112
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007114 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007115 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007116
7117 switch (error_handler) {
7118 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007119 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007121
7122 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007123 memset(str, '?', collend - collstart);
7124 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007125 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007126 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007127 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 break;
Victor Stinner50149202015-09-22 00:26:54 +02007129
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007130 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007131 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007132 writer.min_size -= (collend - collstart);
7133 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007134 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007135 if (str == NULL)
7136 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007137 pos = collend;
7138 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007139
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007140 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007141 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007142 writer.min_size -= (collend - collstart);
7143 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007144 unicode, collstart, collend);
7145 if (str == NULL)
7146 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007147 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007148 break;
Victor Stinner50149202015-09-22 00:26:54 +02007149
Victor Stinnerc3713e92015-09-29 12:32:13 +02007150 case _Py_ERROR_SURROGATEESCAPE:
7151 for (i = collstart; i < collend; ++i) {
7152 ch = PyUnicode_READ(kind, data, i);
7153 if (ch < 0xdc80 || 0xdcff < ch) {
7154 /* Not a UTF-8b surrogate */
7155 break;
7156 }
7157 *str++ = (char)(ch - 0xdc00);
7158 ++pos;
7159 }
7160 if (i >= collend)
7161 break;
7162 collstart = pos;
7163 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007164 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007165
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007167 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7168 encoding, reason, unicode, &exc,
7169 collstart, collend, &newpos);
7170 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007171 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007172
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007173 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007174 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007175
Victor Stinner6bd525b2015-10-09 13:10:05 +02007176 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007177 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007178 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007179 PyBytes_AS_STRING(rep),
7180 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007181 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007182 else {
7183 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007184
Victor Stinner6bd525b2015-10-09 13:10:05 +02007185 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007187
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007188 if (limit == 256 ?
7189 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7190 !PyUnicode_IS_ASCII(rep))
7191 {
7192 /* Not all characters are smaller than limit */
7193 raise_encode_exception(&exc, encoding, unicode,
7194 collstart, collend, reason);
7195 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007197 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7198 str = _PyBytesWriter_WriteBytes(&writer, str,
7199 PyUnicode_DATA(rep),
7200 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007202 if (str == NULL)
7203 goto onError;
7204
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007205 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007206 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007207 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007208
7209 /* If overallocation was disabled, ensure that it was the last
7210 write. Otherwise, we missed an optimization */
7211 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007212 }
7213 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007214
Victor Stinner50149202015-09-22 00:26:54 +02007215 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007216 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007217 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007218
7219 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007220 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007221 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007222 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007223 Py_XDECREF(exc);
7224 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007225}
7226
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007227/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007228PyObject *
7229PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007230 Py_ssize_t size,
7231 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007233 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007234 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007235 if (unicode == NULL)
7236 return NULL;
7237 result = unicode_encode_ucs1(unicode, errors, 256);
7238 Py_DECREF(unicode);
7239 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240}
7241
Alexander Belopolsky40018472011-02-26 01:02:56 +00007242PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007243_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244{
7245 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 PyErr_BadArgument();
7247 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007249 if (PyUnicode_READY(unicode) == -1)
7250 return NULL;
7251 /* Fast path: if it is a one-byte string, construct
7252 bytes object directly. */
7253 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7254 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7255 PyUnicode_GET_LENGTH(unicode));
7256 /* Non-Latin-1 characters present. Defer to above function to
7257 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007258 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007259}
7260
7261PyObject*
7262PyUnicode_AsLatin1String(PyObject *unicode)
7263{
7264 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265}
7266
7267/* --- 7-bit ASCII Codec -------------------------------------------------- */
7268
Alexander Belopolsky40018472011-02-26 01:02:56 +00007269PyObject *
7270PyUnicode_DecodeASCII(const char *s,
7271 Py_ssize_t size,
7272 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007274 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007275 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007276 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007277 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007278 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007279
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007281 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007282
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007284 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007285 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007286 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007287
Inada Naoki770847a2019-06-24 12:30:24 +09007288 // Shortcut for simple case
7289 PyObject *u = PyUnicode_New(size, 127);
7290 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007291 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007292 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007293 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007294 if (outpos == size) {
7295 return u;
7296 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007297
Inada Naoki770847a2019-06-24 12:30:24 +09007298 _PyUnicodeWriter writer;
7299 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007300 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007301
Inada Naoki770847a2019-06-24 12:30:24 +09007302 s += outpos;
7303 int kind = writer.kind;
7304 void *data = writer.data;
7305 Py_ssize_t startinpos, endinpos;
7306
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007307 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007308 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007310 PyUnicode_WRITE(kind, data, writer.pos, c);
7311 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007312 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007313 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007315
7316 /* byte outsize range 0x00..0x7f: call the error handler */
7317
7318 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007319 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007320
7321 switch (error_handler)
7322 {
7323 case _Py_ERROR_REPLACE:
7324 case _Py_ERROR_SURROGATEESCAPE:
7325 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007326 but we may switch to UCS2 at the first write */
7327 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7328 goto onError;
7329 kind = writer.kind;
7330 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007331
7332 if (error_handler == _Py_ERROR_REPLACE)
7333 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7334 else
7335 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7336 writer.pos++;
7337 ++s;
7338 break;
7339
7340 case _Py_ERROR_IGNORE:
7341 ++s;
7342 break;
7343
7344 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 startinpos = s-starts;
7346 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007347 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007348 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 "ascii", "ordinal not in range(128)",
7350 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007351 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007353 kind = writer.kind;
7354 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007357 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007358 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007359 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007360
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007362 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007363 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007364 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 return NULL;
7366}
7367
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007368/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007369PyObject *
7370PyUnicode_EncodeASCII(const Py_UNICODE *p,
7371 Py_ssize_t size,
7372 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007373{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007374 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007375 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007376 if (unicode == NULL)
7377 return NULL;
7378 result = unicode_encode_ucs1(unicode, errors, 128);
7379 Py_DECREF(unicode);
7380 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381}
7382
Alexander Belopolsky40018472011-02-26 01:02:56 +00007383PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007384_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385{
7386 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 PyErr_BadArgument();
7388 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007390 if (PyUnicode_READY(unicode) == -1)
7391 return NULL;
7392 /* Fast path: if it is an ASCII-only string, construct bytes object
7393 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007394 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007395 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7396 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007397 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007398}
7399
7400PyObject *
7401PyUnicode_AsASCIIString(PyObject *unicode)
7402{
7403 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404}
7405
Steve Dowercc16be82016-09-08 10:35:16 -07007406#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007407
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007408/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007409
/* NEED_RETRY is defined when int is narrower than size_t. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#  define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
#define DECODING_CHUNK_SIZE (INT_MAX / 4)

/* Fallback value for when no earlier definition of this flag is visible. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7423
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007424static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007425code_page_name(UINT code_page, PyObject **obj)
7426{
7427 *obj = NULL;
7428 if (code_page == CP_ACP)
7429 return "mbcs";
7430 if (code_page == CP_UTF7)
7431 return "CP_UTF7";
7432 if (code_page == CP_UTF8)
7433 return "CP_UTF8";
7434
7435 *obj = PyBytes_FromFormat("cp%u", code_page);
7436 if (*obj == NULL)
7437 return NULL;
7438 return PyBytes_AS_STRING(*obj);
7439}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007440
Victor Stinner3a50e702011-10-18 21:21:00 +02007441static DWORD
7442decode_code_page_flags(UINT code_page)
7443{
7444 if (code_page == CP_UTF7) {
7445 /* The CP_UTF7 decoder only supports flags=0 */
7446 return 0;
7447 }
7448 else
7449 return MB_ERR_INVALID_CHARS;
7450}
7451
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007452/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 * Decode a byte string from a Windows code page into unicode object in strict
7454 * mode.
7455 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007456 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7457 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007458 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007459static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007460decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007461 wchar_t **buf,
7462 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 const char *in,
7464 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007465{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007466 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007467 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007469
7470 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007472 while ((outsize = MultiByteToWideChar(code_page, flags,
7473 in, insize, NULL, 0)) <= 0)
7474 {
7475 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7476 goto error;
7477 }
7478 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7479 flags = 0;
7480 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007481
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007482 /* Extend a wchar_t* buffer */
7483 Py_ssize_t n = *bufsize; /* Get the current length */
7484 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7485 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007486 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007487 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007488
7489 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7491 if (outsize <= 0)
7492 goto error;
7493 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007494
Victor Stinner3a50e702011-10-18 21:21:00 +02007495error:
7496 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7497 return -2;
7498 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007499 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007500}
7501
Victor Stinner3a50e702011-10-18 21:21:00 +02007502/*
7503 * Decode a byte string from a code page into unicode object with an error
7504 * handler.
7505 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007506 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007507 * UnicodeDecodeError exception and returns -1 on error.
7508 */
7509static int
7510decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007511 wchar_t **buf,
7512 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007513 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007514 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007515{
7516 const char *startin = in;
7517 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007518 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007519 /* Ideally, we should get reason from FormatMessage. This is the Windows
7520 2000 English version of the message. */
7521 const char *reason = "No mapping for the Unicode character exists "
7522 "in the target code page.";
7523 /* each step cannot decode more than 1 character, but a character can be
7524 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007525 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007526 int insize;
7527 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 PyObject *errorHandler = NULL;
7529 PyObject *exc = NULL;
7530 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007531 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 DWORD err;
7533 int ret = -1;
7534
7535 assert(size > 0);
7536
7537 encoding = code_page_name(code_page, &encoding_obj);
7538 if (encoding == NULL)
7539 return -1;
7540
Victor Stinner7d00cc12014-03-17 23:08:06 +01007541 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007542 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7543 UnicodeDecodeError. */
7544 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7545 if (exc != NULL) {
7546 PyCodec_StrictErrors(exc);
7547 Py_CLEAR(exc);
7548 }
7549 goto error;
7550 }
7551
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007552 /* Extend a wchar_t* buffer */
7553 Py_ssize_t n = *bufsize; /* Get the current length */
7554 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7555 PyErr_NoMemory();
7556 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007557 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007558 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7559 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007560 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007561 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007562
7563 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007564 while (in < endin)
7565 {
7566 /* Decode a character */
7567 insize = 1;
7568 do
7569 {
7570 outsize = MultiByteToWideChar(code_page, flags,
7571 in, insize,
7572 buffer, Py_ARRAY_LENGTH(buffer));
7573 if (outsize > 0)
7574 break;
7575 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007576 if (err == ERROR_INVALID_FLAGS && flags) {
7577 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7578 flags = 0;
7579 continue;
7580 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007581 if (err != ERROR_NO_UNICODE_TRANSLATION
7582 && err != ERROR_INSUFFICIENT_BUFFER)
7583 {
7584 PyErr_SetFromWindowsErr(0);
7585 goto error;
7586 }
7587 insize++;
7588 }
7589 /* 4=maximum length of a UTF-8 sequence */
7590 while (insize <= 4 && (in + insize) <= endin);
7591
7592 if (outsize <= 0) {
7593 Py_ssize_t startinpos, endinpos, outpos;
7594
Victor Stinner7d00cc12014-03-17 23:08:06 +01007595 /* last character in partial decode? */
7596 if (in + insize >= endin && !final)
7597 break;
7598
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 startinpos = in - startin;
7600 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007601 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007602 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 errors, &errorHandler,
7604 encoding, reason,
7605 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007606 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 {
7608 goto error;
7609 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007610 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 }
7612 else {
7613 in += insize;
7614 memcpy(out, buffer, outsize * sizeof(wchar_t));
7615 out += outsize;
7616 }
7617 }
7618
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007619 /* Shrink the buffer */
7620 assert(out - *buf <= *bufsize);
7621 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007622 /* (in - startin) <= size and size is an int */
7623 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007624
7625error:
7626 Py_XDECREF(encoding_obj);
7627 Py_XDECREF(errorHandler);
7628 Py_XDECREF(exc);
7629 return ret;
7630}
7631
Victor Stinner3a50e702011-10-18 21:21:00 +02007632static PyObject *
7633decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007634 const char *s, Py_ssize_t size,
7635 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007636{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007637 wchar_t *buf = NULL;
7638 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007639 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007640
Victor Stinner3a50e702011-10-18 21:21:00 +02007641 if (code_page < 0) {
7642 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7643 return NULL;
7644 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007645 if (size < 0) {
7646 PyErr_BadInternalCall();
7647 return NULL;
7648 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007649
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007650 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007652
Victor Stinner76a31a62011-11-04 00:05:13 +01007653 do
7654 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007655#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007656 if (size > DECODING_CHUNK_SIZE) {
7657 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007658 final = 0;
7659 done = 0;
7660 }
7661 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007662#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007663 {
7664 chunk_size = (int)size;
7665 final = (consumed == NULL);
7666 done = 1;
7667 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007668
Victor Stinner76a31a62011-11-04 00:05:13 +01007669 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007670 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007671 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007672 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007673 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007674
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007675 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007676 s, chunk_size);
7677 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007678 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007679 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007680 errors, final);
7681 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007682
7683 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007684 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007685 return NULL;
7686 }
7687
7688 if (consumed)
7689 *consumed += converted;
7690
7691 s += converted;
7692 size -= converted;
7693 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007694
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007695 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7696 PyMem_Free(buf);
7697 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007698}
7699
Alexander Belopolsky40018472011-02-26 01:02:56 +00007700PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007701PyUnicode_DecodeCodePageStateful(int code_page,
7702 const char *s,
7703 Py_ssize_t size,
7704 const char *errors,
7705 Py_ssize_t *consumed)
7706{
7707 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7708}
7709
7710PyObject *
7711PyUnicode_DecodeMBCSStateful(const char *s,
7712 Py_ssize_t size,
7713 const char *errors,
7714 Py_ssize_t *consumed)
7715{
7716 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7717}
7718
7719PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007720PyUnicode_DecodeMBCS(const char *s,
7721 Py_ssize_t size,
7722 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007723{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007724 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7725}
7726
Victor Stinner3a50e702011-10-18 21:21:00 +02007727static DWORD
7728encode_code_page_flags(UINT code_page, const char *errors)
7729{
7730 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007731 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007732 }
7733 else if (code_page == CP_UTF7) {
7734 /* CP_UTF7 only supports flags=0 */
7735 return 0;
7736 }
7737 else {
7738 if (errors != NULL && strcmp(errors, "replace") == 0)
7739 return 0;
7740 else
7741 return WC_NO_BEST_FIT_CHARS;
7742 }
7743}
7744
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007745/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007746 * Encode a Unicode string to a Windows code page into a byte string in strict
7747 * mode.
7748 *
7749 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007750 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007751 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007752static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007753encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007754 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007755 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007756{
Victor Stinner554f3f02010-06-16 23:33:54 +00007757 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007758 BOOL *pusedDefaultChar = &usedDefaultChar;
7759 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007760 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007761 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007762 const DWORD flags = encode_code_page_flags(code_page, NULL);
7763 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007764 /* Create a substring so that we can get the UTF-16 representation
7765 of just the slice under consideration. */
7766 PyObject *substring;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007767 int ret = -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007768
Martin v. Löwis3d325192011-11-04 18:23:06 +01007769 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007770
Victor Stinner3a50e702011-10-18 21:21:00 +02007771 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007772 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007773 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007774 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007775
Victor Stinner2fc507f2011-11-04 20:06:39 +01007776 substring = PyUnicode_Substring(unicode, offset, offset+len);
7777 if (substring == NULL)
7778 return -1;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007779#if USE_UNICODE_WCHAR_CACHE
7780_Py_COMP_DIAG_PUSH
7781_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner2fc507f2011-11-04 20:06:39 +01007782 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7783 if (p == NULL) {
7784 Py_DECREF(substring);
7785 return -1;
7786 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007787_Py_COMP_DIAG_POP
7788#else /* USE_UNICODE_WCHAR_CACHE */
7789 p = PyUnicode_AsWideCharString(substring, &size);
7790 Py_CLEAR(substring);
7791 if (p == NULL) {
7792 return -1;
7793 }
7794#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinner9f067f42013-06-05 00:21:31 +02007795 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007796
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007797 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007798 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007799 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007800 NULL, 0,
7801 NULL, pusedDefaultChar);
7802 if (outsize <= 0)
7803 goto error;
7804 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007805 if (pusedDefaultChar && *pusedDefaultChar) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007806 ret = -2;
7807 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007808 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007809
Victor Stinner3a50e702011-10-18 21:21:00 +02007810 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007812 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007813 if (*outbytes == NULL) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007814 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007815 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007816 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007817 }
7818 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007820 const Py_ssize_t n = PyBytes_Size(*outbytes);
7821 if (outsize > PY_SSIZE_T_MAX - n) {
7822 PyErr_NoMemory();
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007823 goto done;
Victor Stinner3a50e702011-10-18 21:21:00 +02007824 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007825 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007826 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007827 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007828 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007829 }
7830
7831 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007832 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007833 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007834 out, outsize,
7835 NULL, pusedDefaultChar);
7836 if (outsize <= 0)
7837 goto error;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007838 if (pusedDefaultChar && *pusedDefaultChar) {
7839 ret = -2;
7840 goto done;
7841 }
7842 ret = 0;
7843
7844done:
7845#if USE_UNICODE_WCHAR_CACHE
7846 Py_DECREF(substring);
7847#else /* USE_UNICODE_WCHAR_CACHE */
7848 PyMem_Free(p);
7849#endif /* USE_UNICODE_WCHAR_CACHE */
7850 return ret;
Victor Stinner554f3f02010-06-16 23:33:54 +00007851
Victor Stinner3a50e702011-10-18 21:21:00 +02007852error:
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007853 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7854 ret = -2;
7855 goto done;
7856 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007857 PyErr_SetFromWindowsErr(0);
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007858 goto done;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007859}
7860
Victor Stinner3a50e702011-10-18 21:21:00 +02007861/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007862 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007863 * error handler.
7864 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007865 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007866 * -1 on other error.
7867 */
7868static int
7869encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007870 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007871 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007872{
Victor Stinner3a50e702011-10-18 21:21:00 +02007873 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007874 Py_ssize_t pos = unicode_offset;
7875 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007876 /* Ideally, we should get reason from FormatMessage. This is the Windows
7877 2000 English version of the message. */
7878 const char *reason = "invalid character";
7879 /* 4=maximum length of a UTF-8 sequence */
7880 char buffer[4];
7881 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7882 Py_ssize_t outsize;
7883 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007884 PyObject *errorHandler = NULL;
7885 PyObject *exc = NULL;
7886 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007887 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007888 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007889 PyObject *rep;
7890 int ret = -1;
7891
7892 assert(insize > 0);
7893
7894 encoding = code_page_name(code_page, &encoding_obj);
7895 if (encoding == NULL)
7896 return -1;
7897
7898 if (errors == NULL || strcmp(errors, "strict") == 0) {
7899 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7900 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007901 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007902 if (exc != NULL) {
7903 PyCodec_StrictErrors(exc);
7904 Py_DECREF(exc);
7905 }
7906 Py_XDECREF(encoding_obj);
7907 return -1;
7908 }
7909
7910 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7911 pusedDefaultChar = &usedDefaultChar;
7912 else
7913 pusedDefaultChar = NULL;
7914
7915 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7916 PyErr_NoMemory();
7917 goto error;
7918 }
7919 outsize = insize * Py_ARRAY_LENGTH(buffer);
7920
7921 if (*outbytes == NULL) {
7922 /* Create string object */
7923 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7924 if (*outbytes == NULL)
7925 goto error;
7926 out = PyBytes_AS_STRING(*outbytes);
7927 }
7928 else {
7929 /* Extend string object */
7930 Py_ssize_t n = PyBytes_Size(*outbytes);
7931 if (n > PY_SSIZE_T_MAX - outsize) {
7932 PyErr_NoMemory();
7933 goto error;
7934 }
7935 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7936 goto error;
7937 out = PyBytes_AS_STRING(*outbytes) + n;
7938 }
7939
7940 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007941 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007942 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007943 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7944 wchar_t chars[2];
7945 int charsize;
7946 if (ch < 0x10000) {
7947 chars[0] = (wchar_t)ch;
7948 charsize = 1;
7949 }
7950 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007951 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7952 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007953 charsize = 2;
7954 }
7955
Victor Stinner3a50e702011-10-18 21:21:00 +02007956 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007957 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007958 buffer, Py_ARRAY_LENGTH(buffer),
7959 NULL, pusedDefaultChar);
7960 if (outsize > 0) {
7961 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7962 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007963 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007964 memcpy(out, buffer, outsize);
7965 out += outsize;
7966 continue;
7967 }
7968 }
7969 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7970 PyErr_SetFromWindowsErr(0);
7971 goto error;
7972 }
7973
Victor Stinner3a50e702011-10-18 21:21:00 +02007974 rep = unicode_encode_call_errorhandler(
7975 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007976 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007977 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007978 if (rep == NULL)
7979 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007980 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007981
7982 if (PyBytes_Check(rep)) {
7983 outsize = PyBytes_GET_SIZE(rep);
7984 if (outsize != 1) {
7985 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7986 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7987 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7988 Py_DECREF(rep);
7989 goto error;
7990 }
7991 out = PyBytes_AS_STRING(*outbytes) + offset;
7992 }
7993 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7994 out += outsize;
7995 }
7996 else {
7997 Py_ssize_t i;
7998 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007999 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02008000
Benjamin Petersonbac79492012-01-14 13:34:47 -05008001 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02008002 Py_DECREF(rep);
8003 goto error;
8004 }
8005
8006 outsize = PyUnicode_GET_LENGTH(rep);
8007 if (outsize != 1) {
8008 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8009 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8010 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8011 Py_DECREF(rep);
8012 goto error;
8013 }
8014 out = PyBytes_AS_STRING(*outbytes) + offset;
8015 }
8016 kind = PyUnicode_KIND(rep);
8017 data = PyUnicode_DATA(rep);
8018 for (i=0; i < outsize; i++) {
8019 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8020 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008021 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008022 encoding, unicode,
8023 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02008024 "unable to encode error handler result to ASCII");
8025 Py_DECREF(rep);
8026 goto error;
8027 }
8028 *out = (unsigned char)ch;
8029 out++;
8030 }
8031 }
8032 Py_DECREF(rep);
8033 }
8034 /* write a NUL byte */
8035 *out = 0;
8036 outsize = out - PyBytes_AS_STRING(*outbytes);
8037 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8038 if (_PyBytes_Resize(outbytes, outsize) < 0)
8039 goto error;
8040 ret = 0;
8041
8042error:
8043 Py_XDECREF(encoding_obj);
8044 Py_XDECREF(errorHandler);
8045 Py_XDECREF(exc);
8046 return ret;
8047}
8048
Victor Stinner3a50e702011-10-18 21:21:00 +02008049static PyObject *
8050encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008051 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008052 const char *errors)
8053{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008054 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008055 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008056 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008057 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008058
Victor Stinner29dacf22015-01-26 16:41:32 +01008059 if (!PyUnicode_Check(unicode)) {
8060 PyErr_BadArgument();
8061 return NULL;
8062 }
8063
Benjamin Petersonbac79492012-01-14 13:34:47 -05008064 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008065 return NULL;
8066 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008067
Victor Stinner3a50e702011-10-18 21:21:00 +02008068 if (code_page < 0) {
8069 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8070 return NULL;
8071 }
8072
Martin v. Löwis3d325192011-11-04 18:23:06 +01008073 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008074 return PyBytes_FromStringAndSize(NULL, 0);
8075
Victor Stinner7581cef2011-11-03 22:32:33 +01008076 offset = 0;
8077 do
8078 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008079#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008080 if (len > DECODING_CHUNK_SIZE) {
8081 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008082 done = 0;
8083 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008084 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008085#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008086 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008087 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008088 done = 1;
8089 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008090
Victor Stinner76a31a62011-11-04 00:05:13 +01008091 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008092 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008093 errors);
8094 if (ret == -2)
8095 ret = encode_code_page_errors(code_page, &outbytes,
8096 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008097 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008098 if (ret < 0) {
8099 Py_XDECREF(outbytes);
8100 return NULL;
8101 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008102
Victor Stinner7581cef2011-11-03 22:32:33 +01008103 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008104 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008105 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008106
Victor Stinner3a50e702011-10-18 21:21:00 +02008107 return outbytes;
8108}
8109
8110PyObject *
8111PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8112 Py_ssize_t size,
8113 const char *errors)
8114{
Victor Stinner7581cef2011-11-03 22:32:33 +01008115 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008116 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008117 if (unicode == NULL)
8118 return NULL;
8119 res = encode_code_page(CP_ACP, unicode, errors);
8120 Py_DECREF(unicode);
8121 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008122}
8123
8124PyObject *
8125PyUnicode_EncodeCodePage(int code_page,
8126 PyObject *unicode,
8127 const char *errors)
8128{
Victor Stinner7581cef2011-11-03 22:32:33 +01008129 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008130}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008131
Alexander Belopolsky40018472011-02-26 01:02:56 +00008132PyObject *
8133PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008134{
Victor Stinner7581cef2011-11-03 22:32:33 +01008135 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008136}
8137
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008138#undef NEED_RETRY
8139
Steve Dowercc16be82016-09-08 10:35:16 -07008140#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008141
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142/* --- Character Mapping Codec -------------------------------------------- */
8143
Victor Stinnerfb161b12013-04-18 01:44:27 +02008144static int
8145charmap_decode_string(const char *s,
8146 Py_ssize_t size,
8147 PyObject *mapping,
8148 const char *errors,
8149 _PyUnicodeWriter *writer)
8150{
8151 const char *starts = s;
8152 const char *e;
8153 Py_ssize_t startinpos, endinpos;
8154 PyObject *errorHandler = NULL, *exc = NULL;
8155 Py_ssize_t maplen;
8156 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008157 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008158 Py_UCS4 x;
8159 unsigned char ch;
8160
8161 if (PyUnicode_READY(mapping) == -1)
8162 return -1;
8163
8164 maplen = PyUnicode_GET_LENGTH(mapping);
8165 mapdata = PyUnicode_DATA(mapping);
8166 mapkind = PyUnicode_KIND(mapping);
8167
8168 e = s + size;
8169
8170 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8171 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8172 * is disabled in encoding aliases, latin1 is preferred because
8173 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008174 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008175 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8176 Py_UCS4 maxchar = writer->maxchar;
8177
8178 assert (writer->kind == PyUnicode_1BYTE_KIND);
8179 while (s < e) {
8180 ch = *s;
8181 x = mapdata_ucs1[ch];
8182 if (x > maxchar) {
8183 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8184 goto onError;
8185 maxchar = writer->maxchar;
8186 outdata = (Py_UCS1 *)writer->data;
8187 }
8188 outdata[writer->pos] = x;
8189 writer->pos++;
8190 ++s;
8191 }
8192 return 0;
8193 }
8194
8195 while (s < e) {
8196 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8197 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008198 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008199 if (outkind == PyUnicode_1BYTE_KIND) {
8200 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8201 Py_UCS4 maxchar = writer->maxchar;
8202 while (s < e) {
8203 ch = *s;
8204 x = mapdata_ucs2[ch];
8205 if (x > maxchar)
8206 goto Error;
8207 outdata[writer->pos] = x;
8208 writer->pos++;
8209 ++s;
8210 }
8211 break;
8212 }
8213 else if (outkind == PyUnicode_2BYTE_KIND) {
8214 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8215 while (s < e) {
8216 ch = *s;
8217 x = mapdata_ucs2[ch];
8218 if (x == 0xFFFE)
8219 goto Error;
8220 outdata[writer->pos] = x;
8221 writer->pos++;
8222 ++s;
8223 }
8224 break;
8225 }
8226 }
8227 ch = *s;
8228
8229 if (ch < maplen)
8230 x = PyUnicode_READ(mapkind, mapdata, ch);
8231 else
8232 x = 0xfffe; /* invalid value */
8233Error:
8234 if (x == 0xfffe)
8235 {
8236 /* undefined mapping */
8237 startinpos = s-starts;
8238 endinpos = startinpos+1;
8239 if (unicode_decode_call_errorhandler_writer(
8240 errors, &errorHandler,
8241 "charmap", "character maps to <undefined>",
8242 &starts, &e, &startinpos, &endinpos, &exc, &s,
8243 writer)) {
8244 goto onError;
8245 }
8246 continue;
8247 }
8248
8249 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8250 goto onError;
8251 ++s;
8252 }
8253 Py_XDECREF(errorHandler);
8254 Py_XDECREF(exc);
8255 return 0;
8256
8257onError:
8258 Py_XDECREF(errorHandler);
8259 Py_XDECREF(exc);
8260 return -1;
8261}
8262
8263static int
8264charmap_decode_mapping(const char *s,
8265 Py_ssize_t size,
8266 PyObject *mapping,
8267 const char *errors,
8268 _PyUnicodeWriter *writer)
8269{
8270 const char *starts = s;
8271 const char *e;
8272 Py_ssize_t startinpos, endinpos;
8273 PyObject *errorHandler = NULL, *exc = NULL;
8274 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008275 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008276
8277 e = s + size;
8278
8279 while (s < e) {
8280 ch = *s;
8281
8282 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8283 key = PyLong_FromLong((long)ch);
8284 if (key == NULL)
8285 goto onError;
8286
8287 item = PyObject_GetItem(mapping, key);
8288 Py_DECREF(key);
8289 if (item == NULL) {
8290 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8291 /* No mapping found means: mapping is undefined. */
8292 PyErr_Clear();
8293 goto Undefined;
8294 } else
8295 goto onError;
8296 }
8297
8298 /* Apply mapping */
8299 if (item == Py_None)
8300 goto Undefined;
8301 if (PyLong_Check(item)) {
8302 long value = PyLong_AS_LONG(item);
8303 if (value == 0xFFFE)
8304 goto Undefined;
8305 if (value < 0 || value > MAX_UNICODE) {
8306 PyErr_Format(PyExc_TypeError,
8307 "character mapping must be in range(0x%lx)",
8308 (unsigned long)MAX_UNICODE + 1);
8309 goto onError;
8310 }
8311
8312 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8313 goto onError;
8314 }
8315 else if (PyUnicode_Check(item)) {
8316 if (PyUnicode_READY(item) == -1)
8317 goto onError;
8318 if (PyUnicode_GET_LENGTH(item) == 1) {
8319 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8320 if (value == 0xFFFE)
8321 goto Undefined;
8322 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8323 goto onError;
8324 }
8325 else {
8326 writer->overallocate = 1;
8327 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8328 goto onError;
8329 }
8330 }
8331 else {
8332 /* wrong return value */
8333 PyErr_SetString(PyExc_TypeError,
8334 "character mapping must return integer, None or str");
8335 goto onError;
8336 }
8337 Py_CLEAR(item);
8338 ++s;
8339 continue;
8340
8341Undefined:
8342 /* undefined mapping */
8343 Py_CLEAR(item);
8344 startinpos = s-starts;
8345 endinpos = startinpos+1;
8346 if (unicode_decode_call_errorhandler_writer(
8347 errors, &errorHandler,
8348 "charmap", "character maps to <undefined>",
8349 &starts, &e, &startinpos, &endinpos, &exc, &s,
8350 writer)) {
8351 goto onError;
8352 }
8353 }
8354 Py_XDECREF(errorHandler);
8355 Py_XDECREF(exc);
8356 return 0;
8357
8358onError:
8359 Py_XDECREF(item);
8360 Py_XDECREF(errorHandler);
8361 Py_XDECREF(exc);
8362 return -1;
8363}
8364
Alexander Belopolsky40018472011-02-26 01:02:56 +00008365PyObject *
8366PyUnicode_DecodeCharmap(const char *s,
8367 Py_ssize_t size,
8368 PyObject *mapping,
8369 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008371 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008372
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373 /* Default to Latin-1 */
8374 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008378 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008379 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008380 writer.min_length = size;
8381 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008383
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008384 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008385 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8386 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008387 }
8388 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008389 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8390 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008392 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008393
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008395 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 return NULL;
8397}
8398
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008399/* Charmap encoding: the lookup table */
8400
Alexander Belopolsky40018472011-02-26 01:02:56 +00008401struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 PyObject_HEAD
8403 unsigned char level1[32];
8404 int count2, count3;
8405 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008406};
8407
8408static PyObject*
8409encoding_map_size(PyObject *obj, PyObject* args)
8410{
8411 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008412 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008414}
8415
8416static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008417 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 PyDoc_STR("Return the size (in bytes) of this object") },
8419 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008420};
8421
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008422static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008423 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 "EncodingMap", /*tp_name*/
8425 sizeof(struct encoding_map), /*tp_basicsize*/
8426 0, /*tp_itemsize*/
8427 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008428 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008429 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 0, /*tp_getattr*/
8431 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008432 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 0, /*tp_repr*/
8434 0, /*tp_as_number*/
8435 0, /*tp_as_sequence*/
8436 0, /*tp_as_mapping*/
8437 0, /*tp_hash*/
8438 0, /*tp_call*/
8439 0, /*tp_str*/
8440 0, /*tp_getattro*/
8441 0, /*tp_setattro*/
8442 0, /*tp_as_buffer*/
8443 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8444 0, /*tp_doc*/
8445 0, /*tp_traverse*/
8446 0, /*tp_clear*/
8447 0, /*tp_richcompare*/
8448 0, /*tp_weaklistoffset*/
8449 0, /*tp_iter*/
8450 0, /*tp_iternext*/
8451 encoding_map_methods, /*tp_methods*/
8452 0, /*tp_members*/
8453 0, /*tp_getset*/
8454 0, /*tp_base*/
8455 0, /*tp_dict*/
8456 0, /*tp_descr_get*/
8457 0, /*tp_descr_set*/
8458 0, /*tp_dictoffset*/
8459 0, /*tp_init*/
8460 0, /*tp_alloc*/
8461 0, /*tp_new*/
8462 0, /*tp_free*/
8463 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008464};
8465
8466PyObject*
8467PyUnicode_BuildEncodingMap(PyObject* string)
8468{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008469 PyObject *result;
8470 struct encoding_map *mresult;
8471 int i;
8472 int need_dict = 0;
8473 unsigned char level1[32];
8474 unsigned char level2[512];
8475 unsigned char *mlevel1, *mlevel2, *mlevel3;
8476 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008477 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008478 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008479 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008481
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008482 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008483 PyErr_BadArgument();
8484 return NULL;
8485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486 kind = PyUnicode_KIND(string);
8487 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008488 length = PyUnicode_GET_LENGTH(string);
8489 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008490 memset(level1, 0xFF, sizeof level1);
8491 memset(level2, 0xFF, sizeof level2);
8492
8493 /* If there isn't a one-to-one mapping of NULL to \0,
8494 or if there are non-BMP characters, we need to use
8495 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008497 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008498 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008499 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008500 ch = PyUnicode_READ(kind, data, i);
8501 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008502 need_dict = 1;
8503 break;
8504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008506 /* unmapped character */
8507 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 l1 = ch >> 11;
8509 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008510 if (level1[l1] == 0xFF)
8511 level1[l1] = count2++;
8512 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008513 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008514 }
8515
8516 if (count2 >= 0xFF || count3 >= 0xFF)
8517 need_dict = 1;
8518
8519 if (need_dict) {
8520 PyObject *result = PyDict_New();
8521 PyObject *key, *value;
8522 if (!result)
8523 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008524 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008526 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008527 if (!key || !value)
8528 goto failed1;
8529 if (PyDict_SetItem(result, key, value) == -1)
8530 goto failed1;
8531 Py_DECREF(key);
8532 Py_DECREF(value);
8533 }
8534 return result;
8535 failed1:
8536 Py_XDECREF(key);
8537 Py_XDECREF(value);
8538 Py_DECREF(result);
8539 return NULL;
8540 }
8541
8542 /* Create a three-level trie */
8543 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8544 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008545 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008546 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008547 }
8548
8549 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008550 mresult = (struct encoding_map*)result;
8551 mresult->count2 = count2;
8552 mresult->count3 = count3;
8553 mlevel1 = mresult->level1;
8554 mlevel2 = mresult->level23;
8555 mlevel3 = mresult->level23 + 16*count2;
8556 memcpy(mlevel1, level1, 32);
8557 memset(mlevel2, 0xFF, 16*count2);
8558 memset(mlevel3, 0, 128*count3);
8559 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008560 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008561 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008562 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8563 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008564 /* unmapped character */
8565 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008566 o1 = ch>>11;
8567 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008568 i2 = 16*mlevel1[o1] + o2;
8569 if (mlevel2[i2] == 0xFF)
8570 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008571 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008572 i3 = 128*mlevel2[i2] + o3;
8573 mlevel3[i3] = i;
8574 }
8575 return result;
8576}
8577
8578static int
Victor Stinner22168992011-11-20 17:09:18 +01008579encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008580{
8581 struct encoding_map *map = (struct encoding_map*)mapping;
8582 int l1 = c>>11;
8583 int l2 = (c>>7) & 0xF;
8584 int l3 = c & 0x7F;
8585 int i;
8586
Victor Stinner22168992011-11-20 17:09:18 +01008587 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008589 if (c == 0)
8590 return 0;
8591 /* level 1*/
8592 i = map->level1[l1];
8593 if (i == 0xFF) {
8594 return -1;
8595 }
8596 /* level 2*/
8597 i = map->level23[16*i+l2];
8598 if (i == 0xFF) {
8599 return -1;
8600 }
8601 /* level 3 */
8602 i = map->level23[16*map->count2 + 128*i + l3];
8603 if (i == 0) {
8604 return -1;
8605 }
8606 return i;
8607}
8608
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008609/* Lookup the character ch in the mapping. If the character
8610 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008611 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008612static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008613charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614{
Christian Heimes217cfd12007-12-02 14:31:20 +00008615 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616 PyObject *x;
8617
8618 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 x = PyObject_GetItem(mapping, w);
8621 Py_DECREF(w);
8622 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008623 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8624 /* No mapping found means: mapping is undefined. */
8625 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008626 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 } else
8628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008630 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008632 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 long value = PyLong_AS_LONG(x);
8634 if (value < 0 || value > 255) {
8635 PyErr_SetString(PyExc_TypeError,
8636 "character mapping must be in range(256)");
8637 Py_DECREF(x);
8638 return NULL;
8639 }
8640 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008642 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 /* wrong return value */
8646 PyErr_Format(PyExc_TypeError,
8647 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008648 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 Py_DECREF(x);
8650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 }
8652}
8653
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008654static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008655charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008656{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008657 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8658 /* exponentially overallocate to minimize reallocations */
8659 if (requiredsize < 2*outsize)
8660 requiredsize = 2*outsize;
8661 if (_PyBytes_Resize(outobj, requiredsize))
8662 return -1;
8663 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008664}
8665
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008670 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 space is available. Return a new reference to the object that
8672 was put in the output buffer, or Py_None, if the mapping was undefined
8673 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008674 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008675static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008676charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008677 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008678{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008679 PyObject *rep;
8680 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008681 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682
Andy Lesterdffe4c02020-03-04 07:15:20 -06008683 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008684 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008686 if (res == -1)
8687 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 if (outsize<requiredsize)
8689 if (charmapencode_resize(outobj, outpos, requiredsize))
8690 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008691 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 outstart[(*outpos)++] = (char)res;
8693 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008694 }
8695
8696 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008699 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 Py_DECREF(rep);
8701 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008702 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 if (PyLong_Check(rep)) {
8704 Py_ssize_t requiredsize = *outpos+1;
8705 if (outsize<requiredsize)
8706 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8707 Py_DECREF(rep);
8708 return enc_EXCEPTION;
8709 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008710 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008712 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 else {
8714 const char *repchars = PyBytes_AS_STRING(rep);
8715 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8716 Py_ssize_t requiredsize = *outpos+repsize;
8717 if (outsize<requiredsize)
8718 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8719 Py_DECREF(rep);
8720 return enc_EXCEPTION;
8721 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008722 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 memcpy(outstart + *outpos, repchars, repsize);
8724 *outpos += repsize;
8725 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008726 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008727 Py_DECREF(rep);
8728 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008729}
8730
8731/* handle an error in PyUnicode_EncodeCharmap
8732 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008733static int
8734charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008735 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008736 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008737 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008738 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008739{
8740 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008741 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008742 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008743 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008744 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008745 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008747 Py_ssize_t collstartpos = *inpos;
8748 Py_ssize_t collendpos = *inpos+1;
8749 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008750 const char *encoding = "charmap";
8751 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008752 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008753 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008754 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755
Benjamin Petersonbac79492012-01-14 13:34:47 -05008756 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008757 return -1;
8758 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008759 /* find all unencodable characters */
8760 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008761 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008762 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008763 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008764 val = encoding_map_lookup(ch, mapping);
8765 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 break;
8767 ++collendpos;
8768 continue;
8769 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008770
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008771 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8772 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 if (rep==NULL)
8774 return -1;
8775 else if (rep!=Py_None) {
8776 Py_DECREF(rep);
8777 break;
8778 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008779 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781 }
8782 /* cache callback name lookup
8783 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008784 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008785 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008786
8787 switch (*error_handler) {
8788 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008789 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008790 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008791
8792 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008793 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008794 x = charmapencode_output('?', mapping, res, respos);
8795 if (x==enc_EXCEPTION) {
8796 return -1;
8797 }
8798 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008799 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 return -1;
8801 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008802 }
8803 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008804 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008805 *inpos = collendpos;
8806 break;
Victor Stinner50149202015-09-22 00:26:54 +02008807
8808 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008809 /* generate replacement (temporarily (mis)uses p) */
8810 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 char buffer[2+29+1+1];
8812 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008813 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008814 for (cp = buffer; *cp; ++cp) {
8815 x = charmapencode_output(*cp, mapping, res, respos);
8816 if (x==enc_EXCEPTION)
8817 return -1;
8818 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008819 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008820 return -1;
8821 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008822 }
8823 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008824 *inpos = collendpos;
8825 break;
Victor Stinner50149202015-09-22 00:26:54 +02008826
Benjamin Peterson14339b62009-01-31 16:36:08 +00008827 default:
Victor Stinner50149202015-09-22 00:26:54 +02008828 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008829 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008831 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008833 if (PyBytes_Check(repunicode)) {
8834 /* Directly copy bytes result to output. */
8835 Py_ssize_t outsize = PyBytes_Size(*res);
8836 Py_ssize_t requiredsize;
8837 repsize = PyBytes_Size(repunicode);
8838 requiredsize = *respos + repsize;
8839 if (requiredsize > outsize)
8840 /* Make room for all additional bytes. */
8841 if (charmapencode_resize(res, respos, requiredsize)) {
8842 Py_DECREF(repunicode);
8843 return -1;
8844 }
8845 memcpy(PyBytes_AsString(*res) + *respos,
8846 PyBytes_AsString(repunicode), repsize);
8847 *respos += repsize;
8848 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008849 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008850 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008851 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008852 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008853 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008854 Py_DECREF(repunicode);
8855 return -1;
8856 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008857 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008858 data = PyUnicode_DATA(repunicode);
8859 kind = PyUnicode_KIND(repunicode);
8860 for (index = 0; index < repsize; index++) {
8861 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8862 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008864 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008865 return -1;
8866 }
8867 else if (x==enc_FAILED) {
8868 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008869 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008870 return -1;
8871 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008872 }
8873 *inpos = newpos;
8874 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008875 }
8876 return 0;
8877}
8878
Alexander Belopolsky40018472011-02-26 01:02:56 +00008879PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008880_PyUnicode_EncodeCharmap(PyObject *unicode,
8881 PyObject *mapping,
8882 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008884 /* output object */
8885 PyObject *res = NULL;
8886 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008887 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008888 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008889 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008890 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008891 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008892 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008893 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008894 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008895 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896
Benjamin Petersonbac79492012-01-14 13:34:47 -05008897 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008898 return NULL;
8899 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008900 data = PyUnicode_DATA(unicode);
8901 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008902
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 /* Default to Latin-1 */
8904 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008905 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008906
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008907 /* allocate enough for a simple encoding without
8908 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008909 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008910 if (res == NULL)
8911 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008912 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008913 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008915 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008916 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008917 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008918 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008919 if (x==enc_EXCEPTION) /* error */
8920 goto onError;
8921 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008922 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008924 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008925 &res, &respos)) {
8926 goto onError;
8927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008928 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 else
8930 /* done with this character => adjust input position */
8931 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008934 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008935 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008936 if (_PyBytes_Resize(&res, respos) < 0)
8937 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008938
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008939 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008940 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008941 return res;
8942
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008944 Py_XDECREF(res);
8945 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008946 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947 return NULL;
8948}
8949
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008950/* Deprecated */
8951PyObject *
8952PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8953 Py_ssize_t size,
8954 PyObject *mapping,
8955 const char *errors)
8956{
8957 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008958 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008959 if (unicode == NULL)
8960 return NULL;
8961 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8962 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008963 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008964}
8965
Alexander Belopolsky40018472011-02-26 01:02:56 +00008966PyObject *
8967PyUnicode_AsCharmapString(PyObject *unicode,
8968 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969{
8970 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 PyErr_BadArgument();
8972 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008974 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975}
8976
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008977/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008978static void
8979make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008981 Py_ssize_t startpos, Py_ssize_t endpos,
8982 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008984 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 *exceptionObject = _PyUnicodeTranslateError_Create(
8986 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987 }
8988 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8990 goto onError;
8991 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8992 goto onError;
8993 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8994 goto onError;
8995 return;
8996 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008997 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998 }
8999}
9000
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009001/* error handling callback helper:
9002 build arguments, call the callback and check the arguments,
9003 put the result into newpos and return the replacement string, which
9004 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009005static PyObject *
9006unicode_translate_call_errorhandler(const char *errors,
9007 PyObject **errorHandler,
9008 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009010 Py_ssize_t startpos, Py_ssize_t endpos,
9011 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009012{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009013 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009014
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009015 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009016 PyObject *restuple;
9017 PyObject *resunicode;
9018
9019 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009021 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009022 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009023 }
9024
9025 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009027 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009028 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009029
Petr Viktorinffd97532020-02-11 17:46:57 +01009030 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009031 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009033 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009034 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00009035 Py_DECREF(restuple);
9036 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009037 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009038 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009039 &resunicode, &i_newpos)) {
9040 Py_DECREF(restuple);
9041 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009042 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009043 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009045 else
9046 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009048 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009049 Py_DECREF(restuple);
9050 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009051 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009052 Py_INCREF(resunicode);
9053 Py_DECREF(restuple);
9054 return resunicode;
9055}
9056
9057/* Lookup the character ch in the mapping and put the result in result,
9058 which must be decrefed by the caller.
9059 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009060static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009062{
Christian Heimes217cfd12007-12-02 14:31:20 +00009063 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009064 PyObject *x;
9065
9066 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009067 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009068 x = PyObject_GetItem(mapping, w);
9069 Py_DECREF(w);
9070 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009071 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9072 /* No mapping found means: use 1:1 mapping. */
9073 PyErr_Clear();
9074 *result = NULL;
9075 return 0;
9076 } else
9077 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009078 }
9079 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009080 *result = x;
9081 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009082 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009083 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009084 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009085 if (value < 0 || value > MAX_UNICODE) {
9086 PyErr_Format(PyExc_ValueError,
9087 "character mapping must be in range(0x%x)",
9088 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 Py_DECREF(x);
9090 return -1;
9091 }
9092 *result = x;
9093 return 0;
9094 }
9095 else if (PyUnicode_Check(x)) {
9096 *result = x;
9097 return 0;
9098 }
9099 else {
9100 /* wrong return value */
9101 PyErr_SetString(PyExc_TypeError,
9102 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009103 Py_DECREF(x);
9104 return -1;
9105 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009106}
Victor Stinner1194ea02014-04-04 19:37:40 +02009107
9108/* lookup the character, write the result into the writer.
9109 Return 1 if the result was written into the writer, return 0 if the mapping
9110 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009111static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009112charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9113 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009114{
Victor Stinner1194ea02014-04-04 19:37:40 +02009115 PyObject *item;
9116
9117 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009118 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009119
9120 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009121 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009122 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009123 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009124 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009125 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009126 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009127
9128 if (item == Py_None) {
9129 Py_DECREF(item);
9130 return 0;
9131 }
9132
9133 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009134 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9135 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9136 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009137 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9138 Py_DECREF(item);
9139 return -1;
9140 }
9141 Py_DECREF(item);
9142 return 1;
9143 }
9144
9145 if (!PyUnicode_Check(item)) {
9146 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009148 }
9149
9150 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9151 Py_DECREF(item);
9152 return -1;
9153 }
9154
9155 Py_DECREF(item);
9156 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009157}
9158
Victor Stinner89a76ab2014-04-05 11:44:04 +02009159static int
9160unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9161 Py_UCS1 *translate)
9162{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009163 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009164 int ret = 0;
9165
Victor Stinner89a76ab2014-04-05 11:44:04 +02009166 if (charmaptranslate_lookup(ch, mapping, &item)) {
9167 return -1;
9168 }
9169
9170 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009171 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009172 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009173 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009174 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009175 /* not found => default to 1:1 mapping */
9176 translate[ch] = ch;
9177 return 1;
9178 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009179 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009180 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009181 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9182 used it */
9183 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009184 /* invalid character or character outside ASCII:
9185 skip the fast translate */
9186 goto exit;
9187 }
9188 translate[ch] = (Py_UCS1)replace;
9189 }
9190 else if (PyUnicode_Check(item)) {
9191 Py_UCS4 replace;
9192
9193 if (PyUnicode_READY(item) == -1) {
9194 Py_DECREF(item);
9195 return -1;
9196 }
9197 if (PyUnicode_GET_LENGTH(item) != 1)
9198 goto exit;
9199
9200 replace = PyUnicode_READ_CHAR(item, 0);
9201 if (replace > 127)
9202 goto exit;
9203 translate[ch] = (Py_UCS1)replace;
9204 }
9205 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009206 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009207 goto exit;
9208 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009209 ret = 1;
9210
Benjamin Peterson1365de72014-04-07 20:15:41 -04009211 exit:
9212 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009213 return ret;
9214}
9215
9216/* Fast path for ascii => ascii translation. Return 1 if the whole string
9217 was translated into writer, return 0 if the input string was partially
9218 translated into writer, raise an exception and return -1 on error. */
9219static int
9220unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009221 _PyUnicodeWriter *writer, int ignore,
9222 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009223{
Victor Stinner872b2912014-04-05 14:27:07 +02009224 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009225 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009226 const Py_UCS1 *in, *end;
9227 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009228 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009229
Victor Stinner89a76ab2014-04-05 11:44:04 +02009230 len = PyUnicode_GET_LENGTH(input);
9231
Victor Stinner872b2912014-04-05 14:27:07 +02009232 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009233
9234 in = PyUnicode_1BYTE_DATA(input);
9235 end = in + len;
9236
9237 assert(PyUnicode_IS_ASCII(writer->buffer));
9238 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9239 out = PyUnicode_1BYTE_DATA(writer->buffer);
9240
Victor Stinner872b2912014-04-05 14:27:07 +02009241 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009242 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009243 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009244 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009245 int translate = unicode_fast_translate_lookup(mapping, ch,
9246 ascii_table);
9247 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009248 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009249 if (translate == 0)
9250 goto exit;
9251 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009252 }
Victor Stinner872b2912014-04-05 14:27:07 +02009253 if (ch2 == 0xfe) {
9254 if (ignore)
9255 continue;
9256 goto exit;
9257 }
9258 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009259 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009260 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009261 }
Victor Stinner872b2912014-04-05 14:27:07 +02009262 res = 1;
9263
9264exit:
9265 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009266 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009267 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009268}
9269
Victor Stinner3222da22015-10-01 22:07:32 +02009270static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271_PyUnicode_TranslateCharmap(PyObject *input,
9272 PyObject *mapping,
9273 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009274{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009276 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 Py_ssize_t size, i;
9278 int kind;
9279 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009280 _PyUnicodeWriter writer;
9281 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009282 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009283 PyObject *errorHandler = NULL;
9284 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009285 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009286 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009287
Guido van Rossumd57fd912000-03-10 22:53:23 +00009288 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009289 PyErr_BadArgument();
9290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 if (PyUnicode_READY(input) == -1)
9294 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009295 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296 kind = PyUnicode_KIND(input);
9297 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009299 if (size == 0)
9300 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009302 /* allocate enough for a simple 1:1 translation without
9303 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009304 _PyUnicodeWriter_Init(&writer);
9305 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009306 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009307
Victor Stinner872b2912014-04-05 14:27:07 +02009308 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9309
Victor Stinner33798672016-03-01 21:59:58 +01009310 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009311 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009312 if (PyUnicode_IS_ASCII(input)) {
9313 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9314 if (res < 0) {
9315 _PyUnicodeWriter_Dealloc(&writer);
9316 return NULL;
9317 }
9318 if (res == 1)
9319 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009320 }
Victor Stinner33798672016-03-01 21:59:58 +01009321 else {
9322 i = 0;
9323 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009326 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009327 int translate;
9328 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9329 Py_ssize_t newpos;
9330 /* startpos for collecting untranslatable chars */
9331 Py_ssize_t collstart;
9332 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009333 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009334
Victor Stinner1194ea02014-04-04 19:37:40 +02009335 ch = PyUnicode_READ(kind, data, i);
9336 translate = charmaptranslate_output(ch, mapping, &writer);
9337 if (translate < 0)
9338 goto onError;
9339
9340 if (translate != 0) {
9341 /* it worked => adjust input pointer */
9342 ++i;
9343 continue;
9344 }
9345
9346 /* untranslatable character */
9347 collstart = i;
9348 collend = i+1;
9349
9350 /* find all untranslatable characters */
9351 while (collend < size) {
9352 PyObject *x;
9353 ch = PyUnicode_READ(kind, data, collend);
9354 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009355 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009356 Py_XDECREF(x);
9357 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009358 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009359 ++collend;
9360 }
9361
9362 if (ignore) {
9363 i = collend;
9364 }
9365 else {
9366 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9367 reason, input, &exc,
9368 collstart, collend, &newpos);
9369 if (repunicode == NULL)
9370 goto onError;
9371 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009373 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009374 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009375 Py_DECREF(repunicode);
9376 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009377 }
9378 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009379 Py_XDECREF(exc);
9380 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009381 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382
Benjamin Peterson29060642009-01-31 22:14:21 +00009383 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009384 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009385 Py_XDECREF(exc);
9386 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387 return NULL;
9388}
9389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390/* Deprecated. Use PyUnicode_Translate instead. */
9391PyObject *
9392PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9393 Py_ssize_t size,
9394 PyObject *mapping,
9395 const char *errors)
9396{
Christian Heimes5f520f42012-09-11 14:03:25 +02009397 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009398 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399 if (!unicode)
9400 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009401 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9402 Py_DECREF(unicode);
9403 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404}
9405
Alexander Belopolsky40018472011-02-26 01:02:56 +00009406PyObject *
9407PyUnicode_Translate(PyObject *str,
9408 PyObject *mapping,
9409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009411 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009412 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009413 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009414}
Tim Petersced69f82003-09-16 20:30:58 +00009415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009416PyObject *
9417_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9418{
9419 if (!PyUnicode_Check(unicode)) {
9420 PyErr_BadInternalCall();
9421 return NULL;
9422 }
9423 if (PyUnicode_READY(unicode) == -1)
9424 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009425 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 /* If the string is already ASCII, just return the same string */
9427 Py_INCREF(unicode);
9428 return unicode;
9429 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009430
9431 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9432 PyObject *result = PyUnicode_New(len, 127);
9433 if (result == NULL) {
9434 return NULL;
9435 }
9436
9437 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9438 int kind = PyUnicode_KIND(unicode);
9439 const void *data = PyUnicode_DATA(unicode);
9440 Py_ssize_t i;
9441 for (i = 0; i < len; ++i) {
9442 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9443 if (ch < 127) {
9444 out[i] = ch;
9445 }
9446 else if (Py_UNICODE_ISSPACE(ch)) {
9447 out[i] = ' ';
9448 }
9449 else {
9450 int decimal = Py_UNICODE_TODECIMAL(ch);
9451 if (decimal < 0) {
9452 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009453 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009454 _PyUnicode_LENGTH(result) = i + 1;
9455 break;
9456 }
9457 out[i] = '0' + decimal;
9458 }
9459 }
9460
INADA Naoki16dfca42018-07-14 12:06:43 +09009461 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009462 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463}
9464
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009465PyObject *
9466PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9467 Py_ssize_t length)
9468{
Victor Stinnerf0124502011-11-21 23:12:56 +01009469 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009470 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009471 Py_UCS4 maxchar;
9472 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009473 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009474
Victor Stinner99d7ad02012-02-22 13:37:39 +01009475 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009476 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009477 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009478 if (ch > 127) {
9479 int decimal = Py_UNICODE_TODECIMAL(ch);
9480 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009481 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009482 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009483 }
9484 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009485
9486 /* Copy to a new string */
9487 decimal = PyUnicode_New(length, maxchar);
9488 if (decimal == NULL)
9489 return decimal;
9490 kind = PyUnicode_KIND(decimal);
9491 data = PyUnicode_DATA(decimal);
9492 /* Iterate over code points */
9493 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009494 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009495 if (ch > 127) {
9496 int decimal = Py_UNICODE_TODECIMAL(ch);
9497 if (decimal >= 0)
9498 ch = '0' + decimal;
9499 }
9500 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009502 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009503}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009504/* --- Decimal Encoder ---------------------------------------------------- */
9505
Alexander Belopolsky40018472011-02-26 01:02:56 +00009506int
9507PyUnicode_EncodeDecimal(Py_UNICODE *s,
9508 Py_ssize_t length,
9509 char *output,
9510 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009511{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009512 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009513 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009514 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009515 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009516
9517 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009518 PyErr_BadArgument();
9519 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009520 }
9521
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009522 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009523 if (unicode == NULL)
9524 return -1;
9525
Victor Stinner42bf7752011-11-21 22:52:58 +01009526 kind = PyUnicode_KIND(unicode);
9527 data = PyUnicode_DATA(unicode);
9528
Victor Stinnerb84d7232011-11-22 01:50:07 +01009529 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009530 PyObject *exc;
9531 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009533 Py_ssize_t startpos;
9534
9535 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009536
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009538 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009539 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009540 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009541 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009542 decimal = Py_UNICODE_TODECIMAL(ch);
9543 if (decimal >= 0) {
9544 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009545 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 continue;
9547 }
9548 if (0 < ch && ch < 256) {
9549 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009550 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009551 continue;
9552 }
Victor Stinner6345be92011-11-25 20:09:01 +01009553
Victor Stinner42bf7752011-11-21 22:52:58 +01009554 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009555 exc = NULL;
9556 raise_encode_exception(&exc, "decimal", unicode,
9557 startpos, startpos+1,
9558 "invalid decimal Unicode string");
9559 Py_XDECREF(exc);
9560 Py_DECREF(unicode);
9561 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009562 }
9563 /* 0-terminate the output string */
9564 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009565 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009566 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009567}
9568
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569/* --- Helpers ------------------------------------------------------------ */
9570
/* Helper macro to fix up start/end slice values against a sequence
   length `len`:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` is taken relative to `len`, and clamped
       to 0 if it is still negative afterwards.
   `start` and `end` must be modifiable lvalues; all three arguments are
   evaluated more than once, so they must be free of side effects.
   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement, e.g. inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009587any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009589 Py_ssize_t end,
9590 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009592 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009593 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 Py_ssize_t len1, len2, result;
9595
9596 kind1 = PyUnicode_KIND(s1);
9597 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009598 if (kind1 < kind2)
9599 return -1;
9600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 len1 = PyUnicode_GET_LENGTH(s1);
9602 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009603 ADJUST_INDICES(start, end, len1);
9604 if (end - start < len2)
9605 return -1;
9606
9607 buf1 = PyUnicode_DATA(s1);
9608 buf2 = PyUnicode_DATA(s2);
9609 if (len2 == 1) {
9610 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9611 result = findchar((const char *)buf1 + kind1*start,
9612 kind1, end - start, ch, direction);
9613 if (result == -1)
9614 return -1;
9615 else
9616 return start + result;
9617 }
9618
9619 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009620 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009621 if (!buf2)
9622 return -2;
9623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624
Victor Stinner794d5672011-10-10 03:21:36 +02009625 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009626 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009627 case PyUnicode_1BYTE_KIND:
9628 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9629 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9630 else
9631 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9632 break;
9633 case PyUnicode_2BYTE_KIND:
9634 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9635 break;
9636 case PyUnicode_4BYTE_KIND:
9637 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9638 break;
9639 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009640 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009641 }
9642 }
9643 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009644 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009645 case PyUnicode_1BYTE_KIND:
9646 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9647 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9648 else
9649 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9650 break;
9651 case PyUnicode_2BYTE_KIND:
9652 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9653 break;
9654 case PyUnicode_4BYTE_KIND:
9655 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9656 break;
9657 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009658 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660 }
9661
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009662 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009663 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009664 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665
9666 return result;
9667}
9668
Victor Stinner59423e32018-11-26 13:40:01 +01009669/* _PyUnicode_InsertThousandsGrouping() helper functions */
9670#include "stringlib/localeutil.h"
9671
9672/**
9673 * InsertThousandsGrouping:
9674 * @writer: Unicode writer.
9675 * @n_buffer: Number of characters in @buffer.
9676 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9677 * @d_pos: Start of digits string.
9678 * @n_digits: The number of digits in the string, in which we want
9679 * to put the grouping chars.
9680 * @min_width: The minimum width of the digits in the output string.
9681 * Output will be zero-padded on the left to fill.
9682 * @grouping: see definition in localeconv().
9683 * @thousands_sep: see definition in localeconv().
9684 *
9685 * There are 2 modes: counting and filling. If @writer is NULL,
9686 * we are in counting mode, else filling mode.
9687 * If counting, the required buffer size is returned.
9688 * If filling, we know the buffer will be large enough, so we don't
9689 * need to pass in the buffer size.
9690 * Inserts thousand grouping characters (as defined by grouping and
9691 * thousands_sep) into @writer.
9692 *
9693 * Return value: -1 on error, number of characters otherwise.
9694 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009696_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009697 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009698 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009699 PyObject *digits,
9700 Py_ssize_t d_pos,
9701 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009702 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009703 const char *grouping,
9704 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009705 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706{
Xtreak3f7983a2019-01-07 20:39:14 +05309707 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009708 if (writer) {
9709 assert(digits != NULL);
9710 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009711 }
9712 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009713 assert(digits == NULL);
9714 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009715 }
Victor Stinner59423e32018-11-26 13:40:01 +01009716 assert(0 <= d_pos);
9717 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009718 assert(grouping != NULL);
9719
9720 if (digits != NULL) {
9721 if (PyUnicode_READY(digits) == -1) {
9722 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009723 }
Victor Stinner59423e32018-11-26 13:40:01 +01009724 }
9725 if (PyUnicode_READY(thousands_sep) == -1) {
9726 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009727 }
9728
Victor Stinner59423e32018-11-26 13:40:01 +01009729 Py_ssize_t count = 0;
9730 Py_ssize_t n_zeros;
9731 int loop_broken = 0;
9732 int use_separator = 0; /* First time through, don't append the
9733 separator. They only go between
9734 groups. */
9735 Py_ssize_t buffer_pos;
9736 Py_ssize_t digits_pos;
9737 Py_ssize_t len;
9738 Py_ssize_t n_chars;
9739 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9740 be looked at */
9741 /* A generator that returns all of the grouping widths, until it
9742 returns 0. */
9743 GroupGenerator groupgen;
9744 GroupGenerator_init(&groupgen, grouping);
9745 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9746
9747 /* if digits are not grouped, thousands separator
9748 should be an empty string */
9749 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9750
9751 digits_pos = d_pos + n_digits;
9752 if (writer) {
9753 buffer_pos = writer->pos + n_buffer;
9754 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9755 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 }
Victor Stinner59423e32018-11-26 13:40:01 +01009757 else {
9758 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009759 }
Victor Stinner59423e32018-11-26 13:40:01 +01009760
9761 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009762 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009763 }
Victor Stinner59423e32018-11-26 13:40:01 +01009764
9765 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9766 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9767 n_zeros = Py_MAX(0, len - remaining);
9768 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9769
9770 /* Use n_zero zero's and n_chars chars */
9771
9772 /* Count only, don't do anything. */
9773 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9774
9775 /* Copy into the writer. */
9776 InsertThousandsGrouping_fill(writer, &buffer_pos,
9777 digits, &digits_pos,
9778 n_chars, n_zeros,
9779 use_separator ? thousands_sep : NULL,
9780 thousands_sep_len, maxchar);
9781
9782 /* Use a separator next time. */
9783 use_separator = 1;
9784
9785 remaining -= n_chars;
9786 min_width -= len;
9787
9788 if (remaining <= 0 && min_width <= 0) {
9789 loop_broken = 1;
9790 break;
9791 }
9792 min_width -= thousands_sep_len;
9793 }
9794 if (!loop_broken) {
9795 /* We left the loop without using a break statement. */
9796
9797 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9798 n_zeros = Py_MAX(0, len - remaining);
9799 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9800
9801 /* Use n_zero zero's and n_chars chars */
9802 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9803
9804 /* Copy into the writer. */
9805 InsertThousandsGrouping_fill(writer, &buffer_pos,
9806 digits, &digits_pos,
9807 n_chars, n_zeros,
9808 use_separator ? thousands_sep : NULL,
9809 thousands_sep_len, maxchar);
9810 }
9811 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812}
9813
9814
Alexander Belopolsky40018472011-02-26 01:02:56 +00009815Py_ssize_t
9816PyUnicode_Count(PyObject *str,
9817 PyObject *substr,
9818 Py_ssize_t start,
9819 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009820{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009821 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009822 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009823 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009825
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009826 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009827 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009828
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009829 kind1 = PyUnicode_KIND(str);
9830 kind2 = PyUnicode_KIND(substr);
9831 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009832 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009833
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009834 len1 = PyUnicode_GET_LENGTH(str);
9835 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009837 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009838 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009839
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009840 buf1 = PyUnicode_DATA(str);
9841 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009842 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009843 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009844 if (!buf2)
9845 goto onError;
9846 }
9847
9848 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009850 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009851 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009852 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009853 buf2, len2, PY_SSIZE_T_MAX
9854 );
9855 else
9856 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009857 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009858 buf2, len2, PY_SSIZE_T_MAX
9859 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009860 break;
9861 case PyUnicode_2BYTE_KIND:
9862 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009863 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 buf2, len2, PY_SSIZE_T_MAX
9865 );
9866 break;
9867 case PyUnicode_4BYTE_KIND:
9868 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009869 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 buf2, len2, PY_SSIZE_T_MAX
9871 );
9872 break;
9873 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009874 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009876
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009877 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009878 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009879 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009882 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009883 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9884 if (kind2 != kind1)
9885 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887}
9888
Alexander Belopolsky40018472011-02-26 01:02:56 +00009889Py_ssize_t
9890PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009891 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009892 Py_ssize_t start,
9893 Py_ssize_t end,
9894 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009896 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009897 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009898
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009899 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900}
9901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902Py_ssize_t
9903PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9904 Py_ssize_t start, Py_ssize_t end,
9905 int direction)
9906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009908 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 if (PyUnicode_READY(str) == -1)
9910 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009911 len = PyUnicode_GET_LENGTH(str);
9912 ADJUST_INDICES(start, end, len);
9913 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009914 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009916 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9917 kind, end-start, ch, direction);
9918 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009920 else
9921 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922}
9923
Alexander Belopolsky40018472011-02-26 01:02:56 +00009924static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009925tailmatch(PyObject *self,
9926 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009927 Py_ssize_t start,
9928 Py_ssize_t end,
9929 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 int kind_self;
9932 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009933 const void *data_self;
9934 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 Py_ssize_t offset;
9936 Py_ssize_t i;
9937 Py_ssize_t end_sub;
9938
9939 if (PyUnicode_READY(self) == -1 ||
9940 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009941 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9944 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009946 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009948 if (PyUnicode_GET_LENGTH(substring) == 0)
9949 return 1;
9950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 kind_self = PyUnicode_KIND(self);
9952 data_self = PyUnicode_DATA(self);
9953 kind_sub = PyUnicode_KIND(substring);
9954 data_sub = PyUnicode_DATA(substring);
9955 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9956
9957 if (direction > 0)
9958 offset = end;
9959 else
9960 offset = start;
9961
9962 if (PyUnicode_READ(kind_self, data_self, offset) ==
9963 PyUnicode_READ(kind_sub, data_sub, 0) &&
9964 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9965 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9966 /* If both are of the same kind, memcmp is sufficient */
9967 if (kind_self == kind_sub) {
9968 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009969 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 data_sub,
9971 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009972 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009974 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 else {
9976 /* We do not need to compare 0 and len(substring)-1 because
9977 the if statement above ensured already that they are equal
9978 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 for (i = 1; i < end_sub; ++i) {
9980 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9981 PyUnicode_READ(kind_sub, data_sub, i))
9982 return 0;
9983 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009984 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986 }
9987
9988 return 0;
9989}
9990
Alexander Belopolsky40018472011-02-26 01:02:56 +00009991Py_ssize_t
9992PyUnicode_Tailmatch(PyObject *str,
9993 PyObject *substr,
9994 Py_ssize_t start,
9995 Py_ssize_t end,
9996 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009998 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009999 return -1;
Tim Petersced69f82003-09-16 20:30:58 +000010000
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010001 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002}
10003
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010004static PyObject *
10005ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010007 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010008 const char *data = PyUnicode_DATA(self);
10009 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010010 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +000010011
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010012 res = PyUnicode_New(len, 127);
10013 if (res == NULL)
10014 return NULL;
10015 resdata = PyUnicode_DATA(res);
10016 if (lower)
10017 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010019 _Py_bytes_upper(resdata, data, len);
10020 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021}
10022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010024handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010026 Py_ssize_t j;
10027 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010010028 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010029 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +000010030
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010031 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10032
10033 where ! is a negation and \p{xxx} is a character with property xxx.
10034 */
10035 for (j = i - 1; j >= 0; j--) {
10036 c = PyUnicode_READ(kind, data, j);
10037 if (!_PyUnicode_IsCaseIgnorable(c))
10038 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010039 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010040 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10041 if (final_sigma) {
10042 for (j = i + 1; j < length; j++) {
10043 c = PyUnicode_READ(kind, data, j);
10044 if (!_PyUnicode_IsCaseIgnorable(c))
10045 break;
10046 }
10047 final_sigma = j == length || !_PyUnicode_IsCased(c);
10048 }
10049 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050}
10051
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010052static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010053lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010054 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010056 /* Obscure special case. */
10057 if (c == 0x3A3) {
10058 mapped[0] = handle_capital_sigma(kind, data, length, i);
10059 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010061 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010062}
10063
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010064static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010065do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010066{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010067 Py_ssize_t i, k = 0;
10068 int n_res, j;
10069 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010070
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010071 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010072 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010073 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010074 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010075 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010076 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010077 for (i = 1; i < length; i++) {
10078 c = PyUnicode_READ(kind, data, i);
10079 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10080 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010081 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010082 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010083 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010084 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010085 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010086}
10087
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010088static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010089do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010090 Py_ssize_t i, k = 0;
10091
10092 for (i = 0; i < length; i++) {
10093 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10094 int n_res, j;
10095 if (Py_UNICODE_ISUPPER(c)) {
10096 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10097 }
10098 else if (Py_UNICODE_ISLOWER(c)) {
10099 n_res = _PyUnicode_ToUpperFull(c, mapped);
10100 }
10101 else {
10102 n_res = 1;
10103 mapped[0] = c;
10104 }
10105 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010106 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010107 res[k++] = mapped[j];
10108 }
10109 }
10110 return k;
10111}
10112
10113static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010114do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010115 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010116{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010117 Py_ssize_t i, k = 0;
10118
10119 for (i = 0; i < length; i++) {
10120 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10121 int n_res, j;
10122 if (lower)
10123 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10124 else
10125 n_res = _PyUnicode_ToUpperFull(c, mapped);
10126 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010127 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010128 res[k++] = mapped[j];
10129 }
10130 }
10131 return k;
10132}
10133
10134static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010135do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010136{
10137 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10138}
10139
10140static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010141do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010142{
10143 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10144}
10145
Benjamin Petersone51757f2012-01-12 21:10:29 -050010146static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010147do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010148{
10149 Py_ssize_t i, k = 0;
10150
10151 for (i = 0; i < length; i++) {
10152 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10153 Py_UCS4 mapped[3];
10154 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10155 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010156 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010157 res[k++] = mapped[j];
10158 }
10159 }
10160 return k;
10161}
10162
10163static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010164do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010165{
10166 Py_ssize_t i, k = 0;
10167 int previous_is_cased;
10168
10169 previous_is_cased = 0;
10170 for (i = 0; i < length; i++) {
10171 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10172 Py_UCS4 mapped[3];
10173 int n_res, j;
10174
10175 if (previous_is_cased)
10176 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10177 else
10178 n_res = _PyUnicode_ToTitleFull(c, mapped);
10179
10180 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010181 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010182 res[k++] = mapped[j];
10183 }
10184
10185 previous_is_cased = _PyUnicode_IsCased(c);
10186 }
10187 return k;
10188}
10189
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010190static PyObject *
10191case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010192 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010193{
10194 PyObject *res = NULL;
10195 Py_ssize_t length, newlength = 0;
10196 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010197 const void *data;
10198 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010199 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10200
Benjamin Petersoneea48462012-01-16 14:28:50 -050010201 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010202
10203 kind = PyUnicode_KIND(self);
10204 data = PyUnicode_DATA(self);
10205 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010206 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010207 PyErr_SetString(PyExc_OverflowError, "string is too long");
10208 return NULL;
10209 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010210 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010211 if (tmp == NULL)
10212 return PyErr_NoMemory();
10213 newlength = perform(kind, data, length, tmp, &maxchar);
10214 res = PyUnicode_New(newlength, maxchar);
10215 if (res == NULL)
10216 goto leave;
10217 tmpend = tmp + newlength;
10218 outdata = PyUnicode_DATA(res);
10219 outkind = PyUnicode_KIND(res);
10220 switch (outkind) {
10221 case PyUnicode_1BYTE_KIND:
10222 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10223 break;
10224 case PyUnicode_2BYTE_KIND:
10225 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10226 break;
10227 case PyUnicode_4BYTE_KIND:
10228 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10229 break;
10230 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010231 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010232 }
10233 leave:
10234 PyMem_FREE(tmp);
10235 return res;
10236}
10237
Tim Peters8ce9f162004-08-27 01:49:32 +000010238PyObject *
10239PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010241 PyObject *res;
10242 PyObject *fseq;
10243 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010244 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010246 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010247 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010248 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010249 }
10250
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010251 /* NOTE: the following code can't call back into Python code,
10252 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010253 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010254
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010255 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010256 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010257 res = _PyUnicode_JoinArray(separator, items, seqlen);
10258 Py_DECREF(fseq);
10259 return res;
10260}
10261
10262PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010263_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010264{
10265 PyObject *res = NULL; /* the result */
10266 PyObject *sep = NULL;
10267 Py_ssize_t seplen;
10268 PyObject *item;
10269 Py_ssize_t sz, i, res_offset;
10270 Py_UCS4 maxchar;
10271 Py_UCS4 item_maxchar;
10272 int use_memcpy;
10273 unsigned char *res_data = NULL, *sep_data = NULL;
10274 PyObject *last_obj;
10275 unsigned int kind = 0;
10276
Tim Peters05eba1f2004-08-27 21:32:02 +000010277 /* If empty sequence, return u"". */
10278 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010279 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010280 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010281
Tim Peters05eba1f2004-08-27 21:32:02 +000010282 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010283 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010284 if (seqlen == 1) {
10285 if (PyUnicode_CheckExact(items[0])) {
10286 res = items[0];
10287 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010288 return res;
10289 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010290 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010291 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010292 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010293 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010294 /* Set up sep and seplen */
10295 if (separator == NULL) {
10296 /* fall back to a blank space separator */
10297 sep = PyUnicode_FromOrdinal(' ');
10298 if (!sep)
10299 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010300 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010301 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010302 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010303 else {
10304 if (!PyUnicode_Check(separator)) {
10305 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010306 "separator: expected str instance,"
10307 " %.80s found",
10308 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010309 goto onError;
10310 }
10311 if (PyUnicode_READY(separator))
10312 goto onError;
10313 sep = separator;
10314 seplen = PyUnicode_GET_LENGTH(separator);
10315 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10316 /* inc refcount to keep this code path symmetric with the
10317 above case of a blank separator */
10318 Py_INCREF(sep);
10319 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010320 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010321 }
10322
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010323 /* There are at least two things to join, or else we have a subclass
10324 * of str in the sequence.
10325 * Do a pre-pass to figure out the total amount of space we'll
10326 * need (sz), and see whether all argument are strings.
10327 */
10328 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010329#ifdef Py_DEBUG
10330 use_memcpy = 0;
10331#else
10332 use_memcpy = 1;
10333#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010334 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010335 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010336 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010337 if (!PyUnicode_Check(item)) {
10338 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010339 "sequence item %zd: expected str instance,"
10340 " %.80s found",
10341 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010342 goto onError;
10343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 if (PyUnicode_READY(item) == -1)
10345 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010346 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010348 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010349 if (i != 0) {
10350 add_sz += seplen;
10351 }
10352 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010353 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010354 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010355 goto onError;
10356 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010357 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010358 if (use_memcpy && last_obj != NULL) {
10359 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10360 use_memcpy = 0;
10361 }
10362 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010363 }
Tim Petersced69f82003-09-16 20:30:58 +000010364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010366 if (res == NULL)
10367 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010368
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010369 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010370#ifdef Py_DEBUG
10371 use_memcpy = 0;
10372#else
10373 if (use_memcpy) {
10374 res_data = PyUnicode_1BYTE_DATA(res);
10375 kind = PyUnicode_KIND(res);
10376 if (seplen != 0)
10377 sep_data = PyUnicode_1BYTE_DATA(sep);
10378 }
10379#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010380 if (use_memcpy) {
10381 for (i = 0; i < seqlen; ++i) {
10382 Py_ssize_t itemlen;
10383 item = items[i];
10384
10385 /* Copy item, and maybe the separator. */
10386 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010387 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010388 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010389 kind * seplen);
10390 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010391 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010392
10393 itemlen = PyUnicode_GET_LENGTH(item);
10394 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010395 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010396 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010397 kind * itemlen);
10398 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010399 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010400 }
10401 assert(res_data == PyUnicode_1BYTE_DATA(res)
10402 + kind * PyUnicode_GET_LENGTH(res));
10403 }
10404 else {
10405 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10406 Py_ssize_t itemlen;
10407 item = items[i];
10408
10409 /* Copy item, and maybe the separator. */
10410 if (i && seplen != 0) {
10411 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10412 res_offset += seplen;
10413 }
10414
10415 itemlen = PyUnicode_GET_LENGTH(item);
10416 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010417 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010418 res_offset += itemlen;
10419 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010420 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010421 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010422 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010425 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010427
Benjamin Peterson29060642009-01-31 22:14:21 +000010428 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010430 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431 return NULL;
10432}
10433
Victor Stinnerd3f08822012-05-29 12:57:52 +020010434void
10435_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10436 Py_UCS4 fill_char)
10437{
10438 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010439 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010440 assert(PyUnicode_IS_READY(unicode));
10441 assert(unicode_modifiable(unicode));
10442 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10443 assert(start >= 0);
10444 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010445 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010446}
10447
Victor Stinner3fe55312012-01-04 00:33:50 +010010448Py_ssize_t
10449PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10450 Py_UCS4 fill_char)
10451{
10452 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010453
10454 if (!PyUnicode_Check(unicode)) {
10455 PyErr_BadInternalCall();
10456 return -1;
10457 }
10458 if (PyUnicode_READY(unicode) == -1)
10459 return -1;
10460 if (unicode_check_modifiable(unicode))
10461 return -1;
10462
Victor Stinnerd3f08822012-05-29 12:57:52 +020010463 if (start < 0) {
10464 PyErr_SetString(PyExc_IndexError, "string index out of range");
10465 return -1;
10466 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010467 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10468 PyErr_SetString(PyExc_ValueError,
10469 "fill character is bigger than "
10470 "the string maximum character");
10471 return -1;
10472 }
10473
10474 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10475 length = Py_MIN(maxlen, length);
10476 if (length <= 0)
10477 return 0;
10478
Victor Stinnerd3f08822012-05-29 12:57:52 +020010479 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010480 return length;
10481}
10482
Victor Stinner9310abb2011-10-05 00:59:23 +020010483static PyObject *
10484pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010485 Py_ssize_t left,
10486 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 PyObject *u;
10490 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010491 int kind;
10492 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493
10494 if (left < 0)
10495 left = 0;
10496 if (right < 0)
10497 right = 0;
10498
Victor Stinnerc4b49542011-12-11 22:44:26 +010010499 if (left == 0 && right == 0)
10500 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10503 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010504 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10505 return NULL;
10506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010508 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010510 if (!u)
10511 return NULL;
10512
10513 kind = PyUnicode_KIND(u);
10514 data = PyUnicode_DATA(u);
10515 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010516 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010517 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010518 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010519 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010520 assert(_PyUnicode_CheckConsistency(u, 1));
10521 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522}
10523
Alexander Belopolsky40018472011-02-26 01:02:56 +000010524PyObject *
10525PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010529 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010530 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531
Benjamin Petersonead6b532011-12-20 17:23:42 -060010532 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010534 if (PyUnicode_IS_ASCII(string))
10535 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010536 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010537 PyUnicode_GET_LENGTH(string), keepends);
10538 else
10539 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010540 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010541 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 break;
10543 case PyUnicode_2BYTE_KIND:
10544 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010545 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 PyUnicode_GET_LENGTH(string), keepends);
10547 break;
10548 case PyUnicode_4BYTE_KIND:
10549 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010550 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 PyUnicode_GET_LENGTH(string), keepends);
10552 break;
10553 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010554 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010556 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557}
10558
Alexander Belopolsky40018472011-02-26 01:02:56 +000010559static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010560split(PyObject *self,
10561 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010562 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010564 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010565 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 Py_ssize_t len1, len2;
10567 PyObject* out;
10568
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010570 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 if (PyUnicode_READY(self) == -1)
10573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010576 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010578 if (PyUnicode_IS_ASCII(self))
10579 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010580 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010581 PyUnicode_GET_LENGTH(self), maxcount
10582 );
10583 else
10584 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010585 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010586 PyUnicode_GET_LENGTH(self), maxcount
10587 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 case PyUnicode_2BYTE_KIND:
10589 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010590 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 PyUnicode_GET_LENGTH(self), maxcount
10592 );
10593 case PyUnicode_4BYTE_KIND:
10594 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010595 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 PyUnicode_GET_LENGTH(self), maxcount
10597 );
10598 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010599 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 }
10601
10602 if (PyUnicode_READY(substring) == -1)
10603 return NULL;
10604
10605 kind1 = PyUnicode_KIND(self);
10606 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 len1 = PyUnicode_GET_LENGTH(self);
10608 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010609 if (kind1 < kind2 || len1 < len2) {
10610 out = PyList_New(1);
10611 if (out == NULL)
10612 return NULL;
10613 Py_INCREF(self);
10614 PyList_SET_ITEM(out, 0, self);
10615 return out;
10616 }
10617 buf1 = PyUnicode_DATA(self);
10618 buf2 = PyUnicode_DATA(substring);
10619 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010620 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010621 if (!buf2)
10622 return NULL;
10623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010625 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010627 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10628 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010629 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010630 else
10631 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010632 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 break;
10634 case PyUnicode_2BYTE_KIND:
10635 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010636 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 break;
10638 case PyUnicode_4BYTE_KIND:
10639 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010640 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 break;
10642 default:
10643 out = NULL;
10644 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010645 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010646 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010647 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649}
10650
Alexander Belopolsky40018472011-02-26 01:02:56 +000010651static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010652rsplit(PyObject *self,
10653 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010654 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010655{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010656 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010657 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 Py_ssize_t len1, len2;
10659 PyObject* out;
10660
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010661 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010662 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 if (PyUnicode_READY(self) == -1)
10665 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010668 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010670 if (PyUnicode_IS_ASCII(self))
10671 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010672 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010673 PyUnicode_GET_LENGTH(self), maxcount
10674 );
10675 else
10676 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010677 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010678 PyUnicode_GET_LENGTH(self), maxcount
10679 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 case PyUnicode_2BYTE_KIND:
10681 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010682 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 PyUnicode_GET_LENGTH(self), maxcount
10684 );
10685 case PyUnicode_4BYTE_KIND:
10686 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010687 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 PyUnicode_GET_LENGTH(self), maxcount
10689 );
10690 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010691 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 }
10693
10694 if (PyUnicode_READY(substring) == -1)
10695 return NULL;
10696
10697 kind1 = PyUnicode_KIND(self);
10698 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 len1 = PyUnicode_GET_LENGTH(self);
10700 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010701 if (kind1 < kind2 || len1 < len2) {
10702 out = PyList_New(1);
10703 if (out == NULL)
10704 return NULL;
10705 Py_INCREF(self);
10706 PyList_SET_ITEM(out, 0, self);
10707 return out;
10708 }
10709 buf1 = PyUnicode_DATA(self);
10710 buf2 = PyUnicode_DATA(substring);
10711 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010712 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010713 if (!buf2)
10714 return NULL;
10715 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010717 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010719 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10720 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010721 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010722 else
10723 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010724 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 break;
10726 case PyUnicode_2BYTE_KIND:
10727 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010728 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 break;
10730 case PyUnicode_4BYTE_KIND:
10731 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010732 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 break;
10734 default:
10735 out = NULL;
10736 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010737 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010738 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010739 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 return out;
10741}
10742
10743static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010744anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10745 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010747 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010749 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10750 return asciilib_find(buf1, len1, buf2, len2, offset);
10751 else
10752 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 case PyUnicode_2BYTE_KIND:
10754 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10755 case PyUnicode_4BYTE_KIND:
10756 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10757 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010758 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759}
10760
10761static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010762anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10763 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010765 switch (kind) {
10766 case PyUnicode_1BYTE_KIND:
10767 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10768 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10769 else
10770 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10771 case PyUnicode_2BYTE_KIND:
10772 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10773 case PyUnicode_4BYTE_KIND:
10774 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10775 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010776 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010777}
10778
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010779static void
10780replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10781 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10782{
10783 int kind = PyUnicode_KIND(u);
10784 void *data = PyUnicode_DATA(u);
10785 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10786 if (kind == PyUnicode_1BYTE_KIND) {
10787 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10788 (Py_UCS1 *)data + len,
10789 u1, u2, maxcount);
10790 }
10791 else if (kind == PyUnicode_2BYTE_KIND) {
10792 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10793 (Py_UCS2 *)data + len,
10794 u1, u2, maxcount);
10795 }
10796 else {
10797 assert(kind == PyUnicode_4BYTE_KIND);
10798 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10799 (Py_UCS4 *)data + len,
10800 u1, u2, maxcount);
10801 }
10802}
10803
Alexander Belopolsky40018472011-02-26 01:02:56 +000010804static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805replace(PyObject *self, PyObject *str1,
10806 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010809 const char *sbuf = PyUnicode_DATA(self);
10810 const void *buf1 = PyUnicode_DATA(str1);
10811 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 int srelease = 0, release1 = 0, release2 = 0;
10813 int skind = PyUnicode_KIND(self);
10814 int kind1 = PyUnicode_KIND(str1);
10815 int kind2 = PyUnicode_KIND(str2);
10816 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10817 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10818 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010819 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010820 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010822 if (slen < len1)
10823 goto nothing;
10824
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010826 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010827 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010828 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829
Victor Stinner59de0ee2011-10-07 10:01:28 +020010830 if (str1 == str2)
10831 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832
Victor Stinner49a0a212011-10-12 23:46:10 +020010833 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010834 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10835 if (maxchar < maxchar_str1)
10836 /* substring too wide to be present */
10837 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010838 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10839 /* Replacing str1 with str2 may cause a maxchar reduction in the
10840 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010841 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010842 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010845 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010847 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010849 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010850 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010851 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010852
Victor Stinner69ed0f42013-04-09 21:48:24 +020010853 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010854 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010855 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010856 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010857 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010859 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010861
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010862 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10863 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010864 }
10865 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010866 int rkind = skind;
10867 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010868 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 if (kind1 < rkind) {
10871 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010872 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 if (!buf1) goto error;
10874 release1 = 1;
10875 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010876 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010877 if (i < 0)
10878 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879 if (rkind > kind2) {
10880 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010881 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010882 if (!buf2) goto error;
10883 release2 = 1;
10884 }
10885 else if (rkind < kind2) {
10886 /* widen self and buf1 */
10887 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010888 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010889 assert(buf1 != PyUnicode_DATA(str1));
10890 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010891 buf1 = PyUnicode_DATA(str1);
10892 release1 = 0;
10893 }
10894 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895 if (!sbuf) goto error;
10896 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010897 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010898 if (!buf1) goto error;
10899 release1 = 1;
10900 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010901 u = PyUnicode_New(slen, maxchar);
10902 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010904 assert(PyUnicode_KIND(u) == rkind);
10905 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010906
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010907 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010908 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010909 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010911 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010913
10914 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010915 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010916 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010917 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010918 if (i == -1)
10919 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010920 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010922 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010923 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010926 }
10927 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010929 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 int rkind = skind;
10931 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010934 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010935 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 if (!buf1) goto error;
10937 release1 = 1;
10938 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010939 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010940 if (n == 0)
10941 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010943 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010944 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945 if (!buf2) goto error;
10946 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010949 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010951 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 if (!sbuf) goto error;
10953 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010954 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010955 assert(buf1 != PyUnicode_DATA(str1));
10956 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010957 buf1 = PyUnicode_DATA(str1);
10958 release1 = 0;
10959 }
10960 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961 if (!buf1) goto error;
10962 release1 = 1;
10963 }
10964 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10965 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010966 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 PyErr_SetString(PyExc_OverflowError,
10968 "replace string is too long");
10969 goto error;
10970 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010971 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010972 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020010973 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020010974 goto done;
10975 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010976 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 PyErr_SetString(PyExc_OverflowError,
10978 "replace string is too long");
10979 goto error;
10980 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010981 u = PyUnicode_New(new_size, maxchar);
10982 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010983 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010984 assert(PyUnicode_KIND(u) == rkind);
10985 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 ires = i = 0;
10987 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010988 while (n-- > 0) {
10989 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010990 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010991 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010992 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010993 if (j == -1)
10994 break;
10995 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010996 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010997 memcpy(res + rkind * ires,
10998 sbuf + rkind * i,
10999 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011001 }
11002 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011004 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011006 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011012 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011013 memcpy(res + rkind * ires,
11014 sbuf + rkind * i,
11015 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020011016 }
11017 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011018 /* interleave */
11019 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011020 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011022 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011024 if (--n <= 0)
11025 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011026 memcpy(res + rkind * ires,
11027 sbuf + rkind * i,
11028 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 ires++;
11030 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011031 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011032 memcpy(res + rkind * ires,
11033 sbuf + rkind * i,
11034 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011035 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011036 }
11037
11038 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011039 unicode_adjust_maxchar(&u);
11040 if (u == NULL)
11041 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011043
11044 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011045 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11046 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11047 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011049 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011051 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011053 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011054 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011056
Benjamin Peterson29060642009-01-31 22:14:21 +000011057 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011058 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011059 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11060 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11061 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011063 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011065 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011067 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011068 return unicode_result_unchanged(self);
11069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011071 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11072 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11073 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11074 if (srelease)
11075 PyMem_FREE((void *)sbuf);
11076 if (release1)
11077 PyMem_FREE((void *)buf1);
11078 if (release2)
11079 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081}
11082
11083/* --- Unicode Object Methods --------------------------------------------- */
11084
INADA Naoki3ae20562017-01-16 20:41:20 +090011085/*[clinic input]
11086str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087
INADA Naoki3ae20562017-01-16 20:41:20 +090011088Return a version of the string where each word is titlecased.
11089
11090More specifically, words start with uppercased characters and all remaining
11091cased characters have lower case.
11092[clinic start generated code]*/
11093
11094static PyObject *
11095unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011096/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011097{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011098 if (PyUnicode_READY(self) == -1)
11099 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011100 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101}
11102
INADA Naoki3ae20562017-01-16 20:41:20 +090011103/*[clinic input]
11104str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011105
INADA Naoki3ae20562017-01-16 20:41:20 +090011106Return a capitalized version of the string.
11107
11108More specifically, make the first character have upper case and the rest lower
11109case.
11110[clinic start generated code]*/
11111
11112static PyObject *
11113unicode_capitalize_impl(PyObject *self)
11114/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011116 if (PyUnicode_READY(self) == -1)
11117 return NULL;
11118 if (PyUnicode_GET_LENGTH(self) == 0)
11119 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011120 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121}
11122
INADA Naoki3ae20562017-01-16 20:41:20 +090011123/*[clinic input]
11124str.casefold as unicode_casefold
11125
11126Return a version of the string suitable for caseless comparisons.
11127[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011128
11129static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011130unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011131/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011132{
11133 if (PyUnicode_READY(self) == -1)
11134 return NULL;
11135 if (PyUnicode_IS_ASCII(self))
11136 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011137 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011138}
11139
11140
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011141/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011142
11143static int
11144convert_uc(PyObject *obj, void *addr)
11145{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011147
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011148 if (!PyUnicode_Check(obj)) {
11149 PyErr_Format(PyExc_TypeError,
11150 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011151 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011152 return 0;
11153 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011154 if (PyUnicode_READY(obj) < 0)
11155 return 0;
11156 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011157 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011158 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011159 return 0;
11160 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011161 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011162 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011163}
11164
INADA Naoki3ae20562017-01-16 20:41:20 +090011165/*[clinic input]
11166str.center as unicode_center
11167
11168 width: Py_ssize_t
11169 fillchar: Py_UCS4 = ' '
11170 /
11171
11172Return a centered string of length width.
11173
11174Padding is done using the specified fill character (default is a space).
11175[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176
11177static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011178unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11179/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011181 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182
Benjamin Petersonbac79492012-01-14 13:34:47 -050011183 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184 return NULL;
11185
Victor Stinnerc4b49542011-12-11 22:44:26 +010011186 if (PyUnicode_GET_LENGTH(self) >= width)
11187 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188
Victor Stinnerc4b49542011-12-11 22:44:26 +010011189 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190 left = marg / 2 + (marg & width & 1);
11191
Victor Stinner9310abb2011-10-05 00:59:23 +020011192 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193}
11194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195/* This function assumes that str1 and str2 are readied by the caller. */
11196
Marc-André Lemburge5034372000-08-08 08:04:29 +000011197static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011198unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011199{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011200#define COMPARE(TYPE1, TYPE2) \
11201 do { \
11202 TYPE1* p1 = (TYPE1 *)data1; \
11203 TYPE2* p2 = (TYPE2 *)data2; \
11204 TYPE1* end = p1 + len; \
11205 Py_UCS4 c1, c2; \
11206 for (; p1 != end; p1++, p2++) { \
11207 c1 = *p1; \
11208 c2 = *p2; \
11209 if (c1 != c2) \
11210 return (c1 < c2) ? -1 : 1; \
11211 } \
11212 } \
11213 while (0)
11214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011216 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011217 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 kind1 = PyUnicode_KIND(str1);
11220 kind2 = PyUnicode_KIND(str2);
11221 data1 = PyUnicode_DATA(str1);
11222 data2 = PyUnicode_DATA(str2);
11223 len1 = PyUnicode_GET_LENGTH(str1);
11224 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011225 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011226
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011227 switch(kind1) {
11228 case PyUnicode_1BYTE_KIND:
11229 {
11230 switch(kind2) {
11231 case PyUnicode_1BYTE_KIND:
11232 {
11233 int cmp = memcmp(data1, data2, len);
11234 /* normalize result of memcmp() into the range [-1; 1] */
11235 if (cmp < 0)
11236 return -1;
11237 if (cmp > 0)
11238 return 1;
11239 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011240 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011241 case PyUnicode_2BYTE_KIND:
11242 COMPARE(Py_UCS1, Py_UCS2);
11243 break;
11244 case PyUnicode_4BYTE_KIND:
11245 COMPARE(Py_UCS1, Py_UCS4);
11246 break;
11247 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011248 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011249 }
11250 break;
11251 }
11252 case PyUnicode_2BYTE_KIND:
11253 {
11254 switch(kind2) {
11255 case PyUnicode_1BYTE_KIND:
11256 COMPARE(Py_UCS2, Py_UCS1);
11257 break;
11258 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011259 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011260 COMPARE(Py_UCS2, Py_UCS2);
11261 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011262 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011263 case PyUnicode_4BYTE_KIND:
11264 COMPARE(Py_UCS2, Py_UCS4);
11265 break;
11266 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011267 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011268 }
11269 break;
11270 }
11271 case PyUnicode_4BYTE_KIND:
11272 {
11273 switch(kind2) {
11274 case PyUnicode_1BYTE_KIND:
11275 COMPARE(Py_UCS4, Py_UCS1);
11276 break;
11277 case PyUnicode_2BYTE_KIND:
11278 COMPARE(Py_UCS4, Py_UCS2);
11279 break;
11280 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011281 {
11282#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11283 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11284 /* normalize result of wmemcmp() into the range [-1; 1] */
11285 if (cmp < 0)
11286 return -1;
11287 if (cmp > 0)
11288 return 1;
11289#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011290 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011291#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011292 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011293 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011294 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011295 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011296 }
11297 break;
11298 }
11299 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011300 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011301 }
11302
Victor Stinner770e19e2012-10-04 22:59:45 +020011303 if (len1 == len2)
11304 return 0;
11305 if (len1 < len2)
11306 return -1;
11307 else
11308 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011309
11310#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011311}
11312
Benjamin Peterson621b4302016-09-09 13:54:34 -070011313static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011314unicode_compare_eq(PyObject *str1, PyObject *str2)
11315{
11316 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011317 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011318 Py_ssize_t len;
11319 int cmp;
11320
Victor Stinnere5567ad2012-10-23 02:48:49 +020011321 len = PyUnicode_GET_LENGTH(str1);
11322 if (PyUnicode_GET_LENGTH(str2) != len)
11323 return 0;
11324 kind = PyUnicode_KIND(str1);
11325 if (PyUnicode_KIND(str2) != kind)
11326 return 0;
11327 data1 = PyUnicode_DATA(str1);
11328 data2 = PyUnicode_DATA(str2);
11329
11330 cmp = memcmp(data1, data2, len * kind);
11331 return (cmp == 0);
11332}
11333
11334
Alexander Belopolsky40018472011-02-26 01:02:56 +000011335int
11336PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11339 if (PyUnicode_READY(left) == -1 ||
11340 PyUnicode_READY(right) == -1)
11341 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011342
11343 /* a string is equal to itself */
11344 if (left == right)
11345 return 0;
11346
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011347 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011349 PyErr_Format(PyExc_TypeError,
11350 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011351 Py_TYPE(left)->tp_name,
11352 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353 return -1;
11354}
11355
Martin v. Löwis5b222132007-06-10 09:51:05 +000011356int
11357PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11358{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 Py_ssize_t i;
11360 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011362 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363
Victor Stinner910337b2011-10-03 03:20:16 +020011364 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011365 if (!PyUnicode_IS_READY(uni)) {
11366 const wchar_t *ws = _PyUnicode_WSTR(uni);
11367 /* Compare Unicode string and source character set string */
11368 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11369 if (chr != ustr[i])
11370 return (chr < ustr[i]) ? -1 : 1;
11371 }
11372 /* This check keeps Python strings that end in '\0' from comparing equal
11373 to C strings identical up to that point. */
11374 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11375 return 1; /* uni is longer */
11376 if (ustr[i])
11377 return -1; /* str is longer */
11378 return 0;
11379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011381 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011382 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011383 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011384 size_t len, len2 = strlen(str);
11385 int cmp;
11386
11387 len = Py_MIN(len1, len2);
11388 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011389 if (cmp != 0) {
11390 if (cmp < 0)
11391 return -1;
11392 else
11393 return 1;
11394 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011395 if (len1 > len2)
11396 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011397 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011398 return -1; /* str is longer */
11399 return 0;
11400 }
11401 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011402 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011403 /* Compare Unicode string and source character set string */
11404 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011405 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011406 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11407 /* This check keeps Python strings that end in '\0' from comparing equal
11408 to C strings identical up to that point. */
11409 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11410 return 1; /* uni is longer */
11411 if (str[i])
11412 return -1; /* str is longer */
11413 return 0;
11414 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011415}
11416
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011417static int
11418non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11419{
11420 size_t i, len;
11421 const wchar_t *p;
11422 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11423 if (strlen(str) != len)
11424 return 0;
11425 p = _PyUnicode_WSTR(unicode);
11426 assert(p);
11427 for (i = 0; i < len; i++) {
11428 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011429 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011430 return 0;
11431 }
11432 return 1;
11433}
11434
11435int
11436_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11437{
11438 size_t len;
11439 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011440 assert(str);
11441#ifndef NDEBUG
11442 for (const char *p = str; *p; p++) {
11443 assert((unsigned char)*p < 128);
11444 }
11445#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011446 if (PyUnicode_READY(unicode) == -1) {
11447 /* Memory error or bad data */
11448 PyErr_Clear();
11449 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11450 }
11451 if (!PyUnicode_IS_ASCII(unicode))
11452 return 0;
11453 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11454 return strlen(str) == len &&
11455 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11456}
11457
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011458int
11459_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11460{
11461 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011462
11463 assert(_PyUnicode_CHECK(left));
11464 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011465#ifndef NDEBUG
11466 for (const char *p = right->string; *p; p++) {
11467 assert((unsigned char)*p < 128);
11468 }
11469#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011470
11471 if (PyUnicode_READY(left) == -1) {
11472 /* memory error or bad data */
11473 PyErr_Clear();
11474 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11475 }
11476
11477 if (!PyUnicode_IS_ASCII(left))
11478 return 0;
11479
11480 right_uni = _PyUnicode_FromId(right); /* borrowed */
11481 if (right_uni == NULL) {
11482 /* memory error or bad data */
11483 PyErr_Clear();
11484 return _PyUnicode_EqualToASCIIString(left, right->string);
11485 }
11486
11487 if (left == right_uni)
11488 return 1;
11489
11490 if (PyUnicode_CHECK_INTERNED(left))
11491 return 0;
11492
Victor Stinner607b1022020-05-05 18:50:30 +020011493#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011494 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011495 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011496 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11497 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011498#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011499
11500 return unicode_compare_eq(left, right_uni);
11501}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011502
Alexander Belopolsky40018472011-02-26 01:02:56 +000011503PyObject *
11504PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011505{
11506 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011507
Victor Stinnere5567ad2012-10-23 02:48:49 +020011508 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11509 Py_RETURN_NOTIMPLEMENTED;
11510
11511 if (PyUnicode_READY(left) == -1 ||
11512 PyUnicode_READY(right) == -1)
11513 return NULL;
11514
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011515 if (left == right) {
11516 switch (op) {
11517 case Py_EQ:
11518 case Py_LE:
11519 case Py_GE:
11520 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011521 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011522 case Py_NE:
11523 case Py_LT:
11524 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011525 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011526 default:
11527 PyErr_BadArgument();
11528 return NULL;
11529 }
11530 }
11531 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011532 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011533 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011534 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011535 }
11536 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011537 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011538 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011539 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011540}
11541
Alexander Belopolsky40018472011-02-26 01:02:56 +000011542int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011543_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11544{
11545 return unicode_eq(aa, bb);
11546}
11547
11548int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011549PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011550{
Victor Stinner77282cb2013-04-14 19:22:47 +020011551 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011552 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011554 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011555
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011556 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011558 "'in <string>' requires string as left operand, not %.100s",
11559 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011560 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011561 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011562 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011563 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011564 if (ensure_unicode(str) < 0)
11565 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011568 kind2 = PyUnicode_KIND(substr);
11569 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011570 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011572 len2 = PyUnicode_GET_LENGTH(substr);
11573 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011574 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011575 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011576 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011577 if (len2 == 1) {
11578 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11579 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011580 return result;
11581 }
11582 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011583 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011584 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011585 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587
Victor Stinner77282cb2013-04-14 19:22:47 +020011588 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 case PyUnicode_1BYTE_KIND:
11590 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11591 break;
11592 case PyUnicode_2BYTE_KIND:
11593 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11594 break;
11595 case PyUnicode_4BYTE_KIND:
11596 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11597 break;
11598 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011599 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011601
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011602 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011603 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011604 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605
Guido van Rossum403d68b2000-03-13 15:55:09 +000011606 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011607}
11608
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609/* Concat to string or Unicode object giving a new Unicode object. */
11610
Alexander Belopolsky40018472011-02-26 01:02:56 +000011611PyObject *
11612PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011614 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011615 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011616 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011618 if (ensure_unicode(left) < 0)
11619 return NULL;
11620
11621 if (!PyUnicode_Check(right)) {
11622 PyErr_Format(PyExc_TypeError,
11623 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011624 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011625 return NULL;
11626 }
11627 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
11630 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011631 PyObject *empty = unicode_get_empty(); // Borrowed reference
11632 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011633 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011634 }
11635 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011636 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011639 left_len = PyUnicode_GET_LENGTH(left);
11640 right_len = PyUnicode_GET_LENGTH(right);
11641 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011642 PyErr_SetString(PyExc_OverflowError,
11643 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011644 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011645 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011646 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011647
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011648 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11649 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011650 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011653 result = PyUnicode_New(new_len, maxchar);
11654 if (result == NULL)
11655 return NULL;
11656 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11657 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11658 assert(_PyUnicode_CheckConsistency(result, 1));
11659 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660}
11661
Walter Dörwald1ab83302007-05-18 17:15:44 +000011662void
Victor Stinner23e56682011-10-03 03:54:37 +020011663PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011664{
Victor Stinner23e56682011-10-03 03:54:37 +020011665 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011666 Py_UCS4 maxchar, maxchar2;
11667 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011668
11669 if (p_left == NULL) {
11670 if (!PyErr_Occurred())
11671 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011672 return;
11673 }
Victor Stinner23e56682011-10-03 03:54:37 +020011674 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011675 if (right == NULL || left == NULL
11676 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011677 if (!PyErr_Occurred())
11678 PyErr_BadInternalCall();
11679 goto error;
11680 }
11681
Benjamin Petersonbac79492012-01-14 13:34:47 -050011682 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011683 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011684 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011685 goto error;
11686
Victor Stinner488fa492011-12-12 00:01:39 +010011687 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011688 PyObject *empty = unicode_get_empty(); // Borrowed reference
11689 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011690 Py_DECREF(left);
11691 Py_INCREF(right);
11692 *p_left = right;
11693 return;
11694 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011695 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011696 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011697 }
Victor Stinner488fa492011-12-12 00:01:39 +010011698
11699 left_len = PyUnicode_GET_LENGTH(left);
11700 right_len = PyUnicode_GET_LENGTH(right);
11701 if (left_len > PY_SSIZE_T_MAX - right_len) {
11702 PyErr_SetString(PyExc_OverflowError,
11703 "strings are too large to concat");
11704 goto error;
11705 }
11706 new_len = left_len + right_len;
11707
11708 if (unicode_modifiable(left)
11709 && PyUnicode_CheckExact(right)
11710 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011711 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11712 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011713 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011714 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011715 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11716 {
11717 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011718 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011719 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011720
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011721 /* copy 'right' into the newly allocated area of 'left' */
11722 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011723 }
Victor Stinner488fa492011-12-12 00:01:39 +010011724 else {
11725 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11726 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011727 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011728
Victor Stinner488fa492011-12-12 00:01:39 +010011729 /* Concat the two Unicode strings */
11730 res = PyUnicode_New(new_len, maxchar);
11731 if (res == NULL)
11732 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011733 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11734 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011735 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011736 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011737 }
11738 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011739 return;
11740
11741error:
Victor Stinner488fa492011-12-12 00:01:39 +010011742 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011743}
11744
11745void
11746PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11747{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011748 PyUnicode_Append(pleft, right);
11749 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011750}
11751
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011752/*
11753Wraps stringlib_parse_args_finds() and additionally ensures that the
11754first argument is a unicode object.
11755*/
11756
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011757static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011758parse_args_finds_unicode(const char * function_name, PyObject *args,
11759 PyObject **substring,
11760 Py_ssize_t *start, Py_ssize_t *end)
11761{
11762 if(stringlib_parse_args_finds(function_name, args, substring,
11763 start, end)) {
11764 if (ensure_unicode(*substring) < 0)
11765 return 0;
11766 return 1;
11767 }
11768 return 0;
11769}
11770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011771PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011774Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011775string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011776interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777
11778static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011779unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011781 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011782 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011783 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011785 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011786 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011789 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011790 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 kind1 = PyUnicode_KIND(self);
11793 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011794 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011795 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 len1 = PyUnicode_GET_LENGTH(self);
11798 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011800 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011801 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011802
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011803 buf1 = PyUnicode_DATA(self);
11804 buf2 = PyUnicode_DATA(substring);
11805 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011806 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011807 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011808 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011809 }
11810 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 case PyUnicode_1BYTE_KIND:
11812 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011813 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 buf2, len2, PY_SSIZE_T_MAX
11815 );
11816 break;
11817 case PyUnicode_2BYTE_KIND:
11818 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011819 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 buf2, len2, PY_SSIZE_T_MAX
11821 );
11822 break;
11823 case PyUnicode_4BYTE_KIND:
11824 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011825 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 buf2, len2, PY_SSIZE_T_MAX
11827 );
11828 break;
11829 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011830 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 }
11832
11833 result = PyLong_FromSsize_t(iresult);
11834
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011835 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011836 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011837 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839 return result;
11840}
11841
INADA Naoki3ae20562017-01-16 20:41:20 +090011842/*[clinic input]
11843str.encode as unicode_encode
11844
11845 encoding: str(c_default="NULL") = 'utf-8'
11846 The encoding in which to encode the string.
11847 errors: str(c_default="NULL") = 'strict'
11848 The error handling scheme to use for encoding errors.
11849 The default is 'strict' meaning that encoding errors raise a
11850 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11851 'xmlcharrefreplace' as well as any other name registered with
11852 codecs.register_error that can handle UnicodeEncodeErrors.
11853
11854Encode the string using the codec registered for encoding.
11855[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856
11857static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011858unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011859/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011861 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011862}
11863
INADA Naoki3ae20562017-01-16 20:41:20 +090011864/*[clinic input]
11865str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866
INADA Naoki3ae20562017-01-16 20:41:20 +090011867 tabsize: int = 8
11868
11869Return a copy where all tab characters are expanded using spaces.
11870
11871If tabsize is not given, a tab size of 8 characters is assumed.
11872[clinic start generated code]*/
11873
11874static PyObject *
11875unicode_expandtabs_impl(PyObject *self, int tabsize)
11876/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011878 Py_ssize_t i, j, line_pos, src_len, incr;
11879 Py_UCS4 ch;
11880 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011881 const void *src_data;
11882 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011883 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011884 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
Antoine Pitrou22425222011-10-04 19:10:51 +020011886 if (PyUnicode_READY(self) == -1)
11887 return NULL;
11888
Thomas Wouters7e474022000-07-16 12:04:32 +000011889 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011890 src_len = PyUnicode_GET_LENGTH(self);
11891 i = j = line_pos = 0;
11892 kind = PyUnicode_KIND(self);
11893 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011894 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011895 for (; i < src_len; i++) {
11896 ch = PyUnicode_READ(kind, src_data, i);
11897 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011898 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011899 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011900 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011901 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011902 goto overflow;
11903 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011905 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011909 goto overflow;
11910 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011912 if (ch == '\n' || ch == '\r')
11913 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011915 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011916 if (!found)
11917 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011918
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011920 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921 if (!u)
11922 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011923 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924
Antoine Pitroue71d5742011-10-04 15:55:09 +020011925 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926
Antoine Pitroue71d5742011-10-04 15:55:09 +020011927 for (; i < src_len; i++) {
11928 ch = PyUnicode_READ(kind, src_data, i);
11929 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011931 incr = tabsize - (line_pos % tabsize);
11932 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011933 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011934 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011935 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011936 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011937 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011938 line_pos++;
11939 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011940 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011941 if (ch == '\n' || ch == '\r')
11942 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011944 }
11945 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011946 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011947
Antoine Pitroue71d5742011-10-04 15:55:09 +020011948 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011949 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11950 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951}
11952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011953PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955\n\
11956Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011957such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958arguments start and end are interpreted as in slice notation.\n\
11959\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011960Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961
11962static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011965 /* initialize variables to prevent gcc warning */
11966 PyObject *substring = NULL;
11967 Py_ssize_t start = 0;
11968 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011969 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011971 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011974 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011977 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 if (result == -2)
11980 return NULL;
11981
Christian Heimes217cfd12007-12-02 14:31:20 +000011982 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983}
11984
11985static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011986unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011988 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011989 enum PyUnicode_Kind kind;
11990 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011991
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011992 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011993 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011995 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011996 if (PyUnicode_READY(self) == -1) {
11997 return NULL;
11998 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011999 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12000 PyErr_SetString(PyExc_IndexError, "string index out of range");
12001 return NULL;
12002 }
12003 kind = PyUnicode_KIND(self);
12004 data = PyUnicode_DATA(self);
12005 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010012006 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007}
12008
Guido van Rossumc2504932007-09-18 19:42:40 +000012009/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010012010 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000012011static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012012unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080012014 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000012015
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012016#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050012017 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012018#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 if (_PyUnicode_HASH(self) != -1)
12020 return _PyUnicode_HASH(self);
12021 if (PyUnicode_READY(self) == -1)
12022 return -1;
animalizea1d14252019-01-02 20:16:06 +080012023
Christian Heimes985ecdc2013-11-20 11:46:18 +010012024 x = _Py_HashBytes(PyUnicode_DATA(self),
12025 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000012027 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028}
12029
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012030PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012031 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032\n\
oldkaa0735f2018-02-02 16:52:55 +080012033Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012034such that sub is contained within S[start:end]. Optional\n\
12035arguments start and end are interpreted as in slice notation.\n\
12036\n\
12037Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038
12039static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012042 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012043 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012044 PyObject *substring = NULL;
12045 Py_ssize_t start = 0;
12046 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012048 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012051 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012054 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 if (result == -2)
12057 return NULL;
12058
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059 if (result < 0) {
12060 PyErr_SetString(PyExc_ValueError, "substring not found");
12061 return NULL;
12062 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012063
Christian Heimes217cfd12007-12-02 14:31:20 +000012064 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065}
12066
INADA Naoki3ae20562017-01-16 20:41:20 +090012067/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012068str.isascii as unicode_isascii
12069
12070Return True if all characters in the string are ASCII, False otherwise.
12071
12072ASCII characters have code points in the range U+0000-U+007F.
12073Empty string is ASCII too.
12074[clinic start generated code]*/
12075
12076static PyObject *
12077unicode_isascii_impl(PyObject *self)
12078/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12079{
12080 if (PyUnicode_READY(self) == -1) {
12081 return NULL;
12082 }
12083 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12084}
12085
12086/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012087str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
INADA Naoki3ae20562017-01-16 20:41:20 +090012089Return True if the string is a lowercase string, False otherwise.
12090
12091A string is lowercase if all cased characters in the string are lowercase and
12092there is at least one cased character in the string.
12093[clinic start generated code]*/
12094
12095static PyObject *
12096unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012097/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 Py_ssize_t i, length;
12100 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012101 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102 int cased;
12103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 if (PyUnicode_READY(self) == -1)
12105 return NULL;
12106 length = PyUnicode_GET_LENGTH(self);
12107 kind = PyUnicode_KIND(self);
12108 data = PyUnicode_DATA(self);
12109
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 if (length == 1)
12112 return PyBool_FromLong(
12113 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012115 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012117 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012118
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 for (i = 0; i < length; i++) {
12121 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012122
Benjamin Peterson29060642009-01-31 22:14:21 +000012123 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012124 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012125 else if (!cased && Py_UNICODE_ISLOWER(ch))
12126 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012128 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129}
12130
INADA Naoki3ae20562017-01-16 20:41:20 +090012131/*[clinic input]
12132str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133
INADA Naoki3ae20562017-01-16 20:41:20 +090012134Return True if the string is an uppercase string, False otherwise.
12135
12136A string is uppercase if all cased characters in the string are uppercase and
12137there is at least one cased character in the string.
12138[clinic start generated code]*/
12139
12140static PyObject *
12141unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012142/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 Py_ssize_t i, length;
12145 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012146 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147 int cased;
12148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 if (PyUnicode_READY(self) == -1)
12150 return NULL;
12151 length = PyUnicode_GET_LENGTH(self);
12152 kind = PyUnicode_KIND(self);
12153 data = PyUnicode_DATA(self);
12154
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 if (length == 1)
12157 return PyBool_FromLong(
12158 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012160 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012162 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012163
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 for (i = 0; i < length; i++) {
12166 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012167
Benjamin Peterson29060642009-01-31 22:14:21 +000012168 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012169 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012170 else if (!cased && Py_UNICODE_ISUPPER(ch))
12171 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012173 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174}
12175
INADA Naoki3ae20562017-01-16 20:41:20 +090012176/*[clinic input]
12177str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
INADA Naoki3ae20562017-01-16 20:41:20 +090012179Return True if the string is a title-cased string, False otherwise.
12180
12181In a title-cased string, upper- and title-case characters may only
12182follow uncased characters and lowercase characters only cased ones.
12183[clinic start generated code]*/
12184
12185static PyObject *
12186unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012187/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 Py_ssize_t i, length;
12190 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012191 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192 int cased, previous_is_cased;
12193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 if (PyUnicode_READY(self) == -1)
12195 return NULL;
12196 length = PyUnicode_GET_LENGTH(self);
12197 kind = PyUnicode_KIND(self);
12198 data = PyUnicode_DATA(self);
12199
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 if (length == 1) {
12202 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12203 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12204 (Py_UNICODE_ISUPPER(ch) != 0));
12205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012207 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012209 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012210
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211 cased = 0;
12212 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 for (i = 0; i < length; i++) {
12214 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012215
Benjamin Peterson29060642009-01-31 22:14:21 +000012216 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12217 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012218 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012219 previous_is_cased = 1;
12220 cased = 1;
12221 }
12222 else if (Py_UNICODE_ISLOWER(ch)) {
12223 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012224 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012225 previous_is_cased = 1;
12226 cased = 1;
12227 }
12228 else
12229 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012231 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232}
12233
INADA Naoki3ae20562017-01-16 20:41:20 +090012234/*[clinic input]
12235str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236
INADA Naoki3ae20562017-01-16 20:41:20 +090012237Return True if the string is a whitespace string, False otherwise.
12238
12239A string is whitespace if all characters in the string are whitespace and there
12240is at least one character in the string.
12241[clinic start generated code]*/
12242
12243static PyObject *
12244unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012245/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 Py_ssize_t i, length;
12248 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012249 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250
12251 if (PyUnicode_READY(self) == -1)
12252 return NULL;
12253 length = PyUnicode_GET_LENGTH(self);
12254 kind = PyUnicode_KIND(self);
12255 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258 if (length == 1)
12259 return PyBool_FromLong(
12260 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012262 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012264 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 for (i = 0; i < length; i++) {
12267 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012268 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012269 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012271 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272}
12273
INADA Naoki3ae20562017-01-16 20:41:20 +090012274/*[clinic input]
12275str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012276
INADA Naoki3ae20562017-01-16 20:41:20 +090012277Return True if the string is an alphabetic string, False otherwise.
12278
12279A string is alphabetic if all characters in the string are alphabetic and there
12280is at least one character in the string.
12281[clinic start generated code]*/
12282
12283static PyObject *
12284unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012285/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012287 Py_ssize_t i, length;
12288 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012289 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290
12291 if (PyUnicode_READY(self) == -1)
12292 return NULL;
12293 length = PyUnicode_GET_LENGTH(self);
12294 kind = PyUnicode_KIND(self);
12295 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012296
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012297 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 if (length == 1)
12299 return PyBool_FromLong(
12300 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012301
12302 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012304 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 for (i = 0; i < length; i++) {
12307 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012308 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012309 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012310 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012311}
12312
INADA Naoki3ae20562017-01-16 20:41:20 +090012313/*[clinic input]
12314str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012315
INADA Naoki3ae20562017-01-16 20:41:20 +090012316Return True if the string is an alpha-numeric string, False otherwise.
12317
12318A string is alpha-numeric if all characters in the string are alpha-numeric and
12319there is at least one character in the string.
12320[clinic start generated code]*/
12321
12322static PyObject *
12323unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012324/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012327 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 Py_ssize_t len, i;
12329
12330 if (PyUnicode_READY(self) == -1)
12331 return NULL;
12332
12333 kind = PyUnicode_KIND(self);
12334 data = PyUnicode_DATA(self);
12335 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012336
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012337 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 if (len == 1) {
12339 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12340 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12341 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012342
12343 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012345 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 for (i = 0; i < len; i++) {
12348 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012349 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012350 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012351 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012352 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012353}
12354
INADA Naoki3ae20562017-01-16 20:41:20 +090012355/*[clinic input]
12356str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357
INADA Naoki3ae20562017-01-16 20:41:20 +090012358Return True if the string is a decimal string, False otherwise.
12359
12360A string is a decimal string if all characters in the string are decimal and
12361there is at least one character in the string.
12362[clinic start generated code]*/
12363
12364static PyObject *
12365unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012366/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012367{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 Py_ssize_t i, length;
12369 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012370 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371
12372 if (PyUnicode_READY(self) == -1)
12373 return NULL;
12374 length = PyUnicode_GET_LENGTH(self);
12375 kind = PyUnicode_KIND(self);
12376 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379 if (length == 1)
12380 return PyBool_FromLong(
12381 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012383 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012385 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 for (i = 0; i < length; i++) {
12388 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012389 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012391 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392}
12393
INADA Naoki3ae20562017-01-16 20:41:20 +090012394/*[clinic input]
12395str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396
INADA Naoki3ae20562017-01-16 20:41:20 +090012397Return True if the string is a digit string, False otherwise.
12398
12399A string is a digit string if all characters in the string are digits and there
12400is at least one character in the string.
12401[clinic start generated code]*/
12402
12403static PyObject *
12404unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012405/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407 Py_ssize_t i, length;
12408 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012409 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410
12411 if (PyUnicode_READY(self) == -1)
12412 return NULL;
12413 length = PyUnicode_GET_LENGTH(self);
12414 kind = PyUnicode_KIND(self);
12415 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 if (length == 1) {
12419 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12420 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012423 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012425 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 for (i = 0; i < length; i++) {
12428 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012429 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012430 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012431 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432}
12433
INADA Naoki3ae20562017-01-16 20:41:20 +090012434/*[clinic input]
12435str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436
INADA Naoki3ae20562017-01-16 20:41:20 +090012437Return True if the string is a numeric string, False otherwise.
12438
12439A string is numeric if all characters in the string are numeric and there is at
12440least one character in the string.
12441[clinic start generated code]*/
12442
12443static PyObject *
12444unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012445/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012446{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 Py_ssize_t i, length;
12448 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012449 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450
12451 if (PyUnicode_READY(self) == -1)
12452 return NULL;
12453 length = PyUnicode_GET_LENGTH(self);
12454 kind = PyUnicode_KIND(self);
12455 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012456
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 if (length == 1)
12459 return PyBool_FromLong(
12460 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012462 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012464 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 for (i = 0; i < length; i++) {
12467 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012468 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012470 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471}
12472
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012473Py_ssize_t
12474_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012475{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012477 if (PyUnicode_READY(self) == -1)
12478 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012479
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012480 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012481 if (len == 0) {
12482 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012483 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012484 }
12485
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012486 int kind = PyUnicode_KIND(self);
12487 const void *data = PyUnicode_DATA(self);
12488 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012489 /* PEP 3131 says that the first character must be in
12490 XID_Start and subsequent characters in XID_Continue,
12491 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012492 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012493 letters, digits, underscore). However, given the current
12494 definition of XID_Start and XID_Continue, it is sufficient
12495 to check just for these, except that _ must be allowed
12496 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012497 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012498 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012499 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012500
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012501 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012502 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012503 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012504 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012505 }
12506 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012507 return i;
12508}
12509
12510int
12511PyUnicode_IsIdentifier(PyObject *self)
12512{
12513 if (PyUnicode_IS_READY(self)) {
12514 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12515 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12516 /* an empty string is not a valid identifier */
12517 return len && i == len;
12518 }
12519 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012520_Py_COMP_DIAG_PUSH
12521_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012522 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012523 if (len == 0) {
12524 /* an empty string is not a valid identifier */
12525 return 0;
12526 }
12527
12528 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012529 Py_UCS4 ch = wstr[i++];
12530#if SIZEOF_WCHAR_T == 2
12531 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12532 && i < len
12533 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12534 {
12535 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12536 i++;
12537 }
12538#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012539 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12540 return 0;
12541 }
12542
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012543 while (i < len) {
12544 ch = wstr[i++];
12545#if SIZEOF_WCHAR_T == 2
12546 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12547 && i < len
12548 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12549 {
12550 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12551 i++;
12552 }
12553#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012554 if (!_PyUnicode_IsXidContinue(ch)) {
12555 return 0;
12556 }
12557 }
12558 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012559_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012560 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012561}
12562
INADA Naoki3ae20562017-01-16 20:41:20 +090012563/*[clinic input]
12564str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012565
INADA Naoki3ae20562017-01-16 20:41:20 +090012566Return True if the string is a valid Python identifier, False otherwise.
12567
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012568Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012569such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012570[clinic start generated code]*/
12571
12572static PyObject *
12573unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012574/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012575{
12576 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12577}
12578
INADA Naoki3ae20562017-01-16 20:41:20 +090012579/*[clinic input]
12580str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012581
INADA Naoki3ae20562017-01-16 20:41:20 +090012582Return True if the string is printable, False otherwise.
12583
12584A string is printable if all of its characters are considered printable in
12585repr() or if it is empty.
12586[clinic start generated code]*/
12587
12588static PyObject *
12589unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012590/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012591{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012592 Py_ssize_t i, length;
12593 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012594 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595
12596 if (PyUnicode_READY(self) == -1)
12597 return NULL;
12598 length = PyUnicode_GET_LENGTH(self);
12599 kind = PyUnicode_KIND(self);
12600 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012601
12602 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 if (length == 1)
12604 return PyBool_FromLong(
12605 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012607 for (i = 0; i < length; i++) {
12608 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012609 Py_RETURN_FALSE;
12610 }
12611 }
12612 Py_RETURN_TRUE;
12613}
12614
INADA Naoki3ae20562017-01-16 20:41:20 +090012615/*[clinic input]
12616str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617
INADA Naoki3ae20562017-01-16 20:41:20 +090012618 iterable: object
12619 /
12620
12621Concatenate any number of strings.
12622
Martin Panter91a88662017-01-24 00:30:06 +000012623The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012624The result is returned as a new string.
12625
12626Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12627[clinic start generated code]*/
12628
12629static PyObject *
12630unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012631/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632{
INADA Naoki3ae20562017-01-16 20:41:20 +090012633 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634}
12635
Martin v. Löwis18e16552006-02-15 17:27:45 +000012636static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012637unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 if (PyUnicode_READY(self) == -1)
12640 return -1;
12641 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642}
12643
INADA Naoki3ae20562017-01-16 20:41:20 +090012644/*[clinic input]
12645str.ljust as unicode_ljust
12646
12647 width: Py_ssize_t
12648 fillchar: Py_UCS4 = ' '
12649 /
12650
12651Return a left-justified string of length width.
12652
12653Padding is done using the specified fill character (default is a space).
12654[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655
12656static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012657unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12658/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012660 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662
Victor Stinnerc4b49542011-12-11 22:44:26 +010012663 if (PyUnicode_GET_LENGTH(self) >= width)
12664 return unicode_result_unchanged(self);
12665
12666 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667}
12668
INADA Naoki3ae20562017-01-16 20:41:20 +090012669/*[clinic input]
12670str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671
INADA Naoki3ae20562017-01-16 20:41:20 +090012672Return a copy of the string converted to lowercase.
12673[clinic start generated code]*/
12674
12675static PyObject *
12676unicode_lower_impl(PyObject *self)
12677/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012679 if (PyUnicode_READY(self) == -1)
12680 return NULL;
12681 if (PyUnicode_IS_ASCII(self))
12682 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012683 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684}
12685
/* Strip modes; each value is also an index into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip mode, used when building error messages. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012694
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012695/* externally visible for str.strip(unicode) */
12696PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012697_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012698{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012699 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 int kind;
12701 Py_ssize_t i, j, len;
12702 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012703 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12706 return NULL;
12707
12708 kind = PyUnicode_KIND(self);
12709 data = PyUnicode_DATA(self);
12710 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012711 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12713 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012714 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012715
Benjamin Peterson14339b62009-01-31 16:36:08 +000012716 i = 0;
12717 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012718 while (i < len) {
12719 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12720 if (!BLOOM(sepmask, ch))
12721 break;
12722 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12723 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012724 i++;
12725 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012726 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012727
Benjamin Peterson14339b62009-01-31 16:36:08 +000012728 j = len;
12729 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012730 j--;
12731 while (j >= i) {
12732 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12733 if (!BLOOM(sepmask, ch))
12734 break;
12735 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12736 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012737 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012738 }
12739
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012741 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012742
Victor Stinner7931d9a2011-11-04 00:22:48 +010012743 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744}
12745
12746PyObject*
12747PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12748{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012749 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012751 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752
Victor Stinnerde636f32011-10-01 03:55:54 +020012753 if (PyUnicode_READY(self) == -1)
12754 return NULL;
12755
Victor Stinner684d5fd2012-05-03 02:32:34 +020012756 length = PyUnicode_GET_LENGTH(self);
12757 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012758
Victor Stinner684d5fd2012-05-03 02:32:34 +020012759 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012760 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012761
Victor Stinnerde636f32011-10-01 03:55:54 +020012762 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012763 PyErr_SetString(PyExc_IndexError, "string index out of range");
12764 return NULL;
12765 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012766 if (start >= length || end < start)
12767 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012768
Victor Stinner684d5fd2012-05-03 02:32:34 +020012769 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012770 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012771 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012772 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012773 }
12774 else {
12775 kind = PyUnicode_KIND(self);
12776 data = PyUnicode_1BYTE_DATA(self);
12777 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012778 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012779 length);
12780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012781}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782
12783static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012784do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012785{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786 Py_ssize_t len, i, j;
12787
12788 if (PyUnicode_READY(self) == -1)
12789 return NULL;
12790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012792
Victor Stinnercc7af722013-04-09 22:39:24 +020012793 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012794 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012795
12796 i = 0;
12797 if (striptype != RIGHTSTRIP) {
12798 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012799 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012800 if (!_Py_ascii_whitespace[ch])
12801 break;
12802 i++;
12803 }
12804 }
12805
12806 j = len;
12807 if (striptype != LEFTSTRIP) {
12808 j--;
12809 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012810 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012811 if (!_Py_ascii_whitespace[ch])
12812 break;
12813 j--;
12814 }
12815 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012816 }
12817 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012818 else {
12819 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012820 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012821
Victor Stinnercc7af722013-04-09 22:39:24 +020012822 i = 0;
12823 if (striptype != RIGHTSTRIP) {
12824 while (i < len) {
12825 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12826 if (!Py_UNICODE_ISSPACE(ch))
12827 break;
12828 i++;
12829 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012830 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012831
12832 j = len;
12833 if (striptype != LEFTSTRIP) {
12834 j--;
12835 while (j >= i) {
12836 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12837 if (!Py_UNICODE_ISSPACE(ch))
12838 break;
12839 j--;
12840 }
12841 j++;
12842 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012843 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012844
Victor Stinner7931d9a2011-11-04 00:22:48 +010012845 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846}
12847
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012848
12849static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012850do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012851{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012852 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012853 if (PyUnicode_Check(sep))
12854 return _PyUnicode_XStrip(self, striptype, sep);
12855 else {
12856 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012857 "%s arg must be None or str",
12858 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012859 return NULL;
12860 }
12861 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012862
Benjamin Peterson14339b62009-01-31 16:36:08 +000012863 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012864}
12865
12866
INADA Naoki3ae20562017-01-16 20:41:20 +090012867/*[clinic input]
12868str.strip as unicode_strip
12869
12870 chars: object = None
12871 /
12872
Zachary Ware09895c22019-10-09 16:09:00 -050012873Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012874
12875If chars is given and not None, remove characters in chars instead.
12876[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012877
12878static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012879unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012880/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012881{
INADA Naoki3ae20562017-01-16 20:41:20 +090012882 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012883}
12884
12885
INADA Naoki3ae20562017-01-16 20:41:20 +090012886/*[clinic input]
12887str.lstrip as unicode_lstrip
12888
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012889 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012890 /
12891
12892Return a copy of the string with leading whitespace removed.
12893
12894If chars is given and not None, remove characters in chars instead.
12895[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012896
12897static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012898unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012899/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012900{
INADA Naoki3ae20562017-01-16 20:41:20 +090012901 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012902}
12903
12904
INADA Naoki3ae20562017-01-16 20:41:20 +090012905/*[clinic input]
12906str.rstrip as unicode_rstrip
12907
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012908 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012909 /
12910
12911Return a copy of the string with trailing whitespace removed.
12912
12913If chars is given and not None, remove characters in chars instead.
12914[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012915
12916static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012917unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012918/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012919{
INADA Naoki3ae20562017-01-16 20:41:20 +090012920 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012921}
12922
12923
Guido van Rossumd57fd912000-03-10 22:53:23 +000012924static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012925unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012926{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012927 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012929
Serhiy Storchaka05997252013-01-26 12:14:02 +020012930 if (len < 1)
12931 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932
Victor Stinnerc4b49542011-12-11 22:44:26 +010012933 /* no repeat, return original string */
12934 if (len == 1)
12935 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012936
Benjamin Petersonbac79492012-01-14 13:34:47 -050012937 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012938 return NULL;
12939
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012940 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012941 PyErr_SetString(PyExc_OverflowError,
12942 "repeated string is too long");
12943 return NULL;
12944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012945 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012946
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012947 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948 if (!u)
12949 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012950 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012952 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012953 int kind = PyUnicode_KIND(str);
12954 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012955 if (kind == PyUnicode_1BYTE_KIND) {
12956 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012957 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012958 }
12959 else if (kind == PyUnicode_2BYTE_KIND) {
12960 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012961 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012962 ucs2[n] = fill_char;
12963 } else {
12964 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12965 assert(kind == PyUnicode_4BYTE_KIND);
12966 for (n = 0; n < len; ++n)
12967 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 }
12970 else {
12971 /* number of characters copied this far */
12972 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012973 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012974 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012975 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012977 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012978 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012979 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012980 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012982 }
12983
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012984 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012985 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986}
12987
Alexander Belopolsky40018472011-02-26 01:02:56 +000012988PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012989PyUnicode_Replace(PyObject *str,
12990 PyObject *substr,
12991 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012992 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012993{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012994 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12995 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012996 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012997 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012998}
12999
INADA Naoki3ae20562017-01-16 20:41:20 +090013000/*[clinic input]
13001str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000013002
INADA Naoki3ae20562017-01-16 20:41:20 +090013003 old: unicode
13004 new: unicode
13005 count: Py_ssize_t = -1
13006 Maximum number of occurrences to replace.
13007 -1 (the default value) means replace all occurrences.
13008 /
13009
13010Return a copy with all occurrences of substring old replaced by new.
13011
13012If the optional argument count is given, only the first count occurrences are
13013replaced.
13014[clinic start generated code]*/
13015
13016static PyObject *
13017unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13018 Py_ssize_t count)
13019/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020{
Benjamin Peterson22a29702012-01-02 09:00:30 -060013021 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013022 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090013023 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013024}
13025
sweeneydea81849b2020-04-22 17:05:48 -040013026/*[clinic input]
13027str.removeprefix as unicode_removeprefix
13028
13029 prefix: unicode
13030 /
13031
13032Return a str with the given prefix string removed if present.
13033
13034If the string starts with the prefix string, return string[len(prefix):].
13035Otherwise, return a copy of the original string.
13036[clinic start generated code]*/
13037
13038static PyObject *
13039unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13040/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13041{
13042 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13043 if (match == -1) {
13044 return NULL;
13045 }
13046 if (match) {
13047 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13048 PyUnicode_GET_LENGTH(self));
13049 }
13050 return unicode_result_unchanged(self);
13051}
13052
13053/*[clinic input]
13054str.removesuffix as unicode_removesuffix
13055
13056 suffix: unicode
13057 /
13058
13059Return a str with the given suffix string removed if present.
13060
13061If the string ends with the suffix string and that suffix is not empty,
13062return string[:-len(suffix)]. Otherwise, return a copy of the original
13063string.
13064[clinic start generated code]*/
13065
13066static PyObject *
13067unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13068/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13069{
13070 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13071 if (match == -1) {
13072 return NULL;
13073 }
13074 if (match) {
13075 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13076 - PyUnicode_GET_LENGTH(suffix));
13077 }
13078 return unicode_result_unchanged(self);
13079}
13080
Alexander Belopolsky40018472011-02-26 01:02:56 +000013081static PyObject *
13082unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013084 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 Py_ssize_t isize;
13086 Py_ssize_t osize, squote, dquote, i, o;
13087 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013088 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013089 const void *idata;
13090 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013093 return NULL;
13094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013095 isize = PyUnicode_GET_LENGTH(unicode);
13096 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013098 /* Compute length of output, quote characters, and
13099 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013100 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013101 max = 127;
13102 squote = dquote = 0;
13103 ikind = PyUnicode_KIND(unicode);
13104 for (i = 0; i < isize; i++) {
13105 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013106 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013107 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013108 case '\'': squote++; break;
13109 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013110 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013111 incr = 2;
13112 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113 default:
13114 /* Fast-path ASCII */
13115 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013116 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013117 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013118 ;
13119 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013120 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013121 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013122 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013124 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013125 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013126 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013128 if (osize > PY_SSIZE_T_MAX - incr) {
13129 PyErr_SetString(PyExc_OverflowError,
13130 "string is too long to generate repr");
13131 return NULL;
13132 }
13133 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 }
13135
13136 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013137 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013139 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 if (dquote)
13141 /* Both squote and dquote present. Use squote,
13142 and escape them */
13143 osize += squote;
13144 else
13145 quote = '"';
13146 }
Victor Stinner55c08782013-04-14 18:45:39 +020013147 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013148
13149 repr = PyUnicode_New(osize, max);
13150 if (repr == NULL)
13151 return NULL;
13152 okind = PyUnicode_KIND(repr);
13153 odata = PyUnicode_DATA(repr);
13154
13155 PyUnicode_WRITE(okind, odata, 0, quote);
13156 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013157 if (unchanged) {
13158 _PyUnicode_FastCopyCharacters(repr, 1,
13159 unicode, 0,
13160 isize);
13161 }
13162 else {
13163 for (i = 0, o = 1; i < isize; i++) {
13164 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013165
Victor Stinner55c08782013-04-14 18:45:39 +020013166 /* Escape quotes and backslashes */
13167 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013168 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013169 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013170 continue;
13171 }
13172
13173 /* Map special whitespace to '\t', \n', '\r' */
13174 if (ch == '\t') {
13175 PyUnicode_WRITE(okind, odata, o++, '\\');
13176 PyUnicode_WRITE(okind, odata, o++, 't');
13177 }
13178 else if (ch == '\n') {
13179 PyUnicode_WRITE(okind, odata, o++, '\\');
13180 PyUnicode_WRITE(okind, odata, o++, 'n');
13181 }
13182 else if (ch == '\r') {
13183 PyUnicode_WRITE(okind, odata, o++, '\\');
13184 PyUnicode_WRITE(okind, odata, o++, 'r');
13185 }
13186
13187 /* Map non-printable US ASCII to '\xhh' */
13188 else if (ch < ' ' || ch == 0x7F) {
13189 PyUnicode_WRITE(okind, odata, o++, '\\');
13190 PyUnicode_WRITE(okind, odata, o++, 'x');
13191 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13192 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13193 }
13194
13195 /* Copy ASCII characters as-is */
13196 else if (ch < 0x7F) {
13197 PyUnicode_WRITE(okind, odata, o++, ch);
13198 }
13199
13200 /* Non-ASCII characters */
13201 else {
13202 /* Map Unicode whitespace and control characters
13203 (categories Z* and C* except ASCII space)
13204 */
13205 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13206 PyUnicode_WRITE(okind, odata, o++, '\\');
13207 /* Map 8-bit characters to '\xhh' */
13208 if (ch <= 0xff) {
13209 PyUnicode_WRITE(okind, odata, o++, 'x');
13210 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13211 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13212 }
13213 /* Map 16-bit characters to '\uxxxx' */
13214 else if (ch <= 0xffff) {
13215 PyUnicode_WRITE(okind, odata, o++, 'u');
13216 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13217 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13218 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13219 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13220 }
13221 /* Map 21-bit characters to '\U00xxxxxx' */
13222 else {
13223 PyUnicode_WRITE(okind, odata, o++, 'U');
13224 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13225 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13226 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13227 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13228 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13229 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13230 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13231 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13232 }
13233 }
13234 /* Copy characters as-is */
13235 else {
13236 PyUnicode_WRITE(okind, odata, o++, ch);
13237 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013238 }
13239 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013240 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013241 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013242 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013243 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244}
13245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013246PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013247 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013248\n\
13249Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013250such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251arguments start and end are interpreted as in slice notation.\n\
13252\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013253Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013254
13255static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013256unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013258 /* initialize variables to prevent gcc warning */
13259 PyObject *substring = NULL;
13260 Py_ssize_t start = 0;
13261 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013262 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013263
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013264 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013265 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013266
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013267 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013268 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013269
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013270 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013272 if (result == -2)
13273 return NULL;
13274
Christian Heimes217cfd12007-12-02 14:31:20 +000013275 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013276}
13277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013278PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013279 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013281Return the highest index in S where substring sub is found,\n\
13282such that sub is contained within S[start:end]. Optional\n\
13283arguments start and end are interpreted as in slice notation.\n\
13284\n\
13285Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013286
13287static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013288unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013290 /* initialize variables to prevent gcc warning */
13291 PyObject *substring = NULL;
13292 Py_ssize_t start = 0;
13293 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013294 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013296 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013297 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013299 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013300 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013302 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013304 if (result == -2)
13305 return NULL;
13306
Guido van Rossumd57fd912000-03-10 22:53:23 +000013307 if (result < 0) {
13308 PyErr_SetString(PyExc_ValueError, "substring not found");
13309 return NULL;
13310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013311
Christian Heimes217cfd12007-12-02 14:31:20 +000013312 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313}
13314
INADA Naoki3ae20562017-01-16 20:41:20 +090013315/*[clinic input]
13316str.rjust as unicode_rjust
13317
13318 width: Py_ssize_t
13319 fillchar: Py_UCS4 = ' '
13320 /
13321
13322Return a right-justified string of length width.
13323
13324Padding is done using the specified fill character (default is a space).
13325[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013326
13327static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013328unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13329/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013330{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013331 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013332 return NULL;
13333
Victor Stinnerc4b49542011-12-11 22:44:26 +010013334 if (PyUnicode_GET_LENGTH(self) >= width)
13335 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013336
Victor Stinnerc4b49542011-12-11 22:44:26 +010013337 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013338}
13339
Alexander Belopolsky40018472011-02-26 01:02:56 +000013340PyObject *
13341PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013342{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013343 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013344 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013346 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013347}
13348
INADA Naoki3ae20562017-01-16 20:41:20 +090013349/*[clinic input]
13350str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013351
INADA Naoki3ae20562017-01-16 20:41:20 +090013352 sep: object = None
13353 The delimiter according which to split the string.
13354 None (the default value) means split according to any whitespace,
13355 and discard empty strings from the result.
13356 maxsplit: Py_ssize_t = -1
13357 Maximum number of splits to do.
13358 -1 (the default value) means no limit.
13359
13360Return a list of the words in the string, using sep as the delimiter string.
13361[clinic start generated code]*/
13362
13363static PyObject *
13364unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13365/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013366{
INADA Naoki3ae20562017-01-16 20:41:20 +090013367 if (sep == Py_None)
13368 return split(self, NULL, maxsplit);
13369 if (PyUnicode_Check(sep))
13370 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013371
Victor Stinner998b8062018-09-12 00:23:25 +020013372 PyErr_Format(PyExc_TypeError,
13373 "must be str or None, not %.100s",
13374 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013376}
13377
Thomas Wouters477c8d52006-05-27 19:21:47 +000013378PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013379PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013380{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013381 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013382 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013383 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013384 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013385
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013386 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013387 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013388
Victor Stinner14f8f022011-10-05 20:58:25 +020013389 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013390 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013391 len1 = PyUnicode_GET_LENGTH(str_obj);
13392 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013393 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013394 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013395 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013396 }
13397 buf1 = PyUnicode_DATA(str_obj);
13398 buf2 = PyUnicode_DATA(sep_obj);
13399 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013400 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013401 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013402 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013404
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013405 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013406 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013407 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13408 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13409 else
13410 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013411 break;
13412 case PyUnicode_2BYTE_KIND:
13413 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13414 break;
13415 case PyUnicode_4BYTE_KIND:
13416 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13417 break;
13418 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013419 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013420 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013421
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013422 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013423 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013424 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013425
13426 return out;
13427}
13428
13429
13430PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013431PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013432{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013433 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013434 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013435 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013436 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013437
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013438 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013439 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013440
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013441 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013442 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013443 len1 = PyUnicode_GET_LENGTH(str_obj);
13444 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013445 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013446 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013447 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013448 }
13449 buf1 = PyUnicode_DATA(str_obj);
13450 buf2 = PyUnicode_DATA(sep_obj);
13451 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013452 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013453 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013454 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013455 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013456
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013457 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013458 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013459 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13460 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13461 else
13462 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013463 break;
13464 case PyUnicode_2BYTE_KIND:
13465 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13466 break;
13467 case PyUnicode_4BYTE_KIND:
13468 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13469 break;
13470 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013471 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013472 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013473
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013474 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013475 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013476 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013477
13478 return out;
13479}
13480
INADA Naoki3ae20562017-01-16 20:41:20 +090013481/*[clinic input]
13482str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013483
INADA Naoki3ae20562017-01-16 20:41:20 +090013484 sep: object
13485 /
13486
13487Partition the string into three parts using the given separator.
13488
13489This will search for the separator in the string. If the separator is found,
13490returns a 3-tuple containing the part before the separator, the separator
13491itself, and the part after it.
13492
13493If the separator is not found, returns a 3-tuple containing the original string
13494and two empty strings.
13495[clinic start generated code]*/
13496
13497static PyObject *
13498unicode_partition(PyObject *self, PyObject *sep)
13499/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013500{
INADA Naoki3ae20562017-01-16 20:41:20 +090013501 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013502}
13503
INADA Naoki3ae20562017-01-16 20:41:20 +090013504/*[clinic input]
13505str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013506
INADA Naoki3ae20562017-01-16 20:41:20 +090013507Partition the string into three parts using the given separator.
13508
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013509This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013510the separator is found, returns a 3-tuple containing the part before the
13511separator, the separator itself, and the part after it.
13512
13513If the separator is not found, returns a 3-tuple containing two empty strings
13514and the original string.
13515[clinic start generated code]*/
13516
13517static PyObject *
13518unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013519/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013520{
INADA Naoki3ae20562017-01-16 20:41:20 +090013521 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013522}
13523
Alexander Belopolsky40018472011-02-26 01:02:56 +000013524PyObject *
13525PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013526{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013527 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013528 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013529
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013530 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013531}
13532
INADA Naoki3ae20562017-01-16 20:41:20 +090013533/*[clinic input]
13534str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013535
INADA Naoki3ae20562017-01-16 20:41:20 +090013536Return a list of the words in the string, using sep as the delimiter string.
13537
13538Splits are done starting at the end of the string and working to the front.
13539[clinic start generated code]*/
13540
13541static PyObject *
13542unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13543/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013544{
INADA Naoki3ae20562017-01-16 20:41:20 +090013545 if (sep == Py_None)
13546 return rsplit(self, NULL, maxsplit);
13547 if (PyUnicode_Check(sep))
13548 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013549
Victor Stinner998b8062018-09-12 00:23:25 +020013550 PyErr_Format(PyExc_TypeError,
13551 "must be str or None, not %.100s",
13552 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013553 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013554}
13555
INADA Naoki3ae20562017-01-16 20:41:20 +090013556/*[clinic input]
13557str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013558
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013559 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013560
13561Return a list of the lines in the string, breaking at line boundaries.
13562
13563Line breaks are not included in the resulting list unless keepends is given and
13564true.
13565[clinic start generated code]*/
13566
13567static PyObject *
13568unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013569/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013570{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013571 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013572}
13573
13574static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013575PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013576{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013577 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013578}
13579
INADA Naoki3ae20562017-01-16 20:41:20 +090013580/*[clinic input]
13581str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013582
INADA Naoki3ae20562017-01-16 20:41:20 +090013583Convert uppercase characters to lowercase and lowercase characters to uppercase.
13584[clinic start generated code]*/
13585
13586static PyObject *
13587unicode_swapcase_impl(PyObject *self)
13588/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013589{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013590 if (PyUnicode_READY(self) == -1)
13591 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013592 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013593}
13594
Larry Hastings61272b72014-01-07 12:41:53 -080013595/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013596
Larry Hastings31826802013-10-19 00:09:25 -070013597@staticmethod
13598str.maketrans as unicode_maketrans
13599
13600 x: object
13601
13602 y: unicode=NULL
13603
13604 z: unicode=NULL
13605
13606 /
13607
13608Return a translation table usable for str.translate().
13609
13610If there is only one argument, it must be a dictionary mapping Unicode
13611ordinals (integers) or characters to Unicode ordinals, strings or None.
13612Character keys will be then converted to ordinals.
13613If there are two arguments, they must be strings of equal length, and
13614in the resulting dictionary, each character in x will be mapped to the
13615character at the same position in y. If there is a third argument, it
13616must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013617[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013618
Larry Hastings31826802013-10-19 00:09:25 -070013619static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013620unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013621/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013622{
Georg Brandlceee0772007-11-27 23:48:05 +000013623 PyObject *new = NULL, *key, *value;
13624 Py_ssize_t i = 0;
13625 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013626
Georg Brandlceee0772007-11-27 23:48:05 +000013627 new = PyDict_New();
13628 if (!new)
13629 return NULL;
13630 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013632 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013633
Georg Brandlceee0772007-11-27 23:48:05 +000013634 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013635 if (!PyUnicode_Check(x)) {
13636 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13637 "be a string if there is a second argument");
13638 goto err;
13639 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013640 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013641 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13642 "arguments must have equal length");
13643 goto err;
13644 }
13645 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013646 x_kind = PyUnicode_KIND(x);
13647 y_kind = PyUnicode_KIND(y);
13648 x_data = PyUnicode_DATA(x);
13649 y_data = PyUnicode_DATA(y);
13650 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13651 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013652 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013653 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013654 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013655 if (!value) {
13656 Py_DECREF(key);
13657 goto err;
13658 }
Georg Brandlceee0772007-11-27 23:48:05 +000013659 res = PyDict_SetItem(new, key, value);
13660 Py_DECREF(key);
13661 Py_DECREF(value);
13662 if (res < 0)
13663 goto err;
13664 }
13665 /* create entries for deleting chars in z */
13666 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013667 z_kind = PyUnicode_KIND(z);
13668 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013669 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013670 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013671 if (!key)
13672 goto err;
13673 res = PyDict_SetItem(new, key, Py_None);
13674 Py_DECREF(key);
13675 if (res < 0)
13676 goto err;
13677 }
13678 }
13679 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013680 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013681 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013682
Georg Brandlceee0772007-11-27 23:48:05 +000013683 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013684 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013685 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13686 "to maketrans it must be a dict");
13687 goto err;
13688 }
13689 /* copy entries into the new dict, converting string keys to int keys */
13690 while (PyDict_Next(x, &i, &key, &value)) {
13691 if (PyUnicode_Check(key)) {
13692 /* convert string keys to integer keys */
13693 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013694 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013695 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13696 "table must be of length 1");
13697 goto err;
13698 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013699 kind = PyUnicode_KIND(key);
13700 data = PyUnicode_DATA(key);
13701 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013702 if (!newkey)
13703 goto err;
13704 res = PyDict_SetItem(new, newkey, value);
13705 Py_DECREF(newkey);
13706 if (res < 0)
13707 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013708 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013709 /* just keep integer keys */
13710 if (PyDict_SetItem(new, key, value) < 0)
13711 goto err;
13712 } else {
13713 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13714 "be strings or integers");
13715 goto err;
13716 }
13717 }
13718 }
13719 return new;
13720 err:
13721 Py_DECREF(new);
13722 return NULL;
13723}
13724
INADA Naoki3ae20562017-01-16 20:41:20 +090013725/*[clinic input]
13726str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013727
INADA Naoki3ae20562017-01-16 20:41:20 +090013728 table: object
13729 Translation table, which must be a mapping of Unicode ordinals to
13730 Unicode ordinals, strings, or None.
13731 /
13732
13733Replace each character in the string using the given translation table.
13734
13735The table must implement lookup/indexing via __getitem__, for instance a
13736dictionary or list. If this operation raises LookupError, the character is
13737left untouched. Characters mapped to None are deleted.
13738[clinic start generated code]*/
13739
13740static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013741unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013742/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013743{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013744 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013745}
13746
INADA Naoki3ae20562017-01-16 20:41:20 +090013747/*[clinic input]
13748str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013749
INADA Naoki3ae20562017-01-16 20:41:20 +090013750Return a copy of the string converted to uppercase.
13751[clinic start generated code]*/
13752
13753static PyObject *
13754unicode_upper_impl(PyObject *self)
13755/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013756{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013757 if (PyUnicode_READY(self) == -1)
13758 return NULL;
13759 if (PyUnicode_IS_ASCII(self))
13760 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013761 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013762}
13763
INADA Naoki3ae20562017-01-16 20:41:20 +090013764/*[clinic input]
13765str.zfill as unicode_zfill
13766
13767 width: Py_ssize_t
13768 /
13769
13770Pad a numeric string with zeros on the left, to fill a field of the given width.
13771
13772The string is never truncated.
13773[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013774
13775static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013776unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013777/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013778{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013779 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013780 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013781 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013782 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013783 Py_UCS4 chr;
13784
Benjamin Petersonbac79492012-01-14 13:34:47 -050013785 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013786 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013787
Victor Stinnerc4b49542011-12-11 22:44:26 +010013788 if (PyUnicode_GET_LENGTH(self) >= width)
13789 return unicode_result_unchanged(self);
13790
13791 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013792
13793 u = pad(self, fill, 0, '0');
13794
Walter Dörwald068325e2002-04-15 13:36:47 +000013795 if (u == NULL)
13796 return NULL;
13797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013798 kind = PyUnicode_KIND(u);
13799 data = PyUnicode_DATA(u);
13800 chr = PyUnicode_READ(kind, data, fill);
13801
13802 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013803 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013804 PyUnicode_WRITE(kind, data, 0, chr);
13805 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013806 }
13807
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013808 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013809 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013810}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013811
#if 0
/* Compiled out by the surrounding #if 0.  When enabled it would simply
   return the result of PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013820PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013821 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013822\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013823Return True if S starts with the specified prefix, False otherwise.\n\
13824With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013825With optional end, stop comparing S at that position.\n\
13826prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013827
13828static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013829unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013830 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013831{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013832 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013833 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013834 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013835 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013836 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013837
Jesus Ceaac451502011-04-20 17:09:23 +020013838 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013839 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013840 if (PyTuple_Check(subobj)) {
13841 Py_ssize_t i;
13842 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013843 substring = PyTuple_GET_ITEM(subobj, i);
13844 if (!PyUnicode_Check(substring)) {
13845 PyErr_Format(PyExc_TypeError,
13846 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013847 "not %.100s",
13848 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013849 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013850 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013851 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013852 if (result == -1)
13853 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013854 if (result) {
13855 Py_RETURN_TRUE;
13856 }
13857 }
13858 /* nothing matched */
13859 Py_RETURN_FALSE;
13860 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013861 if (!PyUnicode_Check(subobj)) {
13862 PyErr_Format(PyExc_TypeError,
13863 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013864 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013865 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013866 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013867 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013868 if (result == -1)
13869 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013870 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013871}
13872
13873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013874PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013875 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013876\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013877Return True if S ends with the specified suffix, False otherwise.\n\
13878With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013879With optional end, stop comparing S at that position.\n\
13880suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013881
13882static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013883unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013884 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013885{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013886 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013887 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013888 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013889 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013890 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013891
Jesus Ceaac451502011-04-20 17:09:23 +020013892 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013893 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013894 if (PyTuple_Check(subobj)) {
13895 Py_ssize_t i;
13896 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013897 substring = PyTuple_GET_ITEM(subobj, i);
13898 if (!PyUnicode_Check(substring)) {
13899 PyErr_Format(PyExc_TypeError,
13900 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013901 "not %.100s",
13902 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013903 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013904 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013905 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013906 if (result == -1)
13907 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013908 if (result) {
13909 Py_RETURN_TRUE;
13910 }
13911 }
13912 Py_RETURN_FALSE;
13913 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013914 if (!PyUnicode_Check(subobj)) {
13915 PyErr_Format(PyExc_TypeError,
13916 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013917 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013918 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013919 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013920 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013921 if (result == -1)
13922 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013923 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013924}
13925
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013926static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013927_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013928{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013929 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13930 writer->data = PyUnicode_DATA(writer->buffer);
13931
13932 if (!writer->readonly) {
13933 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013934 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013935 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013936 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013937 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13938 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13939 writer->kind = PyUnicode_WCHAR_KIND;
13940 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13941
Victor Stinner8f674cc2013-04-17 23:02:17 +020013942 /* Copy-on-write mode: set buffer size to 0 so
13943 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13944 * next write. */
13945 writer->size = 0;
13946 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013947}
13948
Victor Stinnerd3f08822012-05-29 12:57:52 +020013949void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013950_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013951{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013952 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013953
13954 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013955 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013956
13957 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13958 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13959 writer->kind = PyUnicode_WCHAR_KIND;
13960 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013961}
13962
Inada Naoki770847a2019-06-24 12:30:24 +090013963// Initialize _PyUnicodeWriter with initial buffer
13964static inline void
13965_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13966{
13967 memset(writer, 0, sizeof(*writer));
13968 writer->buffer = buffer;
13969 _PyUnicodeWriter_Update(writer);
13970 writer->min_length = writer->size;
13971}
13972
Victor Stinnerd3f08822012-05-29 12:57:52 +020013973int
13974_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13975 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013976{
13977 Py_ssize_t newlen;
13978 PyObject *newbuffer;
13979
Victor Stinner2740e462016-09-06 16:58:36 -070013980 assert(maxchar <= MAX_UNICODE);
13981
Victor Stinnerca9381e2015-09-22 00:58:32 +020013982 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013983 assert((maxchar > writer->maxchar && length >= 0)
13984 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013985
Victor Stinner202fdca2012-05-07 12:47:02 +020013986 if (length > PY_SSIZE_T_MAX - writer->pos) {
13987 PyErr_NoMemory();
13988 return -1;
13989 }
13990 newlen = writer->pos + length;
13991
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013992 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013993
Victor Stinnerd3f08822012-05-29 12:57:52 +020013994 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013995 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013996 if (writer->overallocate
13997 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13998 /* overallocate to limit the number of realloc() */
13999 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014000 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014001 if (newlen < writer->min_length)
14002 newlen = writer->min_length;
14003
Victor Stinnerd3f08822012-05-29 12:57:52 +020014004 writer->buffer = PyUnicode_New(newlen, maxchar);
14005 if (writer->buffer == NULL)
14006 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014007 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014008 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010014009 if (writer->overallocate
14010 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14011 /* overallocate to limit the number of realloc() */
14012 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014013 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014014 if (newlen < writer->min_length)
14015 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014016
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014017 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020014018 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030014019 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020014020 newbuffer = PyUnicode_New(newlen, maxchar);
14021 if (newbuffer == NULL)
14022 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014023 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14024 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020014025 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014026 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020014027 }
14028 else {
14029 newbuffer = resize_compact(writer->buffer, newlen);
14030 if (newbuffer == NULL)
14031 return -1;
14032 }
14033 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020014034 }
14035 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014036 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014037 newbuffer = PyUnicode_New(writer->size, maxchar);
14038 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014039 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014040 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14041 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014042 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014043 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014044 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014045 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014046
14047#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014048}
14049
Victor Stinnerca9381e2015-09-22 00:58:32 +020014050int
14051_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14052 enum PyUnicode_Kind kind)
14053{
14054 Py_UCS4 maxchar;
14055
14056 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14057 assert(writer->kind < kind);
14058
14059 switch (kind)
14060 {
14061 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14062 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14063 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
14064 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014065 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014066 }
14067
14068 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14069}
14070
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014071static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014072_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014073{
Victor Stinner2740e462016-09-06 16:58:36 -070014074 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014075 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14076 return -1;
14077 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14078 writer->pos++;
14079 return 0;
14080}
14081
14082int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014083_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14084{
14085 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14086}
14087
14088int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014089_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14090{
14091 Py_UCS4 maxchar;
14092 Py_ssize_t len;
14093
14094 if (PyUnicode_READY(str) == -1)
14095 return -1;
14096 len = PyUnicode_GET_LENGTH(str);
14097 if (len == 0)
14098 return 0;
14099 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14100 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014101 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014102 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014103 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014104 Py_INCREF(str);
14105 writer->buffer = str;
14106 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014107 writer->pos += len;
14108 return 0;
14109 }
14110 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14111 return -1;
14112 }
14113 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14114 str, 0, len);
14115 writer->pos += len;
14116 return 0;
14117}
14118
Victor Stinnere215d962012-10-06 23:03:36 +020014119int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014120_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14121 Py_ssize_t start, Py_ssize_t end)
14122{
14123 Py_UCS4 maxchar;
14124 Py_ssize_t len;
14125
14126 if (PyUnicode_READY(str) == -1)
14127 return -1;
14128
14129 assert(0 <= start);
14130 assert(end <= PyUnicode_GET_LENGTH(str));
14131 assert(start <= end);
14132
14133 if (end == 0)
14134 return 0;
14135
14136 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14137 return _PyUnicodeWriter_WriteStr(writer, str);
14138
14139 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14140 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14141 else
14142 maxchar = writer->maxchar;
14143 len = end - start;
14144
14145 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14146 return -1;
14147
14148 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14149 str, start, len);
14150 writer->pos += len;
14151 return 0;
14152}
14153
14154int
Victor Stinner4a587072013-11-19 12:54:53 +010014155_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14156 const char *ascii, Py_ssize_t len)
14157{
14158 if (len == -1)
14159 len = strlen(ascii);
14160
Andy Lestere6be9b52020-02-11 20:28:35 -060014161 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014162
14163 if (writer->buffer == NULL && !writer->overallocate) {
14164 PyObject *str;
14165
14166 str = _PyUnicode_FromASCII(ascii, len);
14167 if (str == NULL)
14168 return -1;
14169
14170 writer->readonly = 1;
14171 writer->buffer = str;
14172 _PyUnicodeWriter_Update(writer);
14173 writer->pos += len;
14174 return 0;
14175 }
14176
14177 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14178 return -1;
14179
14180 switch (writer->kind)
14181 {
14182 case PyUnicode_1BYTE_KIND:
14183 {
14184 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14185 Py_UCS1 *data = writer->data;
14186
Christian Heimesf051e432016-09-13 20:22:02 +020014187 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014188 break;
14189 }
14190 case PyUnicode_2BYTE_KIND:
14191 {
14192 _PyUnicode_CONVERT_BYTES(
14193 Py_UCS1, Py_UCS2,
14194 ascii, ascii + len,
14195 (Py_UCS2 *)writer->data + writer->pos);
14196 break;
14197 }
14198 case PyUnicode_4BYTE_KIND:
14199 {
14200 _PyUnicode_CONVERT_BYTES(
14201 Py_UCS1, Py_UCS4,
14202 ascii, ascii + len,
14203 (Py_UCS4 *)writer->data + writer->pos);
14204 break;
14205 }
14206 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014207 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014208 }
14209
14210 writer->pos += len;
14211 return 0;
14212}
14213
14214int
14215_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14216 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014217{
14218 Py_UCS4 maxchar;
14219
Andy Lestere6be9b52020-02-11 20:28:35 -060014220 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014221 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14222 return -1;
14223 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14224 writer->pos += len;
14225 return 0;
14226}
14227
Victor Stinnerd3f08822012-05-29 12:57:52 +020014228PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014229_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014230{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014231 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014232
Victor Stinnerd3f08822012-05-29 12:57:52 +020014233 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014234 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014235 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014236 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014237
14238 str = writer->buffer;
14239 writer->buffer = NULL;
14240
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014241 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014242 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14243 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014244 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014245
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014246 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14247 PyObject *str2;
14248 str2 = resize_compact(str, writer->pos);
14249 if (str2 == NULL) {
14250 Py_DECREF(str);
14251 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014252 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014253 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014254 }
14255
Victor Stinner15a0bd32013-07-08 22:29:55 +020014256 assert(_PyUnicode_CheckConsistency(str, 1));
14257 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014258}
14259
Victor Stinnerd3f08822012-05-29 12:57:52 +020014260void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014261_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014262{
14263 Py_CLEAR(writer->buffer);
14264}
14265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014266#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014267
14268PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014269 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014270\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014271Return a formatted version of S, using substitutions from args and kwargs.\n\
14272The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014273
Eric Smith27bbca62010-11-04 17:06:58 +000014274PyDoc_STRVAR(format_map__doc__,
14275 "S.format_map(mapping) -> str\n\
14276\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014277Return a formatted version of S, using substitutions from mapping.\n\
14278The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014279
INADA Naoki3ae20562017-01-16 20:41:20 +090014280/*[clinic input]
14281str.__format__ as unicode___format__
14282
14283 format_spec: unicode
14284 /
14285
14286Return a formatted version of the string as described by format_spec.
14287[clinic start generated code]*/
14288
Eric Smith4a7d76d2008-05-30 18:10:19 +000014289static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014290unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014291/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014292{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014293 _PyUnicodeWriter writer;
14294 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014295
Victor Stinnerd3f08822012-05-29 12:57:52 +020014296 if (PyUnicode_READY(self) == -1)
14297 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014298 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014299 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14300 self, format_spec, 0,
14301 PyUnicode_GET_LENGTH(format_spec));
14302 if (ret == -1) {
14303 _PyUnicodeWriter_Dealloc(&writer);
14304 return NULL;
14305 }
14306 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014307}
14308
INADA Naoki3ae20562017-01-16 20:41:20 +090014309/*[clinic input]
14310str.__sizeof__ as unicode_sizeof
14311
14312Return the size of the string in memory, in bytes.
14313[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014314
14315static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014316unicode_sizeof_impl(PyObject *self)
14317/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014318{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014319 Py_ssize_t size;
14320
14321 /* If it's a compact object, account for base structure +
14322 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014323 if (PyUnicode_IS_COMPACT_ASCII(self))
14324 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14325 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014326 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014327 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014328 else {
14329 /* If it is a two-block object, account for base object, and
14330 for character block if present. */
14331 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014332 if (_PyUnicode_DATA_ANY(self))
14333 size += (PyUnicode_GET_LENGTH(self) + 1) *
14334 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014335 }
14336 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014337 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014338 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14339 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14340 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14341 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014342
14343 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014344}
14345
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014346static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014347unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014348{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014349 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014350 if (!copy)
14351 return NULL;
14352 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014353}
14354
/* Method table of the str type.

   Entries spelled UNICODE_*_METHODDEF are macros defined outside this
   table (presumably generated by Argument Clinic -- confirm); each one is
   expected to expand to a complete initializer including its trailing
   comma, which is why no comma follows them here.  The remaining entries
   are written out by hand.  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format() takes both positional and keyword arguments; the cast goes
       through void(*)(void) before PyCFunction. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is provided for __getnewargs__. */
    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
14413
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014414static PyObject *
14415unicode_mod(PyObject *v, PyObject *w)
14416{
Brian Curtindfc80e32011-08-10 20:28:54 -050014417 if (!PyUnicode_Check(v))
14418 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014419 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014420}
14421
/* Number protocol of str.  Only the remainder slot is filled in, so that
   the % operator reaches unicode_mod(); the slots after nb_remainder are
   left out of the initializer and are therefore zero. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
14428
/* Sequence protocol of str: length, concatenation, repetition, item access
   and containment.  The slice and item-assignment slots are left empty. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
14439
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014440static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014441unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014442{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014443 if (PyUnicode_READY(self) == -1)
14444 return NULL;
14445
Victor Stinnera15e2602020-04-08 02:01:56 +020014446 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014447 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014448 if (i == -1 && PyErr_Occurred())
14449 return NULL;
14450 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014451 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014452 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014453 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014454 Py_ssize_t start, stop, step, slicelength, i;
14455 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014456 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014457 const void *src_data;
14458 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014459 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014460 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014461
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014462 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014463 return NULL;
14464 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014465 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14466 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014467
14468 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014469 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014470 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014471 slicelength == PyUnicode_GET_LENGTH(self)) {
14472 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014473 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014474 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014475 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014476 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014477 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014478 src_kind = PyUnicode_KIND(self);
14479 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014480 if (!PyUnicode_IS_ASCII(self)) {
14481 kind_limit = kind_maxchar_limit(src_kind);
14482 max_char = 0;
14483 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14484 ch = PyUnicode_READ(src_kind, src_data, cur);
14485 if (ch > max_char) {
14486 max_char = ch;
14487 if (max_char >= kind_limit)
14488 break;
14489 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014490 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014491 }
Victor Stinner55c99112011-10-13 01:17:06 +020014492 else
14493 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014494 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014495 if (result == NULL)
14496 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014497 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014498 dest_data = PyUnicode_DATA(result);
14499
14500 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014501 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14502 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014503 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014504 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014505 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014506 } else {
14507 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14508 return NULL;
14509 }
14510}
14511
/* Mapping protocol of str: length and subscripting (indices and slices via
   unicode_subscript()).  Subscript assignment is not provided. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
14517
Guido van Rossumd57fd912000-03-10 22:53:23 +000014518
Guido van Rossumd57fd912000-03-10 22:53:23 +000014519/* Helpers for PyUnicode_Format() */
14520
Victor Stinnera47082312012-10-04 02:19:54 +020014521struct unicode_formatter_t {
14522 PyObject *args;
14523 int args_owned;
14524 Py_ssize_t arglen, argidx;
14525 PyObject *dict;
14526
14527 enum PyUnicode_Kind fmtkind;
14528 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014529 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014530 PyObject *fmtstr;
14531
14532 _PyUnicodeWriter writer;
14533};
14534
/* Description of a single conversion being formatted. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character, e.g. 'd', 'x' or a float code */
    int flags;          /* bitmask tested against F_ALT, F_SIGN, F_BLANK */
    Py_ssize_t width;   /* -1 is tested for by the fast paths (presumably "not given") */
    int prec;           /* negative: formatfloat() falls back to precision 6 */
    int sign;           /* NOTE(review): not read in the visible helpers -- confirm meaning */
};
14542
Guido van Rossumd57fd912000-03-10 22:53:23 +000014543static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014544unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014545{
Victor Stinnera47082312012-10-04 02:19:54 +020014546 Py_ssize_t argidx = ctx->argidx;
14547
14548 if (argidx < ctx->arglen) {
14549 ctx->argidx++;
14550 if (ctx->arglen < 0)
14551 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014552 else
Victor Stinnera47082312012-10-04 02:19:54 +020014553 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014554 }
14555 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014556 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014557 return NULL;
14558}
14559
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014560/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014561
Victor Stinnera47082312012-10-04 02:19:54 +020014562/* Format a float into the writer if the writer is not NULL, or into *p_output
14563 otherwise.
14564
14565 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014566static int
Victor Stinnera47082312012-10-04 02:19:54 +020014567formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14568 PyObject **p_output,
14569 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014570{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014571 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014572 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014573 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014574 int prec;
14575 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014576
Guido van Rossumd57fd912000-03-10 22:53:23 +000014577 x = PyFloat_AsDouble(v);
14578 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014579 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014580
Victor Stinnera47082312012-10-04 02:19:54 +020014581 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014582 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014583 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014584
Victor Stinnera47082312012-10-04 02:19:54 +020014585 if (arg->flags & F_ALT)
14586 dtoa_flags = Py_DTSF_ALT;
14587 else
14588 dtoa_flags = 0;
14589 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014590 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014591 return -1;
14592 len = strlen(p);
14593 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014594 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014595 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014596 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014597 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014598 }
14599 else
14600 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014601 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014602 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014603}
14604
Victor Stinnerd0880d52012-04-27 23:40:13 +020014605/* formatlong() emulates the format codes d, u, o, x and X, and
14606 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14607 * Python's regular ints.
14608 * Return value: a new PyUnicodeObject*, or NULL if error.
14609 * The output string is of the form
14610 * "-"? ("0x" | "0X")? digit+
14611 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14612 * set in flags. The case of hex digits will be correct,
14613 * There will be at least prec digits, zero-filled on the left if
14614 * necessary to get that many.
14615 * val object to be converted
14616 * flags bitmask of format flags; only F_ALT is looked at
14617 * prec minimum number of digits; 0-fill on left if needed
14618 * type a character in [duoxX]; u acts the same as d
14619 *
14620 * CAUTION: o, x and X conversions on regular ints can never
14621 * produce a '-' sign, but can for Python's unbounded ints.
14622 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014623PyObject *
14624_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014625{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014626 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014627 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014628 Py_ssize_t i;
14629 int sign; /* 1 if '-', else 0 */
14630 int len; /* number of characters */
14631 Py_ssize_t llen;
14632 int numdigits; /* len == numnondigits + numdigits */
14633 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014634
Victor Stinnerd0880d52012-04-27 23:40:13 +020014635 /* Avoid exceeding SSIZE_T_MAX */
14636 if (prec > INT_MAX-3) {
14637 PyErr_SetString(PyExc_OverflowError,
14638 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014639 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014640 }
14641
14642 assert(PyLong_Check(val));
14643
14644 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014645 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014646 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014647 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014648 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014649 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014650 /* int and int subclasses should print numerically when a numeric */
14651 /* format code is used (see issue18780) */
14652 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014653 break;
14654 case 'o':
14655 numnondigits = 2;
14656 result = PyNumber_ToBase(val, 8);
14657 break;
14658 case 'x':
14659 case 'X':
14660 numnondigits = 2;
14661 result = PyNumber_ToBase(val, 16);
14662 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014663 }
14664 if (!result)
14665 return NULL;
14666
14667 assert(unicode_modifiable(result));
14668 assert(PyUnicode_IS_READY(result));
14669 assert(PyUnicode_IS_ASCII(result));
14670
14671 /* To modify the string in-place, there can only be one reference. */
14672 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014673 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014674 PyErr_BadInternalCall();
14675 return NULL;
14676 }
14677 buf = PyUnicode_DATA(result);
14678 llen = PyUnicode_GET_LENGTH(result);
14679 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014680 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014681 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014682 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014683 return NULL;
14684 }
14685 len = (int)llen;
14686 sign = buf[0] == '-';
14687 numnondigits += sign;
14688 numdigits = len - numnondigits;
14689 assert(numdigits > 0);
14690
14691 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014692 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014693 (type == 'o' || type == 'x' || type == 'X'))) {
14694 assert(buf[sign] == '0');
14695 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14696 buf[sign+1] == 'o');
14697 numnondigits -= 2;
14698 buf += 2;
14699 len -= 2;
14700 if (sign)
14701 buf[0] = '-';
14702 assert(len == numnondigits + numdigits);
14703 assert(numdigits > 0);
14704 }
14705
14706 /* Fill with leading zeroes to meet minimum width. */
14707 if (prec > numdigits) {
14708 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14709 numnondigits + prec);
14710 char *b1;
14711 if (!r1) {
14712 Py_DECREF(result);
14713 return NULL;
14714 }
14715 b1 = PyBytes_AS_STRING(r1);
14716 for (i = 0; i < numnondigits; ++i)
14717 *b1++ = *buf++;
14718 for (i = 0; i < prec - numdigits; i++)
14719 *b1++ = '0';
14720 for (i = 0; i < numdigits; i++)
14721 *b1++ = *buf++;
14722 *b1 = '\0';
14723 Py_DECREF(result);
14724 result = r1;
14725 buf = PyBytes_AS_STRING(result);
14726 len = numnondigits + prec;
14727 }
14728
14729 /* Fix up case for hex conversions. */
14730 if (type == 'X') {
14731 /* Need to convert all lower case letters to upper case.
14732 and need to convert 0x to 0X (and -0x to -0X). */
14733 for (i = 0; i < len; i++)
14734 if (buf[i] >= 'a' && buf[i] <= 'x')
14735 buf[i] -= 'a'-'A';
14736 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014737 if (!PyUnicode_Check(result)
14738 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014739 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014740 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014741 Py_DECREF(result);
14742 result = unicode;
14743 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014744 else if (len != PyUnicode_GET_LENGTH(result)) {
14745 if (PyUnicode_Resize(&result, len) < 0)
14746 Py_CLEAR(result);
14747 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014748 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014749}
14750
Ethan Furmandf3ed242014-01-05 06:50:30 -080014751/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014752 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014753 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014754 * -1 and raise an exception on error */
14755static int
Victor Stinnera47082312012-10-04 02:19:54 +020014756mainformatlong(PyObject *v,
14757 struct unicode_format_arg_t *arg,
14758 PyObject **p_output,
14759 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014760{
14761 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014762 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014763
14764 if (!PyNumber_Check(v))
14765 goto wrongtype;
14766
Ethan Furman9ab74802014-03-21 06:38:46 -070014767 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014768 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014769 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014770 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014771 }
14772 else {
14773 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014774 }
14775 if (iobj == NULL ) {
14776 if (PyErr_ExceptionMatches(PyExc_TypeError))
14777 goto wrongtype;
14778 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014779 }
14780 assert(PyLong_Check(iobj));
14781 }
14782 else {
14783 iobj = v;
14784 Py_INCREF(iobj);
14785 }
14786
14787 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014788 && arg->width == -1 && arg->prec == -1
14789 && !(arg->flags & (F_SIGN | F_BLANK))
14790 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014791 {
14792 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014793 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014794 int base;
14795
Victor Stinnera47082312012-10-04 02:19:54 +020014796 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014797 {
14798 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014799 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014800 case 'd':
14801 case 'i':
14802 case 'u':
14803 base = 10;
14804 break;
14805 case 'o':
14806 base = 8;
14807 break;
14808 case 'x':
14809 case 'X':
14810 base = 16;
14811 break;
14812 }
14813
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014814 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14815 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014816 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014817 }
14818 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014819 return 1;
14820 }
14821
Ethan Furmanb95b5612015-01-23 20:05:18 -080014822 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014823 Py_DECREF(iobj);
14824 if (res == NULL)
14825 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014826 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014827 return 0;
14828
14829wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014830 switch(type)
14831 {
14832 case 'o':
14833 case 'x':
14834 case 'X':
14835 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014836 "%%%c format: an integer is required, "
14837 "not %.200s",
14838 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014839 break;
14840 default:
14841 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014842 "%%%c format: a number is required, "
14843 "not %.200s",
14844 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014845 break;
14846 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014847 return -1;
14848}
14849
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014850static Py_UCS4
14851formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014852{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014853 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014854 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014855 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014856 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014857 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014858 goto onError;
14859 }
14860 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014861 int overflow;
14862 long x = PyLong_AsLongAndOverflow(v, &overflow);
14863 if (x == -1 && PyErr_Occurred()) {
14864 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014865 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014866 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014867 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014868 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014869
Victor Stinner8faf8212011-12-08 22:14:11 +010014870 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014871 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014872 PyErr_SetString(PyExc_OverflowError,
14873 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014874 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014875 }
14876
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014877 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014878 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014879
Benjamin Peterson29060642009-01-31 22:14:21 +000014880 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014881 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014882 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014883 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014884}
14885
Victor Stinnera47082312012-10-04 02:19:54 +020014886/* Parse options of an argument: flags, width, precision.
14887 Handle also "%(name)" syntax.
14888
14889 Return 0 if the argument has been formatted into arg->str.
14890 Return 1 if the argument has been written into ctx->writer,
14891 Raise an exception and return -1 on error. */
14892static int
14893unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14894 struct unicode_format_arg_t *arg)
14895{
14896#define FORMAT_READ(ctx) \
14897 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14898
14899 PyObject *v;
14900
Victor Stinnera47082312012-10-04 02:19:54 +020014901 if (arg->ch == '(') {
14902 /* Get argument value from a dictionary. Example: "%(name)s". */
14903 Py_ssize_t keystart;
14904 Py_ssize_t keylen;
14905 PyObject *key;
14906 int pcount = 1;
14907
14908 if (ctx->dict == NULL) {
14909 PyErr_SetString(PyExc_TypeError,
14910 "format requires a mapping");
14911 return -1;
14912 }
14913 ++ctx->fmtpos;
14914 --ctx->fmtcnt;
14915 keystart = ctx->fmtpos;
14916 /* Skip over balanced parentheses */
14917 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14918 arg->ch = FORMAT_READ(ctx);
14919 if (arg->ch == ')')
14920 --pcount;
14921 else if (arg->ch == '(')
14922 ++pcount;
14923 ctx->fmtpos++;
14924 }
14925 keylen = ctx->fmtpos - keystart - 1;
14926 if (ctx->fmtcnt < 0 || pcount > 0) {
14927 PyErr_SetString(PyExc_ValueError,
14928 "incomplete format key");
14929 return -1;
14930 }
14931 key = PyUnicode_Substring(ctx->fmtstr,
14932 keystart, keystart + keylen);
14933 if (key == NULL)
14934 return -1;
14935 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014936 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014937 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014938 }
14939 ctx->args = PyObject_GetItem(ctx->dict, key);
14940 Py_DECREF(key);
14941 if (ctx->args == NULL)
14942 return -1;
14943 ctx->args_owned = 1;
14944 ctx->arglen = -1;
14945 ctx->argidx = -2;
14946 }
14947
14948 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014949 while (--ctx->fmtcnt >= 0) {
14950 arg->ch = FORMAT_READ(ctx);
14951 ctx->fmtpos++;
14952 switch (arg->ch) {
14953 case '-': arg->flags |= F_LJUST; continue;
14954 case '+': arg->flags |= F_SIGN; continue;
14955 case ' ': arg->flags |= F_BLANK; continue;
14956 case '#': arg->flags |= F_ALT; continue;
14957 case '0': arg->flags |= F_ZERO; continue;
14958 }
14959 break;
14960 }
14961
14962 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014963 if (arg->ch == '*') {
14964 v = unicode_format_getnextarg(ctx);
14965 if (v == NULL)
14966 return -1;
14967 if (!PyLong_Check(v)) {
14968 PyErr_SetString(PyExc_TypeError,
14969 "* wants int");
14970 return -1;
14971 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014972 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014973 if (arg->width == -1 && PyErr_Occurred())
14974 return -1;
14975 if (arg->width < 0) {
14976 arg->flags |= F_LJUST;
14977 arg->width = -arg->width;
14978 }
14979 if (--ctx->fmtcnt >= 0) {
14980 arg->ch = FORMAT_READ(ctx);
14981 ctx->fmtpos++;
14982 }
14983 }
14984 else if (arg->ch >= '0' && arg->ch <= '9') {
14985 arg->width = arg->ch - '0';
14986 while (--ctx->fmtcnt >= 0) {
14987 arg->ch = FORMAT_READ(ctx);
14988 ctx->fmtpos++;
14989 if (arg->ch < '0' || arg->ch > '9')
14990 break;
14991 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14992 mixing signed and unsigned comparison. Since arg->ch is between
14993 '0' and '9', casting to int is safe. */
14994 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14995 PyErr_SetString(PyExc_ValueError,
14996 "width too big");
14997 return -1;
14998 }
14999 arg->width = arg->width*10 + (arg->ch - '0');
15000 }
15001 }
15002
15003 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020015004 if (arg->ch == '.') {
15005 arg->prec = 0;
15006 if (--ctx->fmtcnt >= 0) {
15007 arg->ch = FORMAT_READ(ctx);
15008 ctx->fmtpos++;
15009 }
15010 if (arg->ch == '*') {
15011 v = unicode_format_getnextarg(ctx);
15012 if (v == NULL)
15013 return -1;
15014 if (!PyLong_Check(v)) {
15015 PyErr_SetString(PyExc_TypeError,
15016 "* wants int");
15017 return -1;
15018 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015019 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015020 if (arg->prec == -1 && PyErr_Occurred())
15021 return -1;
15022 if (arg->prec < 0)
15023 arg->prec = 0;
15024 if (--ctx->fmtcnt >= 0) {
15025 arg->ch = FORMAT_READ(ctx);
15026 ctx->fmtpos++;
15027 }
15028 }
15029 else if (arg->ch >= '0' && arg->ch <= '9') {
15030 arg->prec = arg->ch - '0';
15031 while (--ctx->fmtcnt >= 0) {
15032 arg->ch = FORMAT_READ(ctx);
15033 ctx->fmtpos++;
15034 if (arg->ch < '0' || arg->ch > '9')
15035 break;
15036 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15037 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015038 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015039 return -1;
15040 }
15041 arg->prec = arg->prec*10 + (arg->ch - '0');
15042 }
15043 }
15044 }
15045
15046 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15047 if (ctx->fmtcnt >= 0) {
15048 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15049 if (--ctx->fmtcnt >= 0) {
15050 arg->ch = FORMAT_READ(ctx);
15051 ctx->fmtpos++;
15052 }
15053 }
15054 }
15055 if (ctx->fmtcnt < 0) {
15056 PyErr_SetString(PyExc_ValueError,
15057 "incomplete format");
15058 return -1;
15059 }
15060 return 0;
15061
15062#undef FORMAT_READ
15063}
15064
15065/* Format one argument. Supported conversion specifiers:
15066
15067 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015068 - "i", "d", "u": int or float
15069 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015070 - "e", "E", "f", "F", "g", "G": float
15071 - "c": int or str (1 character)
15072
Victor Stinner8dbd4212012-12-04 09:30:24 +010015073 When possible, the output is written directly into the Unicode writer
15074 (ctx->writer). A string is created when padding is required.
15075
Victor Stinnera47082312012-10-04 02:19:54 +020015076 Return 0 if the argument has been formatted into *p_str,
15077 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015078 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015079static int
15080unicode_format_arg_format(struct unicode_formatter_t *ctx,
15081 struct unicode_format_arg_t *arg,
15082 PyObject **p_str)
15083{
15084 PyObject *v;
15085 _PyUnicodeWriter *writer = &ctx->writer;
15086
15087 if (ctx->fmtcnt == 0)
15088 ctx->writer.overallocate = 0;
15089
Victor Stinnera47082312012-10-04 02:19:54 +020015090 v = unicode_format_getnextarg(ctx);
15091 if (v == NULL)
15092 return -1;
15093
Victor Stinnera47082312012-10-04 02:19:54 +020015094
15095 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015096 case 's':
15097 case 'r':
15098 case 'a':
15099 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15100 /* Fast path */
15101 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15102 return -1;
15103 return 1;
15104 }
15105
15106 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15107 *p_str = v;
15108 Py_INCREF(*p_str);
15109 }
15110 else {
15111 if (arg->ch == 's')
15112 *p_str = PyObject_Str(v);
15113 else if (arg->ch == 'r')
15114 *p_str = PyObject_Repr(v);
15115 else
15116 *p_str = PyObject_ASCII(v);
15117 }
15118 break;
15119
15120 case 'i':
15121 case 'd':
15122 case 'u':
15123 case 'o':
15124 case 'x':
15125 case 'X':
15126 {
15127 int ret = mainformatlong(v, arg, p_str, writer);
15128 if (ret != 0)
15129 return ret;
15130 arg->sign = 1;
15131 break;
15132 }
15133
15134 case 'e':
15135 case 'E':
15136 case 'f':
15137 case 'F':
15138 case 'g':
15139 case 'G':
15140 if (arg->width == -1 && arg->prec == -1
15141 && !(arg->flags & (F_SIGN | F_BLANK)))
15142 {
15143 /* Fast path */
15144 if (formatfloat(v, arg, NULL, writer) == -1)
15145 return -1;
15146 return 1;
15147 }
15148
15149 arg->sign = 1;
15150 if (formatfloat(v, arg, p_str, NULL) == -1)
15151 return -1;
15152 break;
15153
15154 case 'c':
15155 {
15156 Py_UCS4 ch = formatchar(v);
15157 if (ch == (Py_UCS4) -1)
15158 return -1;
15159 if (arg->width == -1 && arg->prec == -1) {
15160 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015161 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015162 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015163 return 1;
15164 }
15165 *p_str = PyUnicode_FromOrdinal(ch);
15166 break;
15167 }
15168
15169 default:
15170 PyErr_Format(PyExc_ValueError,
15171 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015172 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015173 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15174 (int)arg->ch,
15175 ctx->fmtpos - 1);
15176 return -1;
15177 }
15178 if (*p_str == NULL)
15179 return -1;
15180 assert (PyUnicode_Check(*p_str));
15181 return 0;
15182}
15183
15184static int
15185unicode_format_arg_output(struct unicode_formatter_t *ctx,
15186 struct unicode_format_arg_t *arg,
15187 PyObject *str)
15188{
15189 Py_ssize_t len;
15190 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015191 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015192 Py_ssize_t pindex;
15193 Py_UCS4 signchar;
15194 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015195 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015196 Py_ssize_t sublen;
15197 _PyUnicodeWriter *writer = &ctx->writer;
15198 Py_UCS4 fill;
15199
15200 fill = ' ';
15201 if (arg->sign && arg->flags & F_ZERO)
15202 fill = '0';
15203
15204 if (PyUnicode_READY(str) == -1)
15205 return -1;
15206
15207 len = PyUnicode_GET_LENGTH(str);
15208 if ((arg->width == -1 || arg->width <= len)
15209 && (arg->prec == -1 || arg->prec >= len)
15210 && !(arg->flags & (F_SIGN | F_BLANK)))
15211 {
15212 /* Fast path */
15213 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15214 return -1;
15215 return 0;
15216 }
15217
15218 /* Truncate the string for "s", "r" and "a" formats
15219 if the precision is set */
15220 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15221 if (arg->prec >= 0 && len > arg->prec)
15222 len = arg->prec;
15223 }
15224
15225 /* Adjust sign and width */
15226 kind = PyUnicode_KIND(str);
15227 pbuf = PyUnicode_DATA(str);
15228 pindex = 0;
15229 signchar = '\0';
15230 if (arg->sign) {
15231 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15232 if (ch == '-' || ch == '+') {
15233 signchar = ch;
15234 len--;
15235 pindex++;
15236 }
15237 else if (arg->flags & F_SIGN)
15238 signchar = '+';
15239 else if (arg->flags & F_BLANK)
15240 signchar = ' ';
15241 else
15242 arg->sign = 0;
15243 }
15244 if (arg->width < len)
15245 arg->width = len;
15246
15247 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015248 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015249 if (!(arg->flags & F_LJUST)) {
15250 if (arg->sign) {
15251 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015252 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015253 }
15254 else {
15255 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015256 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015257 }
15258 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015259 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15260 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015261 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015262 }
15263
Victor Stinnera47082312012-10-04 02:19:54 +020015264 buflen = arg->width;
15265 if (arg->sign && len == arg->width)
15266 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015267 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015268 return -1;
15269
15270 /* Write the sign if needed */
15271 if (arg->sign) {
15272 if (fill != ' ') {
15273 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15274 writer->pos += 1;
15275 }
15276 if (arg->width > len)
15277 arg->width--;
15278 }
15279
15280 /* Write the numeric prefix for "x", "X" and "o" formats
15281 if the alternate form is used.
15282 For example, write "0x" for the "%#x" format. */
15283 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15284 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15285 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15286 if (fill != ' ') {
15287 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15288 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15289 writer->pos += 2;
15290 pindex += 2;
15291 }
15292 arg->width -= 2;
15293 if (arg->width < 0)
15294 arg->width = 0;
15295 len -= 2;
15296 }
15297
15298 /* Pad left with the fill character if needed */
15299 if (arg->width > len && !(arg->flags & F_LJUST)) {
15300 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015301 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015302 writer->pos += sublen;
15303 arg->width = len;
15304 }
15305
15306 /* If padding with spaces: write sign if needed and/or numeric prefix if
15307 the alternate form is used */
15308 if (fill == ' ') {
15309 if (arg->sign) {
15310 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15311 writer->pos += 1;
15312 }
15313 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15314 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15315 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15316 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15317 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15318 writer->pos += 2;
15319 pindex += 2;
15320 }
15321 }
15322
15323 /* Write characters */
15324 if (len) {
15325 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15326 str, pindex, len);
15327 writer->pos += len;
15328 }
15329
15330 /* Pad right with the fill character if needed */
15331 if (arg->width > len) {
15332 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015333 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015334 writer->pos += sublen;
15335 }
15336 return 0;
15337}
15338
15339/* Helper of PyUnicode_Format(): format one arg.
15340 Return 0 on success, raise an exception and return -1 on error. */
15341static int
15342unicode_format_arg(struct unicode_formatter_t *ctx)
15343{
15344 struct unicode_format_arg_t arg;
15345 PyObject *str;
15346 int ret;
15347
Victor Stinner8dbd4212012-12-04 09:30:24 +010015348 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015349 if (arg.ch == '%') {
15350 ctx->fmtpos++;
15351 ctx->fmtcnt--;
15352 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15353 return -1;
15354 return 0;
15355 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015356 arg.flags = 0;
15357 arg.width = -1;
15358 arg.prec = -1;
15359 arg.sign = 0;
15360 str = NULL;
15361
Victor Stinnera47082312012-10-04 02:19:54 +020015362 ret = unicode_format_arg_parse(ctx, &arg);
15363 if (ret == -1)
15364 return -1;
15365
15366 ret = unicode_format_arg_format(ctx, &arg, &str);
15367 if (ret == -1)
15368 return -1;
15369
15370 if (ret != 1) {
15371 ret = unicode_format_arg_output(ctx, &arg, str);
15372 Py_DECREF(str);
15373 if (ret == -1)
15374 return -1;
15375 }
15376
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015377 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015378 PyErr_SetString(PyExc_TypeError,
15379 "not all arguments converted during string formatting");
15380 return -1;
15381 }
15382 return 0;
15383}
15384
Alexander Belopolsky40018472011-02-26 01:02:56 +000015385PyObject *
15386PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015387{
Victor Stinnera47082312012-10-04 02:19:54 +020015388 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015389
Guido van Rossumd57fd912000-03-10 22:53:23 +000015390 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015391 PyErr_BadInternalCall();
15392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015393 }
Victor Stinnera47082312012-10-04 02:19:54 +020015394
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015395 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015396 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015397
15398 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015399 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15400 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15401 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15402 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015403
Victor Stinner8f674cc2013-04-17 23:02:17 +020015404 _PyUnicodeWriter_Init(&ctx.writer);
15405 ctx.writer.min_length = ctx.fmtcnt + 100;
15406 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015407
Guido van Rossumd57fd912000-03-10 22:53:23 +000015408 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015409 ctx.arglen = PyTuple_Size(args);
15410 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015411 }
15412 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015413 ctx.arglen = -1;
15414 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015415 }
Victor Stinnera47082312012-10-04 02:19:54 +020015416 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015417 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015418 ctx.dict = args;
15419 else
15420 ctx.dict = NULL;
15421 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015422
Victor Stinnera47082312012-10-04 02:19:54 +020015423 while (--ctx.fmtcnt >= 0) {
15424 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015425 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015426
15427 nonfmtpos = ctx.fmtpos++;
15428 while (ctx.fmtcnt >= 0 &&
15429 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15430 ctx.fmtpos++;
15431 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015432 }
Victor Stinnera47082312012-10-04 02:19:54 +020015433 if (ctx.fmtcnt < 0) {
15434 ctx.fmtpos--;
15435 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015436 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015437
Victor Stinnercfc4c132013-04-03 01:48:39 +020015438 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15439 nonfmtpos, ctx.fmtpos) < 0)
15440 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015441 }
15442 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015443 ctx.fmtpos++;
15444 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015445 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015446 }
15447 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015448
Victor Stinnera47082312012-10-04 02:19:54 +020015449 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015450 PyErr_SetString(PyExc_TypeError,
15451 "not all arguments converted during string formatting");
15452 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015453 }
15454
Victor Stinnera47082312012-10-04 02:19:54 +020015455 if (ctx.args_owned) {
15456 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015457 }
Victor Stinnera47082312012-10-04 02:19:54 +020015458 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015459
Benjamin Peterson29060642009-01-31 22:14:21 +000015460 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015461 _PyUnicodeWriter_Dealloc(&ctx.writer);
15462 if (ctx.args_owned) {
15463 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015464 }
15465 return NULL;
15466}
15467
Jeremy Hylton938ace62002-07-17 16:30:39 +000015468static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015469unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15470
15471/*[clinic input]
15472@classmethod
15473str.__new__ as unicode_new
15474
15475 object as x: object = NULL
15476 encoding: str = NULL
15477 errors: str = NULL
15478
15479[clinic start generated code]*/
Guido van Rossume023fe02001-08-30 03:12:59 +000015480
Tim Peters6d6c1a32001-08-02 04:15:00 +000015481static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015482unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15483 const char *errors)
15484/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
Tim Peters6d6c1a32001-08-02 04:15:00 +000015485{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015486 PyObject *unicode;
15487 if (x == NULL) {
15488 unicode = unicode_new_empty();
15489 }
15490 else if (encoding == NULL && errors == NULL) {
15491 unicode = PyObject_Str(x);
15492 }
15493 else {
15494 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15495 }
Tim Peters6d6c1a32001-08-02 04:15:00 +000015496
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015497 if (unicode != NULL && type != &PyUnicode_Type) {
15498 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15499 }
15500 return unicode;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015501}
15502
Guido van Rossume023fe02001-08-30 03:12:59 +000015503static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015504unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
Guido van Rossume023fe02001-08-30 03:12:59 +000015505{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015506 PyObject *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015507 Py_ssize_t length, char_size;
15508 int share_wstr, share_utf8;
15509 unsigned int kind;
15510 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015511
Benjamin Peterson14339b62009-01-31 16:36:08 +000015512 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner910337b2011-10-03 03:20:16 +020015513 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015514 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015515 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015516 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015517
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015518 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015519 if (self == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015520 return NULL;
15521 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015522 kind = PyUnicode_KIND(unicode);
15523 length = PyUnicode_GET_LENGTH(unicode);
15524
15525 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015526#ifdef Py_DEBUG
15527 _PyUnicode_HASH(self) = -1;
15528#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015529 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015530#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015531 _PyUnicode_STATE(self).interned = 0;
15532 _PyUnicode_STATE(self).kind = kind;
15533 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015534 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015535 _PyUnicode_STATE(self).ready = 1;
15536 _PyUnicode_WSTR(self) = NULL;
15537 _PyUnicode_UTF8_LENGTH(self) = 0;
15538 _PyUnicode_UTF8(self) = NULL;
15539 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015540 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015541
15542 share_utf8 = 0;
15543 share_wstr = 0;
15544 if (kind == PyUnicode_1BYTE_KIND) {
15545 char_size = 1;
15546 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15547 share_utf8 = 1;
15548 }
15549 else if (kind == PyUnicode_2BYTE_KIND) {
15550 char_size = 2;
15551 if (sizeof(wchar_t) == 2)
15552 share_wstr = 1;
15553 }
15554 else {
15555 assert(kind == PyUnicode_4BYTE_KIND);
15556 char_size = 4;
15557 if (sizeof(wchar_t) == 4)
15558 share_wstr = 1;
15559 }
15560
15561 /* Ensure we won't overflow the length. */
15562 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15563 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015564 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015565 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015566 data = PyObject_MALLOC((length + 1) * char_size);
15567 if (data == NULL) {
15568 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015569 goto onError;
15570 }
15571
Victor Stinnerc3c74152011-10-02 20:39:55 +020015572 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015573 if (share_utf8) {
15574 _PyUnicode_UTF8_LENGTH(self) = length;
15575 _PyUnicode_UTF8(self) = data;
15576 }
15577 if (share_wstr) {
15578 _PyUnicode_WSTR_LENGTH(self) = length;
15579 _PyUnicode_WSTR(self) = (wchar_t *)data;
15580 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015581
Christian Heimesf051e432016-09-13 20:22:02 +020015582 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015583 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015584 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015585#ifdef Py_DEBUG
15586 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15587#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +010015588 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015589
15590onError:
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015591 Py_DECREF(self);
15592 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015593}
15594
/* Docstring of the str type: installed as tp_doc of PyUnicode_Type. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015595PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015596"str(object='') -> str\n\
15597str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015598\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015599Create a new string object from the given object. If encoding or\n\
15600errors is specified, then the object must expose a data buffer\n\
15601that will be decoded using the given encoding and error handler.\n\
15602Otherwise, returns the result of object.__str__() (if defined)\n\
15603or repr(object).\n\
15604encoding defaults to sys.getdefaultencoding().\n\
15605errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015606
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015607static PyObject *unicode_iter(PyObject *seq);
15608
/* Type object of str.  The slots are filled positionally; each line names
   its slot in a trailing comment.  Instances are created by unicode_new
   (tp_new), destroyed by unicode_dealloc (tp_dealloc) and freed with
   PyObject_Del (tp_free).  tp_alloc is left at 0 here. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015609PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015610 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015611 "str", /* tp_name */
15612 sizeof(PyUnicodeObject), /* tp_basicsize */
15613 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015614 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015615 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015616 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015617 0, /* tp_getattr */
15618 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015619 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015620 unicode_repr, /* tp_repr */
15621 &unicode_as_number, /* tp_as_number */
15622 &unicode_as_sequence, /* tp_as_sequence */
15623 &unicode_as_mapping, /* tp_as_mapping */
15624 (hashfunc) unicode_hash, /* tp_hash*/
15625 0, /* tp_call*/
15626 (reprfunc) unicode_str, /* tp_str */
15627 PyObject_GenericGetAttr, /* tp_getattro */
15628 0, /* tp_setattro */
15629 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015630 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015631 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15632 unicode_doc, /* tp_doc */
15633 0, /* tp_traverse */
15634 0, /* tp_clear */
15635 PyUnicode_RichCompare, /* tp_richcompare */
15636 0, /* tp_weaklistoffset */
15637 unicode_iter, /* tp_iter */
15638 0, /* tp_iternext */
15639 unicode_methods, /* tp_methods */
15640 0, /* tp_members */
15641 0, /* tp_getset */
15642 &PyBaseObject_Type, /* tp_base */
15643 0, /* tp_dict */
15644 0, /* tp_descr_get */
15645 0, /* tp_descr_set */
15646 0, /* tp_dictoffset */
15647 0, /* tp_init */
15648 0, /* tp_alloc */
15649 unicode_new, /* tp_new */
15650 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015651};
15652
15653/* Initialize the Unicode implementation */
15654
Victor Stinner331a6a52019-05-27 16:39:22 +020015655PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015656_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015657{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015658 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015659 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015660 0x000A, /* LINE FEED */
15661 0x000D, /* CARRIAGE RETURN */
15662 0x001C, /* FILE SEPARATOR */
15663 0x001D, /* GROUP SEPARATOR */
15664 0x001E, /* RECORD SEPARATOR */
15665 0x0085, /* NEXT LINE */
15666 0x2028, /* LINE SEPARATOR */
15667 0x2029, /* PARAGRAPH SEPARATOR */
15668 };
15669
Victor Stinner91698d82020-06-25 14:07:40 +020015670 struct _Py_unicode_state *state = &tstate->interp->unicode;
15671 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015672 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015673 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015674
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015675 if (_Py_IsMainInterpreter(tstate)) {
15676 /* initialize the linebreak bloom filter */
15677 bloom_linebreak = make_bloom_mask(
15678 PyUnicode_2BYTE_KIND, linebreak,
15679 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015680
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015681 if (PyType_Ready(&PyUnicode_Type) < 0) {
15682 return _PyStatus_ERR("Can't initialize unicode type");
15683 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015684
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015685 if (PyType_Ready(&EncodingMapType) < 0) {
15686 return _PyStatus_ERR("Can't initialize encoding map type");
15687 }
15688 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15689 return _PyStatus_ERR("Can't initialize field name iterator type");
15690 }
15691 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15692 return _PyStatus_ERR("Can't initialize formatter iter type");
15693 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015694 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015695 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015696}
15697
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015698
Walter Dörwald16807132007-05-25 13:52:07 +000015699void
15700PyUnicode_InternInPlace(PyObject **p)
15701{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015702 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015703#ifdef Py_DEBUG
15704 assert(s != NULL);
15705 assert(_PyUnicode_CHECK(s));
15706#else
Victor Stinner607b1022020-05-05 18:50:30 +020015707 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015708 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015709 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015710#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015711
Benjamin Peterson14339b62009-01-31 16:36:08 +000015712 /* If it's a subclass, we don't really know what putting
15713 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015714 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015715 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015716 }
15717
15718 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015719 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015720 }
15721
15722#ifdef INTERNED_STRINGS
Victor Stinner666ecfb2020-07-02 01:19:57 +020015723 if (PyUnicode_READY(s) == -1) {
15724 PyErr_Clear();
15725 return;
15726 }
15727
Benjamin Peterson14339b62009-01-31 16:36:08 +000015728 if (interned == NULL) {
15729 interned = PyDict_New();
15730 if (interned == NULL) {
15731 PyErr_Clear(); /* Don't leave an exception */
15732 return;
15733 }
15734 }
Victor Stinner607b1022020-05-05 18:50:30 +020015735
15736 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015737 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015738 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015739 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015740
Berker Peksagced8d4c2016-07-25 04:40:39 +030015741 if (t == NULL) {
15742 PyErr_Clear();
15743 return;
15744 }
Victor Stinner607b1022020-05-05 18:50:30 +020015745
Berker Peksagced8d4c2016-07-25 04:40:39 +030015746 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015747 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015748 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015749 return;
15750 }
Victor Stinner607b1022020-05-05 18:50:30 +020015751
Victor Stinner3549ca32020-07-03 16:59:12 +020015752 /* The two references in interned dict (key and value) are not counted by
15753 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15754 this. */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015755 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015756 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015757#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015758}
15759
15760void
15761PyUnicode_InternImmortal(PyObject **p)
15762{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015763 PyUnicode_InternInPlace(p);
15764 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015765 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015766 Py_INCREF(*p);
15767 }
Walter Dörwald16807132007-05-25 13:52:07 +000015768}
15769
15770PyObject *
15771PyUnicode_InternFromString(const char *cp)
15772{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015773 PyObject *s = PyUnicode_FromString(cp);
15774 if (s == NULL)
15775 return NULL;
15776 PyUnicode_InternInPlace(&s);
15777 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015778}
15779
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015780
Victor Stinner666ecfb2020-07-02 01:19:57 +020015781void
15782_PyUnicode_ClearInterned(PyThreadState *tstate)
Walter Dörwald16807132007-05-25 13:52:07 +000015783{
Victor Stinner666ecfb2020-07-02 01:19:57 +020015784 if (!_Py_IsMainInterpreter(tstate)) {
15785 // interned dict is shared by all interpreters
Benjamin Peterson14339b62009-01-31 16:36:08 +000015786 return;
15787 }
Walter Dörwald16807132007-05-25 13:52:07 +000015788
Victor Stinner666ecfb2020-07-02 01:19:57 +020015789 if (interned == NULL) {
15790 return;
15791 }
15792 assert(PyDict_CheckExact(interned));
15793
15794 PyObject *keys = PyDict_Keys(interned);
15795 if (keys == NULL) {
15796 PyErr_Clear();
15797 return;
15798 }
15799 assert(PyList_CheckExact(keys));
15800
15801 /* Interned unicode strings are not forcibly deallocated; rather, we give
15802 them their stolen references back, and then clear and DECREF the
15803 interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015804
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015805 Py_ssize_t n = PyList_GET_SIZE(keys);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015806#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015807 fprintf(stderr, "releasing %zd interned strings\n", n);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015808
15809 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015810#endif
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015811 for (Py_ssize_t i = 0; i < n; i++) {
15812 PyObject *s = PyList_GET_ITEM(keys, i);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015813 assert(PyUnicode_IS_READY(s));
15814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015815 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015816 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015817 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015818#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015819 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015820#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015821 break;
15822 case SSTATE_INTERNED_MORTAL:
Victor Stinner3549ca32020-07-03 16:59:12 +020015823 // Restore the two references (key and value) ignored
15824 // by PyUnicode_InternInPlace().
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015825 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015826#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015827 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015828#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015829 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015830 case SSTATE_NOT_INTERNED:
15831 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015832 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015833 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015835 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015836 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015837#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015838 fprintf(stderr,
15839 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15840 mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015841#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015842 Py_DECREF(keys);
Victor Stinner666ecfb2020-07-02 01:19:57 +020015843
Benjamin Peterson14339b62009-01-31 16:36:08 +000015844 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015845 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015846}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015847
15848
15849/********************* Unicode Iterator **************************/
15850
/* State of a str iterator: the string being iterated over and the
   index of the next character to return. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;  /* index of the next character to yield */
    PyObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15856
/* tp_dealloc of the str iterator: untrack it from the GC, release the
   string it iterates over (if any) and free the iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    /* it_seq is NULL once the iterator has been exhausted */
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15864
/* tp_traverse of the str iterator: the iterated string is the only
   object reference held by the iterator. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15871
15872static PyObject *
15873unicodeiter_next(unicodeiterobject *it)
15874{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015875 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015876
Benjamin Peterson14339b62009-01-31 16:36:08 +000015877 assert(it != NULL);
15878 seq = it->it_seq;
15879 if (seq == NULL)
15880 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015881 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015883 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15884 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015885 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015886 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15887 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015888 if (item != NULL)
15889 ++it->it_index;
15890 return item;
15891 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015892
Benjamin Peterson14339b62009-01-31 16:36:08 +000015893 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015894 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015895 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015896}
15897
15898static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015899unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015900{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015901 Py_ssize_t len = 0;
15902 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015903 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015904 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015905}
15906
15907PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15908
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015909static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015910unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015911{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015912 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015913 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015914 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015915 it->it_seq, it->it_index);
15916 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015917 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015918 if (u == NULL)
15919 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015920 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015921 }
15922}
15923
15924PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15925
15926static PyObject *
15927unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15928{
15929 Py_ssize_t index = PyLong_AsSsize_t(state);
15930 if (index == -1 && PyErr_Occurred())
15931 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015932 if (it->it_seq != NULL) {
15933 if (index < 0)
15934 index = 0;
15935 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15936 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15937 it->it_index = index;
15938 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015939 Py_RETURN_NONE;
15940}
15941
15942PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15943
/* Method table of the str iterator type (used as its tp_methods). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
15953
15954PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015955 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15956 "str_iterator", /* tp_name */
15957 sizeof(unicodeiterobject), /* tp_basicsize */
15958 0, /* tp_itemsize */
15959 /* methods */
15960 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015961 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015962 0, /* tp_getattr */
15963 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015964 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015965 0, /* tp_repr */
15966 0, /* tp_as_number */
15967 0, /* tp_as_sequence */
15968 0, /* tp_as_mapping */
15969 0, /* tp_hash */
15970 0, /* tp_call */
15971 0, /* tp_str */
15972 PyObject_GenericGetAttr, /* tp_getattro */
15973 0, /* tp_setattro */
15974 0, /* tp_as_buffer */
15975 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15976 0, /* tp_doc */
15977 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15978 0, /* tp_clear */
15979 0, /* tp_richcompare */
15980 0, /* tp_weaklistoffset */
15981 PyObject_SelfIter, /* tp_iter */
15982 (iternextfunc)unicodeiter_next, /* tp_iternext */
15983 unicodeiter_methods, /* tp_methods */
15984 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015985};
15986
15987static PyObject *
15988unicode_iter(PyObject *seq)
15989{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015990 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015991
Benjamin Peterson14339b62009-01-31 16:36:08 +000015992 if (!PyUnicode_Check(seq)) {
15993 PyErr_BadInternalCall();
15994 return NULL;
15995 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015996 if (PyUnicode_READY(seq) == -1)
15997 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015998 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15999 if (it == NULL)
16000 return NULL;
16001 it->it_index = 0;
16002 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020016003 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016004 _PyObject_GC_TRACK(it);
16005 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016006}
16007
Victor Stinner709d23d2019-05-02 14:56:30 -040016008static int
16009encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016010{
Victor Stinner709d23d2019-05-02 14:56:30 -040016011 int res;
16012 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16013 if (res == -2) {
16014 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16015 return -1;
16016 }
16017 if (res < 0) {
16018 PyErr_NoMemory();
16019 return -1;
16020 }
16021 return 0;
16022}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016023
Victor Stinner709d23d2019-05-02 14:56:30 -040016024
16025static int
16026config_get_codec_name(wchar_t **config_encoding)
16027{
16028 char *encoding;
16029 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16030 return -1;
16031 }
16032
16033 PyObject *name_obj = NULL;
16034 PyObject *codec = _PyCodec_Lookup(encoding);
16035 PyMem_RawFree(encoding);
16036
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016037 if (!codec)
16038 goto error;
16039
16040 name_obj = PyObject_GetAttrString(codec, "name");
16041 Py_CLEAR(codec);
16042 if (!name_obj) {
16043 goto error;
16044 }
16045
Victor Stinner709d23d2019-05-02 14:56:30 -040016046 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16047 Py_DECREF(name_obj);
16048 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016049 goto error;
16050 }
16051
Victor Stinner709d23d2019-05-02 14:56:30 -040016052 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16053 if (raw_wname == NULL) {
16054 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016055 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016056 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016057 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016058
16059 PyMem_RawFree(*config_encoding);
16060 *config_encoding = raw_wname;
16061
16062 PyMem_Free(wname);
16063 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016064
16065error:
16066 Py_XDECREF(codec);
16067 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016068 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016069}
16070
16071
Victor Stinner331a6a52019-05-27 16:39:22 +020016072static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016073init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016074{
Victor Stinner709d23d2019-05-02 14:56:30 -040016075 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016076 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016077 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016078 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016079 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016080 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016081 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016082}
16083
16084
Victor Stinner709d23d2019-05-02 14:56:30 -040016085static int
16086init_fs_codec(PyInterpreterState *interp)
16087{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016088 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016089
16090 _Py_error_handler error_handler;
16091 error_handler = get_error_handler_wide(config->filesystem_errors);
16092 if (error_handler == _Py_ERROR_UNKNOWN) {
16093 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16094 return -1;
16095 }
16096
16097 char *encoding, *errors;
16098 if (encode_wstr_utf8(config->filesystem_encoding,
16099 &encoding,
16100 "filesystem_encoding") < 0) {
16101 return -1;
16102 }
16103
16104 if (encode_wstr_utf8(config->filesystem_errors,
16105 &errors,
16106 "filesystem_errors") < 0) {
16107 PyMem_RawFree(encoding);
16108 return -1;
16109 }
16110
Victor Stinner3d17c042020-05-14 01:48:38 +020016111 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16112 PyMem_RawFree(fs_codec->encoding);
16113 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016114 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016115 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16116 PyMem_RawFree(fs_codec->errors);
16117 fs_codec->errors = errors;
16118 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016119
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016120#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016121 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016122#endif
16123
Victor Stinner709d23d2019-05-02 14:56:30 -040016124 /* At this point, PyUnicode_EncodeFSDefault() and
16125 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16126 the C implementation of the filesystem encoding. */
16127
16128 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16129 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016130 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16131 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016132 PyErr_NoMemory();
16133 return -1;
16134 }
16135 return 0;
16136}
16137
16138
Victor Stinner331a6a52019-05-27 16:39:22 +020016139static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016140init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016141{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016142 PyInterpreterState *interp = tstate->interp;
16143
Victor Stinner709d23d2019-05-02 14:56:30 -040016144 /* Update the filesystem encoding to the normalized Python codec name.
16145 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16146 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016147 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016148 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016149 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016150 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016151 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016152 }
16153
Victor Stinner709d23d2019-05-02 14:56:30 -040016154 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016155 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016156 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016157 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016158}
16159
16160
Victor Stinner331a6a52019-05-27 16:39:22 +020016161PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016162_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016163{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016164 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016165 if (_PyStatus_EXCEPTION(status)) {
16166 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016167 }
16168
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016169 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016170}
16171
16172
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016173static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016174_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016175{
Victor Stinner3d17c042020-05-14 01:48:38 +020016176 PyMem_RawFree(fs_codec->encoding);
16177 fs_codec->encoding = NULL;
16178 fs_codec->utf8 = 0;
16179 PyMem_RawFree(fs_codec->errors);
16180 fs_codec->errors = NULL;
16181 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016182}
16183
16184
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Returns 0 on success, -1 with an exception set on failure.  The
   configuration is left untouched if the new strings cannot be
   allocated. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacements up front so that the configuration is
       only modified when nothing can fail anymore. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_encoding = new_encoding;
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16210
16211
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016212void
Victor Stinner3d483342019-11-22 12:27:50 +010016213_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016214{
Victor Stinner666ecfb2020-07-02 01:19:57 +020016215 // _PyUnicode_ClearInterned() must be called before
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016216
Victor Stinner666ecfb2020-07-02 01:19:57 +020016217 struct _Py_unicode_state *state = &tstate->interp->unicode;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016218
Victor Stinner91698d82020-06-25 14:07:40 +020016219 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016220
Victor Stinner2f9ada92020-06-24 02:22:21 +020016221 for (Py_ssize_t i = 0; i < 256; i++) {
16222 Py_CLEAR(state->latin1[i]);
16223 }
16224
Victor Stinner666ecfb2020-07-02 01:19:57 +020016225 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016226 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016227 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016228
Victor Stinner3d17c042020-05-14 01:48:38 +020016229 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016230}
16231
16232
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Functions exported by the _string module; each one takes a single
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}  /* sentinel */
};
16243
16244static struct PyModuleDef _string_module = {
16245 PyModuleDef_HEAD_INIT,
16246 "_string",
16247 PyDoc_STR("string helper module"),
16248 0,
16249 _string_methods,
16250 NULL,
16251 NULL,
16252 NULL,
16253 NULL
16254};
16255
/* Initialization function of the _string module: create the module
   object from _string_module. */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModule_Create(&_string_module);
}
16261
16262
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016263#ifdef __cplusplus
16264}
16265#endif