/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner91698d82020-06-25 14:07:40 +020044#include "pycore_bytes_methods.h" // _Py_bytes_lower()
45#include "pycore_initconfig.h" // _PyStatus_OK()
Victor Stinnere5014be2020-04-14 17:52:15 +020046#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinner91698d82020-06-25 14:07:40 +020047#include "pycore_object.h" // _PyObject_GC_TRACK()
48#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
49#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
Victor Stinnere5014be2020-04-14 17:52:15 +020050#include "pycore_pystate.h" // _PyInterpreterState_GET()
Victor Stinner91698d82020-06-25 14:07:40 +020051#include "ucnhash.h" // _PyUnicode_Name_CAPI
52#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000054#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000055#include <windows.h>
56#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000057
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
60/* #define INTERNED_STATS 1 */
61
62
Larry Hastings61272b72014-01-07 12:41:53 -080063/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090064class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080065[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090066/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
67
68/*[python input]
69class Py_UCS4_converter(CConverter):
70 type = 'Py_UCS4'
71 converter = 'convert_uc'
72
73 def converter_init(self):
74 if self.default is not unspecified:
75 self.c_default = ascii(self.default)
76 if len(self.c_default) > 4 or self.c_default[0] != "'":
77 self.c_default = hex(ord(self.default))
78
79[python start generated code]*/
80/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080081
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000082/* --- Globals ------------------------------------------------------------
83
Serhiy Storchaka05997252013-01-26 12:14:02 +020084NOTE: In the interpreter's initialization phase, some globals are currently
85 initialized dynamically as needed. In the process Unicode objects may
86 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Victor Stinner8faf8212011-12-08 22:14:11 +010095/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
96#define MAX_UNICODE 0x10ffff
97
/* Sanity check used by the accessor macros below: a full consistency check
   in debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 field of a compact (or larger) object. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 pointer of a ready string: for compact ASCII strings this is the
   memory located right after the PyASCIIObject header, otherwise the utf8
   field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the string length for compact ASCII
   strings, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked access to the wstr field. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900122
/* Don't use the deprecated macro of unicodeobject.h: replace it with a local
   definition.

   Length of the wstr representation: the string length for compact ASCII
   strings, otherwise the wstr_length field.

   The argument is parenthesized inside the casts so that any expression
   (for example a conditional expression) can be passed as "op". */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Unchecked field accessors. They expand to lvalues. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Same as the state.kind and length fields, but the object is checked with
   _PyUnicode_CHECK() first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data pointer of a non-compact object. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* Local replacement of PyUnicode_READY(): evaluates to 0 if the string is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* True if the UTF-8 pointer aliases the character data. Must not be used
   on compact ASCII strings. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer aliases the character data of "op".

   Both sides of the comparison use the macro argument; the macro does not
   depend on a variable named "unicode" being in scope at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
160
/* True if the Unicode object has an allocated UTF-8 memory block that is
   not shared with other data: the object is not compact ASCII, the UTF-8
   pointer is set, and it differs from the character data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block that is
   not shared with other data: the wstr pointer is set and either the
   string is not ready or the pointer differs from the character data
   pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
174
/* Generic helper macro converting characters from one integer type to
   another.

   from_type and to_type must be valid type names. begin and end delimit the
   source characters (pointers to "from_type"); to points to the destination
   buffer ("to_type *"), which must have room for end - begin characters.
   Each character is converted with a plain cast. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t n = _end - _iter;                                    \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                          \
        /* Main loop: four characters per iteration. */                 \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {           \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        /* Tail: the characters left over after the unrolled loop. */   \
        for (; _iter < _end; ++_iter, ++_to) {                          \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200198
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200199#ifdef MS_WINDOWS
200 /* On Windows, overallocate by 50% is the best factor */
201# define OVERALLOCATE_FACTOR 2
202#else
203 /* On Linux, overallocate by 25% is the best factor */
204# define OVERALLOCATE_FACTOR 4
205#endif
206
Victor Stinner607b1022020-05-05 18:50:30 +0200207/* bpo-40521: Interned strings are shared by all interpreters. */
208#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
209# define INTERNED_STRINGS
210#endif
211
Walter Dörwald16807132007-05-25 13:52:07 +0000212/* This dictionary holds all interned unicode strings. Note that references
213 to strings in this dictionary are *not* counted in the string's ob_refcnt.
214 When the interned string reaches a refcnt of 0 the string deallocation
215 function will delete the reference from this dictionary.
216
217 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000218 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000219*/
Victor Stinner607b1022020-05-05 18:50:30 +0200220#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200221static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200222#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000223
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200224static struct _Py_unicode_state*
225get_unicode_state(void)
226{
227 PyInterpreterState *interp = _PyInterpreterState_GET();
228 return &interp->unicode;
229}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200230
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200232// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200233static inline PyObject* unicode_get_empty(void)
234{
235 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200236 // unicode_get_empty() must not be called before _PyUnicode_Init()
237 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200238 assert(state->empty_string != NULL);
239 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200240}
241
Victor Stinner91698d82020-06-25 14:07:40 +0200242
243// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200244static inline PyObject* unicode_new_empty(void)
245{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200246 PyObject *empty = unicode_get_empty();
247 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200248 return empty;
249}
250
251#define _Py_RETURN_UNICODE_EMPTY() \
252 do { \
253 return unicode_new_empty(); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200254 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000255
Victor Stinner59423e32018-11-26 13:40:01 +0100256static inline void
257unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
258 Py_ssize_t start, Py_ssize_t length)
259{
260 assert(0 <= start);
261 assert(kind != PyUnicode_WCHAR_KIND);
262 switch (kind) {
263 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100264 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100265 Py_UCS1 ch = (unsigned char)value;
266 Py_UCS1 *to = (Py_UCS1 *)data + start;
267 memset(to, ch, length);
268 break;
269 }
270 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100271 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100272 Py_UCS2 ch = (Py_UCS2)value;
273 Py_UCS2 *to = (Py_UCS2 *)data + start;
274 const Py_UCS2 *end = to + length;
275 for (; to < end; ++to) *to = ch;
276 break;
277 }
278 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100279 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100280 Py_UCS4 ch = value;
281 Py_UCS4 * to = (Py_UCS4 *)data + start;
282 const Py_UCS4 *end = to + length;
283 for (; to < end; ++to) *to = ch;
284 break;
285 }
286 default: Py_UNREACHABLE();
287 }
288}
289
290
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200291/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700292static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200293_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900294static inline void
295_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400296static PyObject *
297unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
298 const char *errors);
299static PyObject *
300unicode_decode_utf8(const char *s, Py_ssize_t size,
301 _Py_error_handler error_handler, const char *errors,
302 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200303
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200304/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200305static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200306
/* Fast detection of the most frequent whitespace characters: table indexed
   by an ASCII code point (0x00-0x7F); an entry is 1 if the character is
   whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
       0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F
       0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
337
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200338/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200339static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200340static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100341static int unicode_modifiable(PyObject *unicode);
342
Victor Stinnerfe226c02011-10-03 03:52:20 +0200343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100345_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200346static PyObject *
347_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
348static PyObject *
349_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
350
351static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000353 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100354 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000355 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
356
Alexander Belopolsky40018472011-02-26 01:02:56 +0000357static void
358raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300359 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100360 PyObject *unicode,
361 Py_ssize_t startpos, Py_ssize_t endpos,
362 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000363
/* Same for linebreaks: table indexed by an ASCII code point (0x00-0x7F);
   an entry is 1 if the character is a line break and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F
       0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
391
INADA Naoki3ae20562017-01-16 20:41:20 +0900392static int convert_uc(PyObject *obj, void *addr);
393
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300394#include "clinic/unicodeobject.c.h"
395
Victor Stinner3d4226a2018-08-29 22:21:32 +0200396_Py_error_handler
397_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200398{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200400 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200401 }
402 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200403 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200404 }
405 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200406 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200407 }
408 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200409 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200410 }
411 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200412 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200413 }
414 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200415 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200416 }
417 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200418 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200419 }
Victor Stinner50149202015-09-22 00:26:54 +0200420 return _Py_ERROR_OTHER;
421}
422
Victor Stinner709d23d2019-05-02 14:56:30 -0400423
424static _Py_error_handler
425get_error_handler_wide(const wchar_t *errors)
426{
427 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428 return _Py_ERROR_STRICT;
429 }
430 if (wcscmp(errors, L"surrogateescape") == 0) {
431 return _Py_ERROR_SURROGATEESCAPE;
432 }
433 if (wcscmp(errors, L"replace") == 0) {
434 return _Py_ERROR_REPLACE;
435 }
436 if (wcscmp(errors, L"ignore") == 0) {
437 return _Py_ERROR_IGNORE;
438 }
439 if (wcscmp(errors, L"backslashreplace") == 0) {
440 return _Py_ERROR_BACKSLASHREPLACE;
441 }
442 if (wcscmp(errors, L"surrogatepass") == 0) {
443 return _Py_ERROR_SURROGATEPASS;
444 }
445 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446 return _Py_ERROR_XMLCHARREFREPLACE;
447 }
448 return _Py_ERROR_OTHER;
449}
450
451
Victor Stinner22eb6892019-06-26 00:51:05 +0200452static inline int
453unicode_check_encoding_errors(const char *encoding, const char *errors)
454{
455 if (encoding == NULL && errors == NULL) {
456 return 0;
457 }
458
Victor Stinner81a7be32020-04-14 15:14:01 +0200459 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200460#ifndef Py_DEBUG
461 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200462 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200463 return 0;
464 }
465#else
466 /* Always check in debug mode */
467#endif
468
469 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200471 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200472 return 0;
473 }
474
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200475 /* Disable checks during Python finalization. For example, it allows to
476 call _PyObject_Dump() during finalization for debugging purpose. */
477 if (interp->finalizing) {
478 return 0;
479 }
480
Victor Stinner22eb6892019-06-26 00:51:05 +0200481 if (encoding != NULL) {
482 PyObject *handler = _PyCodec_Lookup(encoding);
483 if (handler == NULL) {
484 return -1;
485 }
486 Py_DECREF(handler);
487 }
488
489 if (errors != NULL) {
490 PyObject *handler = PyCodec_LookupError(errors);
491 if (handler == NULL) {
492 return -1;
493 }
494 Py_DECREF(handler);
495 }
496 return 0;
497}
498
499
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200500int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100501_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200502{
Victor Stinner68762572019-10-07 18:42:01 +0200503#define CHECK(expr) \
504 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
505
Victor Stinner910337b2011-10-03 03:20:16 +0200506 PyASCIIObject *ascii;
507 unsigned int kind;
508
Victor Stinner68762572019-10-07 18:42:01 +0200509 assert(op != NULL);
510 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200511
512 ascii = (PyASCIIObject *)op;
513 kind = ascii->state.kind;
514
Victor Stinnera3b334d2011-10-03 13:53:37 +0200515 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200516 CHECK(kind == PyUnicode_1BYTE_KIND);
517 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200518 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200519 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200520 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200521 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200522
Victor Stinnera41463c2011-10-04 01:05:08 +0200523 if (ascii->state.compact == 1) {
524 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200526 || kind == PyUnicode_2BYTE_KIND
527 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200528 CHECK(ascii->state.ascii == 0);
529 CHECK(ascii->state.ready == 1);
530 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100531 }
532 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
534
535 data = unicode->data.any;
536 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200537 CHECK(ascii->length == 0);
538 CHECK(ascii->hash == -1);
539 CHECK(ascii->state.compact == 0);
540 CHECK(ascii->state.ascii == 0);
541 CHECK(ascii->state.ready == 0);
542 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
543 CHECK(ascii->wstr != NULL);
544 CHECK(data == NULL);
545 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200546 }
547 else {
Victor Stinner68762572019-10-07 18:42:01 +0200548 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200549 || kind == PyUnicode_2BYTE_KIND
550 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200551 CHECK(ascii->state.compact == 0);
552 CHECK(ascii->state.ready == 1);
553 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200554 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200555 CHECK(compact->utf8 == data);
556 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 }
558 else
Victor Stinner68762572019-10-07 18:42:01 +0200559 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 }
561 }
562 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200563 if (
564#if SIZEOF_WCHAR_T == 2
565 kind == PyUnicode_2BYTE_KIND
566#else
567 kind == PyUnicode_4BYTE_KIND
568#endif
569 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200570 {
Victor Stinner68762572019-10-07 18:42:01 +0200571 CHECK(ascii->wstr == data);
572 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200573 } else
Victor Stinner68762572019-10-07 18:42:01 +0200574 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200575 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200576
577 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200578 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200579 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200580 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200581 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200582
583 /* check that the best kind is used: O(n) operation */
584 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200585 Py_ssize_t i;
586 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300587 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200588 Py_UCS4 ch;
589
590 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 for (i=0; i < ascii->length; i++)
592 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200593 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200594 if (ch > maxchar)
595 maxchar = ch;
596 }
597 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100598 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200599 CHECK(maxchar >= 128);
600 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100601 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200602 else
Victor Stinner68762572019-10-07 18:42:01 +0200603 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200604 }
Victor Stinner77faf692011-11-20 18:56:05 +0100605 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200606 CHECK(maxchar >= 0x100);
607 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100608 }
609 else {
Victor Stinner68762572019-10-07 18:42:01 +0200610 CHECK(maxchar >= 0x10000);
611 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100612 }
Victor Stinner68762572019-10-07 18:42:01 +0200613 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200614 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400615 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200616
617#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400618}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200619
Victor Stinner910337b2011-10-03 03:20:16 +0200620
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100621static PyObject*
622unicode_result_wchar(PyObject *unicode)
623{
624#ifndef Py_DEBUG
625 Py_ssize_t len;
626
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100627 len = _PyUnicode_WSTR_LENGTH(unicode);
628 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100629 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200630 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 }
632
633 if (len == 1) {
634 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100635 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200637 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 }
639 }
640
641 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200642 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100643 return NULL;
644 }
645#else
Victor Stinneraa771272012-10-04 02:32:58 +0200646 assert(Py_REFCNT(unicode) == 1);
647
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100648 /* don't make the result ready in debug mode to ensure that the caller
649 makes the string ready before using it */
650 assert(_PyUnicode_CheckConsistency(unicode, 1));
651#endif
652 return unicode;
653}
654
655static PyObject*
656unicode_result_ready(PyObject *unicode)
657{
658 Py_ssize_t length;
659
660 length = PyUnicode_GET_LENGTH(unicode);
661 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200662 PyObject *empty = unicode_get_empty();
663 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100664 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200665 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100666 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200667 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100668 }
669
670 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200671 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200672 if (kind == PyUnicode_1BYTE_KIND) {
673 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
674 Py_UCS1 ch = data[0];
675 struct _Py_unicode_state *state = get_unicode_state();
676 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100677 if (latin1_char != NULL) {
678 if (unicode != latin1_char) {
679 Py_INCREF(latin1_char);
680 Py_DECREF(unicode);
681 }
682 return latin1_char;
683 }
684 else {
685 assert(_PyUnicode_CheckConsistency(unicode, 1));
686 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200687 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100688 return unicode;
689 }
690 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200691 else {
692 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
693 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100694 }
695
696 assert(_PyUnicode_CheckConsistency(unicode, 1));
697 return unicode;
698}
699
700static PyObject*
701unicode_result(PyObject *unicode)
702{
703 assert(_PyUnicode_CHECK(unicode));
704 if (PyUnicode_IS_READY(unicode))
705 return unicode_result_ready(unicode);
706 else
707 return unicode_result_wchar(unicode);
708}
709
Victor Stinnerc4b49542011-12-11 22:44:26 +0100710static PyObject*
711unicode_result_unchanged(PyObject *unicode)
712{
713 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500714 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100715 return NULL;
716 Py_INCREF(unicode);
717 return unicode;
718 }
719 else
720 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100721 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100722}
723
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200724/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
725 ASCII, Latin1, UTF-8, etc. */
726static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200727backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200728 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
729{
Victor Stinnerad771582015-10-09 12:38:53 +0200730 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200731 Py_UCS4 ch;
732 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300733 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200734
735 assert(PyUnicode_IS_READY(unicode));
736 kind = PyUnicode_KIND(unicode);
737 data = PyUnicode_DATA(unicode);
738
739 size = 0;
740 /* determine replacement size */
741 for (i = collstart; i < collend; ++i) {
742 Py_ssize_t incr;
743
744 ch = PyUnicode_READ(kind, data, i);
745 if (ch < 0x100)
746 incr = 2+2;
747 else if (ch < 0x10000)
748 incr = 2+4;
749 else {
750 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200751 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200752 }
753 if (size > PY_SSIZE_T_MAX - incr) {
754 PyErr_SetString(PyExc_OverflowError,
755 "encoded result is too long for a Python string");
756 return NULL;
757 }
758 size += incr;
759 }
760
Victor Stinnerad771582015-10-09 12:38:53 +0200761 str = _PyBytesWriter_Prepare(writer, str, size);
762 if (str == NULL)
763 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200764
765 /* generate replacement */
766 for (i = collstart; i < collend; ++i) {
767 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200768 *str++ = '\\';
769 if (ch >= 0x00010000) {
770 *str++ = 'U';
771 *str++ = Py_hexdigits[(ch>>28)&0xf];
772 *str++ = Py_hexdigits[(ch>>24)&0xf];
773 *str++ = Py_hexdigits[(ch>>20)&0xf];
774 *str++ = Py_hexdigits[(ch>>16)&0xf];
775 *str++ = Py_hexdigits[(ch>>12)&0xf];
776 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777 }
Victor Stinner797485e2015-10-09 03:17:30 +0200778 else if (ch >= 0x100) {
779 *str++ = 'u';
780 *str++ = Py_hexdigits[(ch>>12)&0xf];
781 *str++ = Py_hexdigits[(ch>>8)&0xf];
782 }
783 else
784 *str++ = 'x';
785 *str++ = Py_hexdigits[(ch>>4)&0xf];
786 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200787 }
788 return str;
789}
790
791/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
792 ASCII, Latin1, UTF-8, etc. */
793static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200794xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200795 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
796{
Victor Stinnerad771582015-10-09 12:38:53 +0200797 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200798 Py_UCS4 ch;
799 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300800 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200801
802 assert(PyUnicode_IS_READY(unicode));
803 kind = PyUnicode_KIND(unicode);
804 data = PyUnicode_DATA(unicode);
805
806 size = 0;
807 /* determine replacement size */
808 for (i = collstart; i < collend; ++i) {
809 Py_ssize_t incr;
810
811 ch = PyUnicode_READ(kind, data, i);
812 if (ch < 10)
813 incr = 2+1+1;
814 else if (ch < 100)
815 incr = 2+2+1;
816 else if (ch < 1000)
817 incr = 2+3+1;
818 else if (ch < 10000)
819 incr = 2+4+1;
820 else if (ch < 100000)
821 incr = 2+5+1;
822 else if (ch < 1000000)
823 incr = 2+6+1;
824 else {
825 assert(ch <= MAX_UNICODE);
826 incr = 2+7+1;
827 }
828 if (size > PY_SSIZE_T_MAX - incr) {
829 PyErr_SetString(PyExc_OverflowError,
830 "encoded result is too long for a Python string");
831 return NULL;
832 }
833 size += incr;
834 }
835
Victor Stinnerad771582015-10-09 12:38:53 +0200836 str = _PyBytesWriter_Prepare(writer, str, size);
837 if (str == NULL)
838 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200839
840 /* generate replacement */
841 for (i = collstart; i < collend; ++i) {
842 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
843 }
844 return str;
845}
846
Thomas Wouters477c8d52006-05-27 19:21:47 +0000847/* --- Bloom Filters ----------------------------------------------------- */
848
849/* stuff to implement simple "bloom filters" for Unicode characters.
850 to keep things simple, we use a single bitmask, using the least 5
851 bits from each unicode characters as the bit index. */
852
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200853/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000854
Antoine Pitrouf068f942010-01-13 14:19:12 +0000855#if LONG_BIT >= 128
856#define BLOOM_WIDTH 128
857#elif LONG_BIT >= 64
858#define BLOOM_WIDTH 64
859#elif LONG_BIT >= 32
860#define BLOOM_WIDTH 32
861#else
862#error "LONG_BIT is smaller than 32"
863#endif
864
Thomas Wouters477c8d52006-05-27 19:21:47 +0000865#define BLOOM_MASK unsigned long
866
Serhiy Storchaka05997252013-01-26 12:14:02 +0200867static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000868
Antoine Pitrouf068f942010-01-13 14:19:12 +0000869#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000870
Benjamin Peterson29060642009-01-31 22:14:21 +0000871#define BLOOM_LINEBREAK(ch) \
872 ((ch) < 128U ? ascii_linebreak[(ch)] : \
873 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000874
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700875static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300876make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877{
Victor Stinnera85af502013-04-09 21:53:54 +0200878#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
879 do { \
880 TYPE *data = (TYPE *)PTR; \
881 TYPE *end = data + LEN; \
882 Py_UCS4 ch; \
883 for (; data != end; data++) { \
884 ch = *data; \
885 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
886 } \
887 break; \
888 } while (0)
889
Thomas Wouters477c8d52006-05-27 19:21:47 +0000890 /* calculate simple bloom-style bitmask for a given unicode string */
891
Antoine Pitrouf068f942010-01-13 14:19:12 +0000892 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000893
894 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200895 switch (kind) {
896 case PyUnicode_1BYTE_KIND:
897 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
898 break;
899 case PyUnicode_2BYTE_KIND:
900 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
901 break;
902 case PyUnicode_4BYTE_KIND:
903 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
904 break;
905 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700906 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200907 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000908 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200909
910#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000911}
912
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300913static int
914ensure_unicode(PyObject *obj)
915{
916 if (!PyUnicode_Check(obj)) {
917 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200918 "must be str, not %.100s",
919 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300920 return -1;
921 }
922 return PyUnicode_READY(obj);
923}
924
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200925/* Compilation of templated routines */
926
Victor Stinner90ed8a62020-06-24 00:34:07 +0200927#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200928
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200929#include "stringlib/asciilib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
935#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/undef.h"
937
938#include "stringlib/ucs1lib.h"
939#include "stringlib/fastsearch.h"
940#include "stringlib/partition.h"
941#include "stringlib/split.h"
942#include "stringlib/count.h"
943#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300944#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200945#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/undef.h"
947
948#include "stringlib/ucs2lib.h"
949#include "stringlib/fastsearch.h"
950#include "stringlib/partition.h"
951#include "stringlib/split.h"
952#include "stringlib/count.h"
953#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300954#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200955#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200956#include "stringlib/undef.h"
957
958#include "stringlib/ucs4lib.h"
959#include "stringlib/fastsearch.h"
960#include "stringlib/partition.h"
961#include "stringlib/split.h"
962#include "stringlib/count.h"
963#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300964#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200965#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200966#include "stringlib/undef.h"
967
Inada Naoki2c4928d2020-06-17 20:09:44 +0900968_Py_COMP_DIAG_PUSH
969_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970#include "stringlib/unicodedefs.h"
971#include "stringlib/fastsearch.h"
972#include "stringlib/count.h"
973#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100974#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900975_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200976
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200977#undef STRINGLIB_GET_EMPTY
978
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979/* --- Unicode Object ----------------------------------------------------- */
980
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700981static inline Py_ssize_t
982findchar(const void *s, int kind,
983 Py_ssize_t size, Py_UCS4 ch,
984 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 switch (kind) {
987 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200988 if ((Py_UCS1) ch != ch)
989 return -1;
990 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600991 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600993 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200994 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200995 if ((Py_UCS2) ch != ch)
996 return -1;
997 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600998 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200999 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001000 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001001 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001003 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001004 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001006 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001007 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009}
1010
#ifdef Py_DEBUG
/* Overwrite the characters between `old_length` and the current length of
   the string with 0xff bytes, so that bugs are detected earlier.  Nothing is
   done if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030static PyObject*
1031resize_compact(PyObject *unicode, Py_ssize_t length)
1032{
1033 Py_ssize_t char_size;
1034 Py_ssize_t struct_size;
1035 Py_ssize_t new_size;
1036 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001037 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001038#ifdef Py_DEBUG
1039 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1040#endif
1041
Victor Stinner79891572012-05-03 13:43:07 +02001042 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001044 assert(PyUnicode_IS_COMPACT(unicode));
1045
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001046 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001047 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001048 struct_size = sizeof(PyASCIIObject);
1049 else
1050 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001051 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1054 PyErr_NoMemory();
1055 return NULL;
1056 }
1057 new_size = (struct_size + (length + 1) * char_size);
1058
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001059 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1060 PyObject_DEL(_PyUnicode_UTF8(unicode));
1061 _PyUnicode_UTF8(unicode) = NULL;
1062 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1063 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001064#ifdef Py_REF_DEBUG
1065 _Py_RefTotal--;
1066#endif
1067#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001068 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001069#endif
Victor Stinner84def372011-12-11 20:04:56 +01001070
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001071 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001072 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001073 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 PyErr_NoMemory();
1075 return NULL;
1076 }
Victor Stinner84def372011-12-11 20:04:56 +01001077 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001079
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001081 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001083 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 _PyUnicode_WSTR_LENGTH(unicode) = length;
1085 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001086 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1087 PyObject_DEL(_PyUnicode_WSTR(unicode));
1088 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001089 if (!PyUnicode_IS_ASCII(unicode))
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001091 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001092#ifdef Py_DEBUG
1093 unicode_fill_invalid(unicode, old_length);
1094#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1096 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098 return unicode;
1099}
1100
Alexander Belopolsky40018472011-02-26 01:02:56 +00001101static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001102resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103{
Victor Stinner95663112011-10-04 01:03:50 +02001104 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001108
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109 if (PyUnicode_IS_READY(unicode)) {
1110 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001113#ifdef Py_DEBUG
1114 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1115#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116
1117 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001118 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1120 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121
1122 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1123 PyErr_NoMemory();
1124 return -1;
1125 }
1126 new_size = (length + 1) * char_size;
1127
Victor Stinner7a9105a2011-12-12 00:13:42 +01001128 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1129 {
1130 PyObject_DEL(_PyUnicode_UTF8(unicode));
1131 _PyUnicode_UTF8(unicode) = NULL;
1132 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1133 }
1134
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 data = (PyObject *)PyObject_REALLOC(data, new_size);
1136 if (data == NULL) {
1137 PyErr_NoMemory();
1138 return -1;
1139 }
1140 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001141 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001142 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001143 _PyUnicode_WSTR_LENGTH(unicode) = length;
1144 }
1145 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001146 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001147 _PyUnicode_UTF8_LENGTH(unicode) = length;
1148 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 _PyUnicode_LENGTH(unicode) = length;
1150 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001151#ifdef Py_DEBUG
1152 unicode_fill_invalid(unicode, old_length);
1153#endif
Victor Stinner95663112011-10-04 01:03:50 +02001154 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001155 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001158 }
Victor Stinner95663112011-10-04 01:03:50 +02001159 assert(_PyUnicode_WSTR(unicode) != NULL);
1160
1161 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001162 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001163 PyErr_NoMemory();
1164 return -1;
1165 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001166 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001167 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001169 if (!wstr) {
1170 PyErr_NoMemory();
1171 return -1;
1172 }
1173 _PyUnicode_WSTR(unicode) = wstr;
1174 _PyUnicode_WSTR(unicode)[length] = 0;
1175 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001176 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 return 0;
1178}
1179
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180static PyObject*
1181resize_copy(PyObject *unicode, Py_ssize_t length)
1182{
1183 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001184 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001187 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188
1189 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1190 if (copy == NULL)
1191 return NULL;
1192
1193 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001194 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001196 }
1197 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001198 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001199
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001200 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 if (w == NULL)
1202 return NULL;
1203 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1204 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001205 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001206 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001207 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 }
1209}
1210
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001212 Ux0000 terminated; some code (e.g. new_identifier)
1213 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214
1215 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218*/
1219
Alexander Belopolsky40018472011-02-26 01:02:56 +00001220static PyUnicodeObject *
1221_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001223 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225
Thomas Wouters477c8d52006-05-27 19:21:47 +00001226 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001227 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001228 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 }
1230
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001231 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001232 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001233 return (PyUnicodeObject *)PyErr_NoMemory();
1234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 if (length < 0) {
1236 PyErr_SetString(PyExc_SystemError,
1237 "Negative size passed to _PyUnicode_New");
1238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 }
1240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1242 if (unicode == NULL)
1243 return NULL;
1244 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001245
1246 _PyUnicode_WSTR_LENGTH(unicode) = length;
1247 _PyUnicode_HASH(unicode) = -1;
1248 _PyUnicode_STATE(unicode).interned = 0;
1249 _PyUnicode_STATE(unicode).kind = 0;
1250 _PyUnicode_STATE(unicode).compact = 0;
1251 _PyUnicode_STATE(unicode).ready = 0;
1252 _PyUnicode_STATE(unicode).ascii = 0;
1253 _PyUnicode_DATA_ANY(unicode) = NULL;
1254 _PyUnicode_LENGTH(unicode) = 0;
1255 _PyUnicode_UTF8(unicode) = NULL;
1256 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1259 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001260 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001261 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001262 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264
Jeremy Hyltond8082792003-09-16 19:41:39 +00001265 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001266 * the caller fails before initializing str -- unicode_resize()
1267 * reads str[0], and the Keep-Alive optimization can keep memory
1268 * allocated for str alive across a call to unicode_dealloc(unicode).
1269 * We don't want unicode_resize to read uninitialized memory in
1270 * that case.
1271 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 _PyUnicode_WSTR(unicode)[0] = 0;
1273 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001274
Victor Stinner7931d9a2011-11-04 00:22:48 +01001275 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 return unicode;
1277}
1278
Victor Stinnerf42dc442011-10-02 23:33:16 +02001279static const char*
1280unicode_kind_name(PyObject *unicode)
1281{
Victor Stinner42dfd712011-10-03 14:41:45 +02001282 /* don't check consistency: unicode_kind_name() is called from
1283 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 if (!PyUnicode_IS_COMPACT(unicode))
1285 {
1286 if (!PyUnicode_IS_READY(unicode))
1287 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001288 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 {
1290 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001291 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001292 return "legacy ascii";
1293 else
1294 return "legacy latin1";
1295 case PyUnicode_2BYTE_KIND:
1296 return "legacy UCS2";
1297 case PyUnicode_4BYTE_KIND:
1298 return "legacy UCS4";
1299 default:
1300 return "<legacy invalid kind>";
1301 }
1302 }
1303 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001304 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001305 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001306 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001307 return "ascii";
1308 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001309 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001310 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001311 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001312 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001313 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001314 default:
1315 return "<invalid compact kind>";
1316 }
1317}
1318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001321const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001322 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001323 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324}
1325
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001326const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001327 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 return _PyUnicode_COMPACT_DATA(unicode);
1329}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001330const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001331 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001332 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1334 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1335 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1336 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1337 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1338 return PyUnicode_DATA(unicode);
1339}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001340
1341void
1342_PyUnicode_Dump(PyObject *op)
1343{
1344 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001345 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1346 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001347 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001348
Victor Stinnera849a4b2011-10-03 12:12:11 +02001349 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001350 {
1351 if (ascii->state.ascii)
1352 data = (ascii + 1);
1353 else
1354 data = (compact + 1);
1355 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001356 else
1357 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001358 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001359
Victor Stinnera849a4b2011-10-03 12:12:11 +02001360 if (ascii->wstr == data)
1361 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001362 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001363
Victor Stinnera3b334d2011-10-03 13:53:37 +02001364 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001365 printf(" (%zu), ", compact->wstr_length);
1366 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001367 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001368 }
1369 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001370 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001371 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001372}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373#endif
1374
Victor Stinner91698d82020-06-25 14:07:40 +02001375static int
1376unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1377{
1378 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1379 // optimized to always use state->empty_string without having to check if
1380 // it is NULL or not.
1381 PyObject *empty = PyUnicode_New(1, 0);
1382 if (empty == NULL) {
1383 return -1;
1384 }
1385 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1386 _PyUnicode_LENGTH(empty) = 0;
1387 assert(_PyUnicode_CheckConsistency(empty, 1));
1388
1389 assert(state->empty_string == NULL);
1390 state->empty_string = empty;
1391 return 0;
1392}
1393
1394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395PyObject *
1396PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1397{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001398 /* Optimization for empty strings */
1399 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001400 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001401 }
1402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 PyObject *obj;
1404 PyCompactUnicodeObject *unicode;
1405 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001406 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001407 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 Py_ssize_t char_size;
1409 Py_ssize_t struct_size;
1410
Victor Stinner9e9d6892011-10-04 01:02:02 +02001411 is_ascii = 0;
1412 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 struct_size = sizeof(PyCompactUnicodeObject);
1414 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001415 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 char_size = 1;
1417 is_ascii = 1;
1418 struct_size = sizeof(PyASCIIObject);
1419 }
1420 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001421 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 char_size = 1;
1423 }
1424 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001425 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 char_size = 2;
1427 if (sizeof(wchar_t) == 2)
1428 is_sharing = 1;
1429 }
1430 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001431 if (maxchar > MAX_UNICODE) {
1432 PyErr_SetString(PyExc_SystemError,
1433 "invalid maximum character passed to PyUnicode_New");
1434 return NULL;
1435 }
Victor Stinner8f825062012-04-27 13:55:39 +02001436 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 char_size = 4;
1438 if (sizeof(wchar_t) == 4)
1439 is_sharing = 1;
1440 }
1441
1442 /* Ensure we won't overflow the size. */
1443 if (size < 0) {
1444 PyErr_SetString(PyExc_SystemError,
1445 "Negative size passed to PyUnicode_New");
1446 return NULL;
1447 }
1448 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1449 return PyErr_NoMemory();
1450
1451 /* Duplicated allocation code from _PyObject_New() instead of a call to
1452 * PyObject_New() so we are able to allocate space for the object and
1453 * it's data buffer.
1454 */
1455 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001456 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001458 }
1459 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460
1461 unicode = (PyCompactUnicodeObject *)obj;
1462 if (is_ascii)
1463 data = ((PyASCIIObject*)obj) + 1;
1464 else
1465 data = unicode + 1;
1466 _PyUnicode_LENGTH(unicode) = size;
1467 _PyUnicode_HASH(unicode) = -1;
1468 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001469 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 _PyUnicode_STATE(unicode).compact = 1;
1471 _PyUnicode_STATE(unicode).ready = 1;
1472 _PyUnicode_STATE(unicode).ascii = is_ascii;
1473 if (is_ascii) {
1474 ((char*)data)[size] = 0;
1475 _PyUnicode_WSTR(unicode) = NULL;
1476 }
Victor Stinner8f825062012-04-27 13:55:39 +02001477 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 ((char*)data)[size] = 0;
1479 _PyUnicode_WSTR(unicode) = NULL;
1480 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001482 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 else {
1485 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001486 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001487 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001489 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 ((Py_UCS4*)data)[size] = 0;
1491 if (is_sharing) {
1492 _PyUnicode_WSTR_LENGTH(unicode) = size;
1493 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1494 }
1495 else {
1496 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1497 _PyUnicode_WSTR(unicode) = NULL;
1498 }
1499 }
Victor Stinner8f825062012-04-27 13:55:39 +02001500#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001501 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001502#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001503 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 return obj;
1505}
1506
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store it in the
   4-byte data buffer of `unicode`. A high surrogate immediately followed by
   a low surrogate is joined into one code point; every other unit,
   including a lone surrogate, is copied as is. The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* There must still be room for the code point about to be written. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* The output must be filled exactly up to the string length. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1546
Victor Stinnercd9950f2011-10-02 00:34:53 +02001547static int
Victor Stinner488fa492011-12-12 00:01:39 +01001548unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001549{
Victor Stinner488fa492011-12-12 00:01:39 +01001550 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001551 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001552 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001553 return -1;
1554 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001555 return 0;
1556}
1557
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558static int
1559_copy_characters(PyObject *to, Py_ssize_t to_start,
1560 PyObject *from, Py_ssize_t from_start,
1561 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001563 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001564 const void *from_data;
1565 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566
Victor Stinneree4544c2012-05-09 22:24:08 +02001567 assert(0 <= how_many);
1568 assert(0 <= from_start);
1569 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001570 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001571 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001572 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573
Victor Stinnerd3f08822012-05-29 12:57:52 +02001574 assert(PyUnicode_Check(to));
1575 assert(PyUnicode_IS_READY(to));
1576 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1577
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001578 if (how_many == 0)
1579 return 0;
1580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001582 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001584 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585
Victor Stinnerf1852262012-06-16 16:38:26 +02001586#ifdef Py_DEBUG
1587 if (!check_maxchar
1588 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1589 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001590 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001591 Py_UCS4 ch;
1592 Py_ssize_t i;
1593 for (i=0; i < how_many; i++) {
1594 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1595 assert(ch <= to_maxchar);
1596 }
1597 }
1598#endif
1599
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001600 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001601 if (check_maxchar
1602 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1603 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001604 /* Writing Latin-1 characters into an ASCII string requires to
1605 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001606 Py_UCS4 max_char;
1607 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001608 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001609 if (max_char >= 128)
1610 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001611 }
Christian Heimesf051e432016-09-13 20:22:02 +02001612 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001613 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001614 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001615 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001616 else if (from_kind == PyUnicode_1BYTE_KIND
1617 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001618 {
1619 _PyUnicode_CONVERT_BYTES(
1620 Py_UCS1, Py_UCS2,
1621 PyUnicode_1BYTE_DATA(from) + from_start,
1622 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1623 PyUnicode_2BYTE_DATA(to) + to_start
1624 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001625 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001626 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001627 && to_kind == PyUnicode_4BYTE_KIND)
1628 {
1629 _PyUnicode_CONVERT_BYTES(
1630 Py_UCS1, Py_UCS4,
1631 PyUnicode_1BYTE_DATA(from) + from_start,
1632 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1633 PyUnicode_4BYTE_DATA(to) + to_start
1634 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001635 }
1636 else if (from_kind == PyUnicode_2BYTE_KIND
1637 && to_kind == PyUnicode_4BYTE_KIND)
1638 {
1639 _PyUnicode_CONVERT_BYTES(
1640 Py_UCS2, Py_UCS4,
1641 PyUnicode_2BYTE_DATA(from) + from_start,
1642 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1643 PyUnicode_4BYTE_DATA(to) + to_start
1644 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001645 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001646 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001647 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1648
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001649 if (!check_maxchar) {
1650 if (from_kind == PyUnicode_2BYTE_KIND
1651 && to_kind == PyUnicode_1BYTE_KIND)
1652 {
1653 _PyUnicode_CONVERT_BYTES(
1654 Py_UCS2, Py_UCS1,
1655 PyUnicode_2BYTE_DATA(from) + from_start,
1656 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1657 PyUnicode_1BYTE_DATA(to) + to_start
1658 );
1659 }
1660 else if (from_kind == PyUnicode_4BYTE_KIND
1661 && to_kind == PyUnicode_1BYTE_KIND)
1662 {
1663 _PyUnicode_CONVERT_BYTES(
1664 Py_UCS4, Py_UCS1,
1665 PyUnicode_4BYTE_DATA(from) + from_start,
1666 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1667 PyUnicode_1BYTE_DATA(to) + to_start
1668 );
1669 }
1670 else if (from_kind == PyUnicode_4BYTE_KIND
1671 && to_kind == PyUnicode_2BYTE_KIND)
1672 {
1673 _PyUnicode_CONVERT_BYTES(
1674 Py_UCS4, Py_UCS2,
1675 PyUnicode_4BYTE_DATA(from) + from_start,
1676 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1677 PyUnicode_2BYTE_DATA(to) + to_start
1678 );
1679 }
1680 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001681 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001682 }
1683 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001684 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001685 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001686 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001687 Py_ssize_t i;
1688
Victor Stinnera0702ab2011-09-29 14:14:38 +02001689 for (i=0; i < how_many; i++) {
1690 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001691 if (ch > to_maxchar)
1692 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001693 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1694 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001695 }
1696 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 return 0;
1698}
1699
Victor Stinnerd3f08822012-05-29 12:57:52 +02001700void
1701_PyUnicode_FastCopyCharacters(
1702 PyObject *to, Py_ssize_t to_start,
1703 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001704{
1705 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1706}
1707
1708Py_ssize_t
1709PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1710 PyObject *from, Py_ssize_t from_start,
1711 Py_ssize_t how_many)
1712{
1713 int err;
1714
1715 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1716 PyErr_BadInternalCall();
1717 return -1;
1718 }
1719
Benjamin Petersonbac79492012-01-14 13:34:47 -05001720 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001721 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001722 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001723 return -1;
1724
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001725 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001726 PyErr_SetString(PyExc_IndexError, "string index out of range");
1727 return -1;
1728 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001729 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001730 PyErr_SetString(PyExc_IndexError, "string index out of range");
1731 return -1;
1732 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001733 if (how_many < 0) {
1734 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1735 return -1;
1736 }
1737 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001738 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1739 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001740 "Cannot write %zi characters at %zi "
1741 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001742 how_many, to_start, PyUnicode_GET_LENGTH(to));
1743 return -1;
1744 }
1745
1746 if (how_many == 0)
1747 return 0;
1748
Victor Stinner488fa492011-12-12 00:01:39 +01001749 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001750 return -1;
1751
1752 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1753 if (err) {
1754 PyErr_Format(PyExc_SystemError,
1755 "Cannot copy %s characters "
1756 "into a string of %s characters",
1757 unicode_kind_name(from),
1758 unicode_kind_name(to));
1759 return -1;
1760 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001761 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762}
1763
Victor Stinner17222162011-09-28 22:15:37 +02001764/* Find the maximum code point and count the number of surrogate pairs so a
1765 correct string length can be computed before converting a string to UCS4.
1766 This function counts single surrogates as a character and not as a pair.
1767
1768 Return 0 on success, or -1 on error. */
1769static int
1770find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1771 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772{
1773 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001774 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775
Victor Stinnerc53be962011-10-02 21:33:54 +02001776 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 *num_surrogates = 0;
1778 *maxchar = 0;
1779
1780 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001782 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1783 && (iter+1) < end
1784 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1785 {
1786 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1787 ++(*num_surrogates);
1788 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 }
1790 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001792 {
1793 ch = *iter;
1794 iter++;
1795 }
1796 if (ch > *maxchar) {
1797 *maxchar = ch;
1798 if (*maxchar > MAX_UNICODE) {
1799 PyErr_Format(PyExc_ValueError,
1800 "character U+%x is not in range [U+0000; U+10ffff]",
1801 ch);
1802 return -1;
1803 }
1804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 }
1806 return 0;
1807}
1808
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001809int
1810_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811{
1812 wchar_t *end;
1813 Py_UCS4 maxchar = 0;
1814 Py_ssize_t num_surrogates;
1815#if SIZEOF_WCHAR_T == 2
1816 Py_ssize_t length_wo_surrogates;
1817#endif
1818
Georg Brandl7597add2011-10-05 16:36:47 +02001819 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 strings were created using _PyObject_New() and where no canonical
1821 representation (the str field) has been set yet aka strings
1822 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001823 assert(_PyUnicode_CHECK(unicode));
1824 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001826 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001827 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001828 /* Actually, it should neither be interned nor be anything else: */
1829 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001832 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001833 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835
1836 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1838 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 PyErr_NoMemory();
1840 return -1;
1841 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001842 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843 _PyUnicode_WSTR(unicode), end,
1844 PyUnicode_1BYTE_DATA(unicode));
1845 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1846 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1847 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1848 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001849 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001850 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001851 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001852 }
1853 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001854 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001855 _PyUnicode_UTF8(unicode) = NULL;
1856 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 }
1858 PyObject_FREE(_PyUnicode_WSTR(unicode));
1859 _PyUnicode_WSTR(unicode) = NULL;
1860 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1861 }
1862 /* In this case we might have to convert down from 4-byte native
1863 wchar_t to 2-byte unicode. */
1864 else if (maxchar < 65536) {
1865 assert(num_surrogates == 0 &&
1866 "FindMaxCharAndNumSurrogatePairs() messed up");
1867
Victor Stinner506f5922011-09-28 22:34:18 +02001868#if SIZEOF_WCHAR_T == 2
1869 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001870 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001871 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1872 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1873 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001874 _PyUnicode_UTF8(unicode) = NULL;
1875 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001876#else
1877 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001878 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001879 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001880 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001881 PyErr_NoMemory();
1882 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 }
Victor Stinner506f5922011-09-28 22:34:18 +02001884 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1885 _PyUnicode_WSTR(unicode), end,
1886 PyUnicode_2BYTE_DATA(unicode));
1887 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1888 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1889 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001890 _PyUnicode_UTF8(unicode) = NULL;
1891 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001892 PyObject_FREE(_PyUnicode_WSTR(unicode));
1893 _PyUnicode_WSTR(unicode) = NULL;
1894 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1895#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 }
1897 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1898 else {
1899#if SIZEOF_WCHAR_T == 2
1900 /* in case the native representation is 2-bytes, we need to allocate a
1901 new normalized 4-byte version. */
1902 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001903 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1904 PyErr_NoMemory();
1905 return -1;
1906 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001907 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1908 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 PyErr_NoMemory();
1910 return -1;
1911 }
1912 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1913 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001914 _PyUnicode_UTF8(unicode) = NULL;
1915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001916 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1917 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001918 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 PyObject_FREE(_PyUnicode_WSTR(unicode));
1920 _PyUnicode_WSTR(unicode) = NULL;
1921 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1922#else
1923 assert(num_surrogates == 0);
1924
Victor Stinnerc3c74152011-10-02 20:39:55 +02001925 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001927 _PyUnicode_UTF8(unicode) = NULL;
1928 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1930#endif
1931 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1932 }
1933 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001934 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001935 return 0;
1936}
1937
Alexander Belopolsky40018472011-02-26 01:02:56 +00001938static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001939unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940{
Walter Dörwald16807132007-05-25 13:52:07 +00001941 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 case SSTATE_NOT_INTERNED:
1943 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001944
Benjamin Peterson29060642009-01-31 22:14:21 +00001945 case SSTATE_INTERNED_MORTAL:
1946 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001947 Py_SET_REFCNT(unicode, 3);
Victor Stinner607b1022020-05-05 18:50:30 +02001948#ifdef INTERNED_STRINGS
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001949 if (PyDict_DelItem(interned, unicode) != 0) {
1950 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1951 NULL);
1952 }
Victor Stinner607b1022020-05-05 18:50:30 +02001953#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001954 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001955
Benjamin Peterson29060642009-01-31 22:14:21 +00001956 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001957 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1958 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001959
Benjamin Peterson29060642009-01-31 22:14:21 +00001960 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001961 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001962 }
1963
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001965 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001966 }
1967 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001968 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001969 }
1970 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001971 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001974 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975}
1976
#ifdef Py_DEBUG
/* Return 1 if `unicode` is the empty string singleton or one of the cached
   single-character strings in state->latin1, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;

    if (unicode == state->empty_string) {
        return 1;
    }

    /* Only a one-character string that is not of the legacy wchar kind is
       looked up in the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && state->latin1[ch] == unicode);
}
#endif
1996
Alexander Belopolsky40018472011-02-26 01:02:56 +00001997static int
Victor Stinner488fa492011-12-12 00:01:39 +01001998unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999{
Victor Stinner488fa492011-12-12 00:01:39 +01002000 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002001 if (Py_REFCNT(unicode) != 1)
2002 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002003 if (_PyUnicode_HASH(unicode) != -1)
2004 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002005 if (PyUnicode_CHECK_INTERNED(unicode))
2006 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002007 if (!PyUnicode_CheckExact(unicode))
2008 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002009#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002010 /* singleton refcount is greater than 1 */
2011 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002012#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002013 return 1;
2014}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002015
Victor Stinnerfe226c02011-10-03 03:52:20 +02002016static int
2017unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2018{
2019 PyObject *unicode;
2020 Py_ssize_t old_length;
2021
2022 assert(p_unicode != NULL);
2023 unicode = *p_unicode;
2024
2025 assert(unicode != NULL);
2026 assert(PyUnicode_Check(unicode));
2027 assert(0 <= length);
2028
Victor Stinner910337b2011-10-03 03:20:16 +02002029 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002030 old_length = PyUnicode_WSTR_LENGTH(unicode);
2031 else
2032 old_length = PyUnicode_GET_LENGTH(unicode);
2033 if (old_length == length)
2034 return 0;
2035
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002036 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002037 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002038 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002039 return 0;
2040 }
2041
Victor Stinner488fa492011-12-12 00:01:39 +01002042 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002043 PyObject *copy = resize_copy(unicode, length);
2044 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002045 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002046 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002047 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002048 }
2049
Victor Stinnerfe226c02011-10-03 03:52:20 +02002050 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002051 PyObject *new_unicode = resize_compact(unicode, length);
2052 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002053 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002054 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002055 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002056 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002057 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002058}
2059
Alexander Belopolsky40018472011-02-26 01:02:56 +00002060int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002061PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002062{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002063 PyObject *unicode;
2064 if (p_unicode == NULL) {
2065 PyErr_BadInternalCall();
2066 return -1;
2067 }
2068 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002069 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002070 {
2071 PyErr_BadInternalCall();
2072 return -1;
2073 }
2074 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002075}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002076
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002077/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002078
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002079 WARNING: The function doesn't copy the terminating null character and
2080 doesn't check the maximum character (may write a latin1 character in an
2081 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002082static void
2083unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2084 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002085{
2086 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002087 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002088 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002089
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002090 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002091 switch (kind) {
2092 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002093#ifdef Py_DEBUG
2094 if (PyUnicode_IS_ASCII(unicode)) {
2095 Py_UCS4 maxchar = ucs1lib_find_max_char(
2096 (const Py_UCS1*)str,
2097 (const Py_UCS1*)str + len);
2098 assert(maxchar < 128);
2099 }
2100#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002101 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002102 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002103 }
2104 case PyUnicode_2BYTE_KIND: {
2105 Py_UCS2 *start = (Py_UCS2 *)data + index;
2106 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002107
Victor Stinner184252a2012-06-16 02:57:41 +02002108 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002109 *ucs2 = (Py_UCS2)*str;
2110
2111 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002112 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002113 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002114 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002115 Py_UCS4 *start = (Py_UCS4 *)data + index;
2116 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002117
Victor Stinner184252a2012-06-16 02:57:41 +02002118 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002119 *ucs4 = (Py_UCS4)*str;
2120
2121 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002122 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002123 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002124 default:
2125 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002126 }
2127}
2128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002130get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002132 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002133
Victor Stinner2f9ada92020-06-24 02:22:21 +02002134 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002135 if (unicode) {
2136 Py_INCREF(unicode);
2137 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 }
Victor Stinner607b1022020-05-05 18:50:30 +02002139
2140 unicode = PyUnicode_New(1, ch);
2141 if (!unicode) {
2142 return NULL;
2143 }
2144
2145 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2146 assert(_PyUnicode_CheckConsistency(unicode, 1));
2147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002149 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002150 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151}
2152
Victor Stinner985a82a2014-01-03 12:53:47 +01002153static PyObject*
2154unicode_char(Py_UCS4 ch)
2155{
2156 PyObject *unicode;
2157
2158 assert(ch <= MAX_UNICODE);
2159
Victor Stinner2f9ada92020-06-24 02:22:21 +02002160 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002161 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002162 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002163
Victor Stinner985a82a2014-01-03 12:53:47 +01002164 unicode = PyUnicode_New(1, ch);
2165 if (unicode == NULL)
2166 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002167
2168 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2169 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002170 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002171 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002172 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2173 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2174 }
2175 assert(_PyUnicode_CheckConsistency(unicode, 1));
2176 return unicode;
2177}
2178
Alexander Belopolsky40018472011-02-26 01:02:56 +00002179PyObject *
2180PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002182 if (u == NULL) {
2183 if (size > 0) {
2184 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2185 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2186 "use PyUnicode_New() instead", 1) < 0) {
2187 return NULL;
2188 }
2189 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002190 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002191 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002192
2193 if (size < 0) {
2194 PyErr_BadInternalCall();
2195 return NULL;
2196 }
2197
2198 return PyUnicode_FromWideChar(u, size);
2199}
2200
2201PyObject *
2202PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2203{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002204 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 Py_UCS4 maxchar = 0;
2206 Py_ssize_t num_surrogates;
2207
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002208 if (u == NULL && size != 0) {
2209 PyErr_BadInternalCall();
2210 return NULL;
2211 }
2212
2213 if (size == -1) {
2214 size = wcslen(u);
2215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002217 /* If the Unicode data is known at construction time, we can apply
2218 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002221 if (size == 0)
2222 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 /* Single character Unicode objects in the Latin-1 range are
2225 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002226 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 return get_latin1_char((unsigned char)*u);
2228
2229 /* If not empty and not single character, copy the Unicode data
2230 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002231 if (find_maxchar_surrogates(u, u + size,
2232 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 return NULL;
2234
Victor Stinner8faf8212011-12-08 22:14:11 +01002235 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 if (!unicode)
2237 return NULL;
2238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 switch (PyUnicode_KIND(unicode)) {
2240 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002241 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2243 break;
2244 case PyUnicode_2BYTE_KIND:
2245#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002246 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002248 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2250#endif
2251 break;
2252 case PyUnicode_4BYTE_KIND:
2253#if SIZEOF_WCHAR_T == 2
2254 /* This is the only case which has to process surrogates, thus
2255 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002256 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257#else
2258 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002259 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260#endif
2261 break;
2262 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002263 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002266 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267}
2268
Alexander Belopolsky40018472011-02-26 01:02:56 +00002269PyObject *
2270PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002271{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002272 if (size < 0) {
2273 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002274 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002275 return NULL;
2276 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002277 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002278 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002279 }
2280 else {
2281 if (size > 0) {
2282 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2283 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2284 "use PyUnicode_New() instead", 1) < 0) {
2285 return NULL;
2286 }
2287 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002288 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002289 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002290}
2291
Alexander Belopolsky40018472011-02-26 01:02:56 +00002292PyObject *
2293PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002294{
2295 size_t size = strlen(u);
2296 if (size > PY_SSIZE_T_MAX) {
2297 PyErr_SetString(PyExc_OverflowError, "input too long");
2298 return NULL;
2299 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002300 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002301}
2302
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002303PyObject *
2304_PyUnicode_FromId(_Py_Identifier *id)
2305{
Victor Stinner297257f2020-06-02 14:39:45 +02002306 if (id->object) {
2307 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002308 }
Victor Stinner297257f2020-06-02 14:39:45 +02002309
2310 PyObject *obj;
2311 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2312 strlen(id->string),
2313 NULL, NULL);
2314 if (!obj) {
2315 return NULL;
2316 }
2317 PyUnicode_InternInPlace(&obj);
2318
2319 assert(!id->next);
2320 id->object = obj;
2321 id->next = static_strings;
2322 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002323 return id->object;
2324}
2325
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002326static void
2327unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002328{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002329 _Py_Identifier *tmp, *s = static_strings;
2330 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002331 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002332 tmp = s->next;
2333 s->next = NULL;
2334 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002335 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002336 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002337}
2338
Benjamin Peterson0df54292012-03-26 14:50:32 -04002339/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002340
Victor Stinnerd3f08822012-05-29 12:57:52 +02002341PyObject*
2342_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002343{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002344 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002345 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002346 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002347#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002348 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002349#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002350 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002351 }
Victor Stinner785938e2011-12-11 20:09:03 +01002352 unicode = PyUnicode_New(size, 127);
2353 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002354 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002355 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2356 assert(_PyUnicode_CheckConsistency(unicode, 1));
2357 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002358}
2359
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002360static Py_UCS4
2361kind_maxchar_limit(unsigned int kind)
2362{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002363 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002364 case PyUnicode_1BYTE_KIND:
2365 return 0x80;
2366 case PyUnicode_2BYTE_KIND:
2367 return 0x100;
2368 case PyUnicode_4BYTE_KIND:
2369 return 0x10000;
2370 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002371 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002372 }
2373}
2374
Victor Stinner702c7342011-10-05 13:50:52 +02002375static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002376_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002377{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002379 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002380
Victor Stinner2f9ada92020-06-24 02:22:21 +02002381 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002382 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002383 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002384 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002385 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002386 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002387 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002388
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002389 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002390 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 if (!res)
2392 return NULL;
2393 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002394 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002396}
2397
Victor Stinnere57b1c02011-09-28 22:20:48 +02002398static PyObject*
2399_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400{
2401 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002402 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002403
Serhiy Storchaka678db842013-01-26 12:16:36 +02002404 if (size == 0)
2405 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002406 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002407 if (size == 1)
2408 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002409
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002410 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002411 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 if (!res)
2413 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002414 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002416 else {
2417 _PyUnicode_CONVERT_BYTES(
2418 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2419 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002420 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 return res;
2422}
2423
Victor Stinnere57b1c02011-09-28 22:20:48 +02002424static PyObject*
2425_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426{
2427 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002428 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002429
Serhiy Storchaka678db842013-01-26 12:16:36 +02002430 if (size == 0)
2431 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002432 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002433 if (size == 1)
2434 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002435
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002436 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002437 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 if (!res)
2439 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002440 if (max_char < 256)
2441 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2442 PyUnicode_1BYTE_DATA(res));
2443 else if (max_char < 0x10000)
2444 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2445 PyUnicode_2BYTE_DATA(res));
2446 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002448 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 return res;
2450}
2451
2452PyObject*
2453PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2454{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002455 if (size < 0) {
2456 PyErr_SetString(PyExc_ValueError, "size must be positive");
2457 return NULL;
2458 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002459 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002461 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002462 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002463 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002465 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002466 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002467 PyErr_SetString(PyExc_SystemError, "invalid kind");
2468 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002470}
2471
Victor Stinnerece58de2012-04-23 23:36:38 +02002472Py_UCS4
2473_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2474{
2475 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002476 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002477
2478 assert(PyUnicode_IS_READY(unicode));
2479 assert(0 <= start);
2480 assert(end <= PyUnicode_GET_LENGTH(unicode));
2481 assert(start <= end);
2482
2483 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2484 return PyUnicode_MAX_CHAR_VALUE(unicode);
2485
2486 if (start == end)
2487 return 127;
2488
Victor Stinner94d558b2012-04-27 22:26:58 +02002489 if (PyUnicode_IS_ASCII(unicode))
2490 return 127;
2491
Victor Stinnerece58de2012-04-23 23:36:38 +02002492 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002493 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002494 endptr = (char *)startptr + end * kind;
2495 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002496 switch(kind) {
2497 case PyUnicode_1BYTE_KIND:
2498 return ucs1lib_find_max_char(startptr, endptr);
2499 case PyUnicode_2BYTE_KIND:
2500 return ucs2lib_find_max_char(startptr, endptr);
2501 case PyUnicode_4BYTE_KIND:
2502 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002503 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002504 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002505 }
2506}
2507
Victor Stinner25a4b292011-10-06 12:31:55 +02002508/* Ensure that a string uses the most efficient storage, if it is not the
2509 case: create a new string with of the right kind. Write NULL into *p_unicode
2510 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002511static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002512unicode_adjust_maxchar(PyObject **p_unicode)
2513{
2514 PyObject *unicode, *copy;
2515 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002516 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002517 unsigned int kind;
2518
2519 assert(p_unicode != NULL);
2520 unicode = *p_unicode;
2521 assert(PyUnicode_IS_READY(unicode));
2522 if (PyUnicode_IS_ASCII(unicode))
2523 return;
2524
2525 len = PyUnicode_GET_LENGTH(unicode);
2526 kind = PyUnicode_KIND(unicode);
2527 if (kind == PyUnicode_1BYTE_KIND) {
2528 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002529 max_char = ucs1lib_find_max_char(u, u + len);
2530 if (max_char >= 128)
2531 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002532 }
2533 else if (kind == PyUnicode_2BYTE_KIND) {
2534 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002535 max_char = ucs2lib_find_max_char(u, u + len);
2536 if (max_char >= 256)
2537 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002538 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002539 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002540 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002541 max_char = ucs4lib_find_max_char(u, u + len);
2542 if (max_char >= 0x10000)
2543 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002544 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002545 else
2546 Py_UNREACHABLE();
2547
Victor Stinner25a4b292011-10-06 12:31:55 +02002548 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002549 if (copy != NULL)
2550 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002551 Py_DECREF(unicode);
2552 *p_unicode = copy;
2553}
2554
Victor Stinner034f6cf2011-09-30 02:26:44 +02002555PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002556_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002557{
Victor Stinner87af4f22011-11-21 23:03:47 +01002558 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002559 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002560
Victor Stinner034f6cf2011-09-30 02:26:44 +02002561 if (!PyUnicode_Check(unicode)) {
2562 PyErr_BadInternalCall();
2563 return NULL;
2564 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002565 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002566 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002567
Victor Stinner87af4f22011-11-21 23:03:47 +01002568 length = PyUnicode_GET_LENGTH(unicode);
2569 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002570 if (!copy)
2571 return NULL;
2572 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2573
Christian Heimesf051e432016-09-13 20:22:02 +02002574 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002575 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002576 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002577 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002578}
2579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580
Victor Stinnerbc603d12011-10-02 01:00:40 +02002581/* Widen Unicode objects to larger buffers. Don't write terminating null
2582 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002583
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002584static void*
2585unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002587 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002588
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002589 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002590 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002591 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002592 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002593 if (!result)
2594 return PyErr_NoMemory();
2595 assert(skind == PyUnicode_1BYTE_KIND);
2596 _PyUnicode_CONVERT_BYTES(
2597 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002598 (const Py_UCS1 *)data,
2599 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002600 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002602 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002603 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002604 if (!result)
2605 return PyErr_NoMemory();
2606 if (skind == PyUnicode_2BYTE_KIND) {
2607 _PyUnicode_CONVERT_BYTES(
2608 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002609 (const Py_UCS2 *)data,
2610 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002611 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002613 else {
2614 assert(skind == PyUnicode_1BYTE_KIND);
2615 _PyUnicode_CONVERT_BYTES(
2616 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002617 (const Py_UCS1 *)data,
2618 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002619 result);
2620 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002622 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002623 Py_UNREACHABLE();
2624 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626}
2627
2628static Py_UCS4*
2629as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2630 int copy_null)
2631{
2632 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002633 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 Py_ssize_t len, targetlen;
2635 if (PyUnicode_READY(string) == -1)
2636 return NULL;
2637 kind = PyUnicode_KIND(string);
2638 data = PyUnicode_DATA(string);
2639 len = PyUnicode_GET_LENGTH(string);
2640 targetlen = len;
2641 if (copy_null)
2642 targetlen++;
2643 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002644 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645 if (!target) {
2646 PyErr_NoMemory();
2647 return NULL;
2648 }
2649 }
2650 else {
2651 if (targetsize < targetlen) {
2652 PyErr_Format(PyExc_SystemError,
2653 "string is longer than the buffer");
2654 if (copy_null && 0 < targetsize)
2655 target[0] = 0;
2656 return NULL;
2657 }
2658 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002659 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002660 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002661 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002662 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002663 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002664 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002665 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2666 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002667 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002668 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002669 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002670 else {
2671 Py_UNREACHABLE();
2672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 if (copy_null)
2674 target[len] = 0;
2675 return target;
2676}
2677
2678Py_UCS4*
2679PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2680 int copy_null)
2681{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002682 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683 PyErr_BadInternalCall();
2684 return NULL;
2685 }
2686 return as_ucs4(string, target, targetsize, copy_null);
2687}
2688
2689Py_UCS4*
2690PyUnicode_AsUCS4Copy(PyObject *string)
2691{
2692 return as_ucs4(string, NULL, 0, 1);
2693}
2694
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG * 53 - 1) / 22) + 2)
Victor Stinner96865452011-03-01 23:44:09 +00002699
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002700static int
2701unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2702 Py_ssize_t width, Py_ssize_t precision)
2703{
2704 Py_ssize_t length, fill, arglen;
2705 Py_UCS4 maxchar;
2706
2707 if (PyUnicode_READY(str) == -1)
2708 return -1;
2709
2710 length = PyUnicode_GET_LENGTH(str);
2711 if ((precision == -1 || precision >= length)
2712 && width <= length)
2713 return _PyUnicodeWriter_WriteStr(writer, str);
2714
2715 if (precision != -1)
2716 length = Py_MIN(precision, length);
2717
2718 arglen = Py_MAX(length, width);
2719 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2720 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2721 else
2722 maxchar = writer->maxchar;
2723
2724 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2725 return -1;
2726
2727 if (width > length) {
2728 fill = width - length;
2729 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2730 return -1;
2731 writer->pos += fill;
2732 }
2733
2734 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2735 str, 0, length);
2736 writer->pos += length;
2737 return 0;
2738}
2739
2740static int
Victor Stinner998b8062018-09-12 00:23:25 +02002741unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002742 Py_ssize_t width, Py_ssize_t precision)
2743{
2744 /* UTF-8 */
2745 Py_ssize_t length;
2746 PyObject *unicode;
2747 int res;
2748
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002749 if (precision == -1) {
2750 length = strlen(str);
2751 }
2752 else {
2753 length = 0;
2754 while (length < precision && str[length]) {
2755 length++;
2756 }
2757 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002758 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2759 if (unicode == NULL)
2760 return -1;
2761
2762 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2763 Py_DECREF(unicode);
2764 return res;
2765}
2766
Victor Stinner96865452011-03-01 23:44:09 +00002767static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002768unicode_fromformat_arg(_PyUnicodeWriter *writer,
2769 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002770{
Victor Stinnere215d962012-10-06 23:03:36 +02002771 const char *p;
2772 Py_ssize_t len;
2773 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002774 Py_ssize_t width;
2775 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002776 int longflag;
2777 int longlongflag;
2778 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002779 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002780
2781 p = f;
2782 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002783 zeropad = 0;
2784 if (*f == '0') {
2785 zeropad = 1;
2786 f++;
2787 }
Victor Stinner96865452011-03-01 23:44:09 +00002788
2789 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 width = -1;
2791 if (Py_ISDIGIT((unsigned)*f)) {
2792 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002793 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002794 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002795 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002796 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002797 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002798 return NULL;
2799 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002800 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002801 f++;
2802 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002803 }
2804 precision = -1;
2805 if (*f == '.') {
2806 f++;
2807 if (Py_ISDIGIT((unsigned)*f)) {
2808 precision = (*f - '0');
2809 f++;
2810 while (Py_ISDIGIT((unsigned)*f)) {
2811 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2812 PyErr_SetString(PyExc_ValueError,
2813 "precision too big");
2814 return NULL;
2815 }
2816 precision = (precision * 10) + (*f - '0');
2817 f++;
2818 }
2819 }
Victor Stinner96865452011-03-01 23:44:09 +00002820 if (*f == '%') {
2821 /* "%.3%s" => f points to "3" */
2822 f--;
2823 }
2824 }
2825 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002826 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002827 f--;
2828 }
Victor Stinner96865452011-03-01 23:44:09 +00002829
2830 /* Handle %ld, %lu, %lld and %llu. */
2831 longflag = 0;
2832 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002833 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002834 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002835 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002836 longflag = 1;
2837 ++f;
2838 }
Victor Stinner96865452011-03-01 23:44:09 +00002839 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002840 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002841 longlongflag = 1;
2842 f += 2;
2843 }
Victor Stinner96865452011-03-01 23:44:09 +00002844 }
2845 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002846 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002847 size_tflag = 1;
2848 ++f;
2849 }
Victor Stinnere215d962012-10-06 23:03:36 +02002850
2851 if (f[1] == '\0')
2852 writer->overallocate = 0;
2853
2854 switch (*f) {
2855 case 'c':
2856 {
2857 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002858 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002859 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002860 "character argument not in range(0x110000)");
2861 return NULL;
2862 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002863 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002864 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002865 break;
2866 }
2867
2868 case 'i':
2869 case 'd':
2870 case 'u':
2871 case 'x':
2872 {
2873 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002874 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002875 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002876
2877 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002878 if (longflag) {
2879 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2880 }
2881 else if (longlongflag) {
2882 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2883 }
2884 else if (size_tflag) {
2885 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2886 }
2887 else {
2888 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2889 }
Victor Stinnere215d962012-10-06 23:03:36 +02002890 }
2891 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002892 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002893 }
2894 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002895 if (longflag) {
2896 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2897 }
2898 else if (longlongflag) {
2899 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2900 }
2901 else if (size_tflag) {
2902 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2903 }
2904 else {
2905 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2906 }
Victor Stinnere215d962012-10-06 23:03:36 +02002907 }
2908 assert(len >= 0);
2909
Victor Stinnere215d962012-10-06 23:03:36 +02002910 if (precision < len)
2911 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002912
2913 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002914 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2915 return NULL;
2916
Victor Stinnere215d962012-10-06 23:03:36 +02002917 if (width > precision) {
2918 Py_UCS4 fillchar;
2919 fill = width - precision;
2920 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002921 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2922 return NULL;
2923 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002924 }
Victor Stinner15a11362012-10-06 23:48:20 +02002925 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002926 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002927 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2928 return NULL;
2929 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002930 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002931
Victor Stinner4a587072013-11-19 12:54:53 +01002932 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2933 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002934 break;
2935 }
2936
2937 case 'p':
2938 {
2939 char number[MAX_LONG_LONG_CHARS];
2940
2941 len = sprintf(number, "%p", va_arg(*vargs, void*));
2942 assert(len >= 0);
2943
2944 /* %p is ill-defined: ensure leading 0x. */
2945 if (number[1] == 'X')
2946 number[1] = 'x';
2947 else if (number[1] != 'x') {
2948 memmove(number + 2, number,
2949 strlen(number) + 1);
2950 number[0] = '0';
2951 number[1] = 'x';
2952 len += 2;
2953 }
2954
Victor Stinner4a587072013-11-19 12:54:53 +01002955 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002956 return NULL;
2957 break;
2958 }
2959
2960 case 's':
2961 {
2962 /* UTF-8 */
2963 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002964 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002965 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002966 break;
2967 }
2968
2969 case 'U':
2970 {
2971 PyObject *obj = va_arg(*vargs, PyObject *);
2972 assert(obj && _PyUnicode_CHECK(obj));
2973
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002974 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002975 return NULL;
2976 break;
2977 }
2978
2979 case 'V':
2980 {
2981 PyObject *obj = va_arg(*vargs, PyObject *);
2982 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002983 if (obj) {
2984 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002985 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002986 return NULL;
2987 }
2988 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002989 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002990 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002991 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002992 }
2993 break;
2994 }
2995
2996 case 'S':
2997 {
2998 PyObject *obj = va_arg(*vargs, PyObject *);
2999 PyObject *str;
3000 assert(obj);
3001 str = PyObject_Str(obj);
3002 if (!str)
3003 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003004 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003005 Py_DECREF(str);
3006 return NULL;
3007 }
3008 Py_DECREF(str);
3009 break;
3010 }
3011
3012 case 'R':
3013 {
3014 PyObject *obj = va_arg(*vargs, PyObject *);
3015 PyObject *repr;
3016 assert(obj);
3017 repr = PyObject_Repr(obj);
3018 if (!repr)
3019 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003020 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003021 Py_DECREF(repr);
3022 return NULL;
3023 }
3024 Py_DECREF(repr);
3025 break;
3026 }
3027
3028 case 'A':
3029 {
3030 PyObject *obj = va_arg(*vargs, PyObject *);
3031 PyObject *ascii;
3032 assert(obj);
3033 ascii = PyObject_ASCII(obj);
3034 if (!ascii)
3035 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003036 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003037 Py_DECREF(ascii);
3038 return NULL;
3039 }
3040 Py_DECREF(ascii);
3041 break;
3042 }
3043
3044 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003045 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003046 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003047 break;
3048
3049 default:
3050 /* if we stumble upon an unknown formatting code, copy the rest
3051 of the format string to the output string. (we cannot just
3052 skip the code, since there's no way to know what's in the
3053 argument list) */
3054 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003055 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003056 return NULL;
3057 f = p+len;
3058 return f;
3059 }
3060
3061 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003062 return f;
3063}
3064
Walter Dörwaldd2034312007-05-18 16:29:38 +00003065PyObject *
3066PyUnicode_FromFormatV(const char *format, va_list vargs)
3067{
Victor Stinnere215d962012-10-06 23:03:36 +02003068 va_list vargs2;
3069 const char *f;
3070 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003071
Victor Stinner8f674cc2013-04-17 23:02:17 +02003072 _PyUnicodeWriter_Init(&writer);
3073 writer.min_length = strlen(format) + 100;
3074 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003075
Benjamin Peterson0c212142016-09-20 20:39:33 -07003076 // Copy varags to be able to pass a reference to a subfunction.
3077 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003078
3079 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003080 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003081 f = unicode_fromformat_arg(&writer, f, &vargs2);
3082 if (f == NULL)
3083 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003085 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003086 const char *p;
3087 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003088
Victor Stinnere215d962012-10-06 23:03:36 +02003089 p = f;
3090 do
3091 {
3092 if ((unsigned char)*p > 127) {
3093 PyErr_Format(PyExc_ValueError,
3094 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3095 "string, got a non-ASCII byte: 0x%02x",
3096 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003097 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003098 }
3099 p++;
3100 }
3101 while (*p != '\0' && *p != '%');
3102 len = p - f;
3103
3104 if (*p == '\0')
3105 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003106
3107 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003108 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003109
3110 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003111 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003112 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003113 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003114 return _PyUnicodeWriter_Finish(&writer);
3115
3116 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003117 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003118 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003119 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003120}
3121
Walter Dörwaldd2034312007-05-18 16:29:38 +00003122PyObject *
3123PyUnicode_FromFormat(const char *format, ...)
3124{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003125 PyObject* ret;
3126 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003127
3128#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003129 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003130#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003131 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003132#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003133 ret = PyUnicode_FromFormatV(format, vargs);
3134 va_end(vargs);
3135 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003136}
3137
Serhiy Storchakac46db922018-10-23 22:58:24 +03003138static Py_ssize_t
3139unicode_get_widechar_size(PyObject *unicode)
3140{
3141 Py_ssize_t res;
3142
3143 assert(unicode != NULL);
3144 assert(_PyUnicode_CHECK(unicode));
3145
3146 if (_PyUnicode_WSTR(unicode) != NULL) {
3147 return PyUnicode_WSTR_LENGTH(unicode);
3148 }
3149 assert(PyUnicode_IS_READY(unicode));
3150
3151 res = _PyUnicode_LENGTH(unicode);
3152#if SIZEOF_WCHAR_T == 2
3153 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3154 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3155 const Py_UCS4 *end = s + res;
3156 for (; s < end; ++s) {
3157 if (*s > 0xFFFF) {
3158 ++res;
3159 }
3160 }
3161 }
3162#endif
3163 return res;
3164}
3165
3166static void
3167unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3168{
3169 const wchar_t *wstr;
3170
3171 assert(unicode != NULL);
3172 assert(_PyUnicode_CHECK(unicode));
3173
3174 wstr = _PyUnicode_WSTR(unicode);
3175 if (wstr != NULL) {
3176 memcpy(w, wstr, size * sizeof(wchar_t));
3177 return;
3178 }
3179 assert(PyUnicode_IS_READY(unicode));
3180
3181 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3182 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3183 for (; size--; ++s, ++w) {
3184 *w = *s;
3185 }
3186 }
3187 else {
3188#if SIZEOF_WCHAR_T == 4
3189 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3190 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3191 for (; size--; ++s, ++w) {
3192 *w = *s;
3193 }
3194#else
3195 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3196 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3197 for (; size--; ++s, ++w) {
3198 Py_UCS4 ch = *s;
3199 if (ch > 0xFFFF) {
3200 assert(ch <= MAX_UNICODE);
3201 /* encode surrogate pair in this case */
3202 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3203 if (!size--)
3204 break;
3205 *w = Py_UNICODE_LOW_SURROGATE(ch);
3206 }
3207 else {
3208 *w = ch;
3209 }
3210 }
3211#endif
3212 }
3213}
3214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003215#ifdef HAVE_WCHAR_H
3216
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003217/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003218
Victor Stinnerd88d9832011-09-06 02:00:05 +02003219 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003220 character) required to convert the unicode object. Ignore size argument.
3221
Victor Stinnerd88d9832011-09-06 02:00:05 +02003222 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003223 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003224 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003225Py_ssize_t
3226PyUnicode_AsWideChar(PyObject *unicode,
3227 wchar_t *w,
3228 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003229{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003230 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003231
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003232 if (unicode == NULL) {
3233 PyErr_BadInternalCall();
3234 return -1;
3235 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003236 if (!PyUnicode_Check(unicode)) {
3237 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003238 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003239 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003240
3241 res = unicode_get_widechar_size(unicode);
3242 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003243 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003244 }
3245
3246 if (size > res) {
3247 size = res + 1;
3248 }
3249 else {
3250 res = size;
3251 }
3252 unicode_copy_as_widechar(unicode, w, size);
3253 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003254}
3255
Victor Stinner137c34c2010-09-29 10:25:54 +00003256wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003257PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003258 Py_ssize_t *size)
3259{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003260 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003261 Py_ssize_t buflen;
3262
3263 if (unicode == NULL) {
3264 PyErr_BadInternalCall();
3265 return NULL;
3266 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003267 if (!PyUnicode_Check(unicode)) {
3268 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003269 return NULL;
3270 }
3271
Serhiy Storchakac46db922018-10-23 22:58:24 +03003272 buflen = unicode_get_widechar_size(unicode);
3273 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003274 if (buffer == NULL) {
3275 PyErr_NoMemory();
3276 return NULL;
3277 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003278 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3279 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003280 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003281 }
3282 else if (wcslen(buffer) != (size_t)buflen) {
3283 PyMem_FREE(buffer);
3284 PyErr_SetString(PyExc_ValueError,
3285 "embedded null character");
3286 return NULL;
3287 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003288 return buffer;
3289}
3290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003291#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003293int
3294_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3295{
3296 wchar_t **p = (wchar_t **)ptr;
3297 if (obj == NULL) {
3298#if !USE_UNICODE_WCHAR_CACHE
3299 PyMem_Free(*p);
3300#endif /* USE_UNICODE_WCHAR_CACHE */
3301 *p = NULL;
3302 return 1;
3303 }
3304 if (PyUnicode_Check(obj)) {
3305#if USE_UNICODE_WCHAR_CACHE
3306_Py_COMP_DIAG_PUSH
3307_Py_COMP_DIAG_IGNORE_DEPR_DECLS
3308 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3309 if (*p == NULL) {
3310 return 0;
3311 }
3312 return 1;
3313_Py_COMP_DIAG_POP
3314#else /* USE_UNICODE_WCHAR_CACHE */
3315 *p = PyUnicode_AsWideCharString(obj, NULL);
3316 if (*p == NULL) {
3317 return 0;
3318 }
3319 return Py_CLEANUP_SUPPORTED;
3320#endif /* USE_UNICODE_WCHAR_CACHE */
3321 }
3322 PyErr_Format(PyExc_TypeError,
3323 "argument must be str, not %.50s",
3324 obj->ob_type->tp_name);
3325 return 0;
3326}
3327
3328int
3329_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3330{
3331 wchar_t **p = (wchar_t **)ptr;
3332 if (obj == NULL) {
3333#if !USE_UNICODE_WCHAR_CACHE
3334 PyMem_Free(*p);
3335#endif /* USE_UNICODE_WCHAR_CACHE */
3336 *p = NULL;
3337 return 1;
3338 }
3339 if (obj == Py_None) {
3340 *p = NULL;
3341 return 1;
3342 }
3343 if (PyUnicode_Check(obj)) {
3344#if USE_UNICODE_WCHAR_CACHE
3345_Py_COMP_DIAG_PUSH
3346_Py_COMP_DIAG_IGNORE_DEPR_DECLS
3347 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3348 if (*p == NULL) {
3349 return 0;
3350 }
3351 return 1;
3352_Py_COMP_DIAG_POP
3353#else /* USE_UNICODE_WCHAR_CACHE */
3354 *p = PyUnicode_AsWideCharString(obj, NULL);
3355 if (*p == NULL) {
3356 return 0;
3357 }
3358 return Py_CLEANUP_SUPPORTED;
3359#endif /* USE_UNICODE_WCHAR_CACHE */
3360 }
3361 PyErr_Format(PyExc_TypeError,
3362 "argument must be str or None, not %.50s",
3363 obj->ob_type->tp_name);
3364 return 0;
3365}
3366
Alexander Belopolsky40018472011-02-26 01:02:56 +00003367PyObject *
3368PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003369{
Victor Stinner8faf8212011-12-08 22:14:11 +01003370 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003371 PyErr_SetString(PyExc_ValueError,
3372 "chr() arg not in range(0x110000)");
3373 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003374 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003375
Victor Stinner985a82a2014-01-03 12:53:47 +01003376 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003377}
3378
Alexander Belopolsky40018472011-02-26 01:02:56 +00003379PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003380PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003382 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003383 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003384 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003385 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003386 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003387 Py_INCREF(obj);
3388 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003389 }
3390 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 /* For a Unicode subtype that's not a Unicode object,
3392 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003393 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003394 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003395 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003396 "Can't convert '%.100s' object to str implicitly",
3397 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003398 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003399}
3400
Alexander Belopolsky40018472011-02-26 01:02:56 +00003401PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003402PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003403 const char *encoding,
3404 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003405{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003406 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003407 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003408
Guido van Rossumd57fd912000-03-10 22:53:23 +00003409 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003410 PyErr_BadInternalCall();
3411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003414 /* Decoding bytes objects is the most common case and should be fast */
3415 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003416 if (PyBytes_GET_SIZE(obj) == 0) {
3417 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3418 return NULL;
3419 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003420 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003421 }
3422 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003423 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3424 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003425 }
3426
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003427 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003428 PyErr_SetString(PyExc_TypeError,
3429 "decoding str is not supported");
3430 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003431 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003432
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003433 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3434 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3435 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003436 "decoding to str: need a bytes-like object, %.80s found",
3437 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003438 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003439 }
Tim Petersced69f82003-09-16 20:30:58 +00003440
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003441 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003442 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003443 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3444 return NULL;
3445 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003446 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003448
Serhiy Storchaka05997252013-01-26 12:14:02 +02003449 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003450 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003451 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003452}
3453
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters is replaced by a single '_', except at the start of the
   name, and a trailing run is dropped.  The result, including its
   terminating null byte, is written to `lower`, a buffer of `lower_len`
   bytes.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    char *const out_end = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        const char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit '_' lazily. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3502
Alexander Belopolsky40018472011-02-26 01:02:56 +00003503PyObject *
3504PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003505 Py_ssize_t size,
3506 const char *encoding,
3507 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003508{
3509 PyObject *buffer = NULL, *unicode;
3510 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003511 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3512
Victor Stinner22eb6892019-06-26 00:51:05 +02003513 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3514 return NULL;
3515 }
3516
Victor Stinnered076ed2019-06-26 01:49:32 +02003517 if (size == 0) {
3518 _Py_RETURN_UNICODE_EMPTY();
3519 }
3520
Victor Stinner942889a2016-09-05 15:40:10 -07003521 if (encoding == NULL) {
3522 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3523 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003524
Fred Drakee4315f52000-05-09 19:53:39 +00003525 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003526 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3527 char *lower = buflower;
3528
3529 /* Fast paths */
3530 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3531 lower += 3;
3532 if (*lower == '_') {
3533 /* Match "utf8" and "utf_8" */
3534 lower++;
3535 }
3536
3537 if (lower[0] == '8' && lower[1] == 0) {
3538 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3539 }
3540 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3541 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3542 }
3543 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3544 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3545 }
3546 }
3547 else {
3548 if (strcmp(lower, "ascii") == 0
3549 || strcmp(lower, "us_ascii") == 0) {
3550 return PyUnicode_DecodeASCII(s, size, errors);
3551 }
Steve Dowercc16be82016-09-08 10:35:16 -07003552 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003553 else if (strcmp(lower, "mbcs") == 0) {
3554 return PyUnicode_DecodeMBCS(s, size, errors);
3555 }
3556 #endif
3557 else if (strcmp(lower, "latin1") == 0
3558 || strcmp(lower, "latin_1") == 0
3559 || strcmp(lower, "iso_8859_1") == 0
3560 || strcmp(lower, "iso8859_1") == 0) {
3561 return PyUnicode_DecodeLatin1(s, size, errors);
3562 }
3563 }
Victor Stinner37296e82010-06-10 13:36:23 +00003564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565
3566 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003567 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003568 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003569 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003570 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 if (buffer == NULL)
3572 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003573 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574 if (unicode == NULL)
3575 goto onError;
3576 if (!PyUnicode_Check(unicode)) {
3577 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003578 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003579 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003580 encoding,
3581 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582 Py_DECREF(unicode);
3583 goto onError;
3584 }
3585 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003586 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003587
Benjamin Peterson29060642009-01-31 22:14:21 +00003588 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 Py_XDECREF(buffer);
3590 return NULL;
3591}
3592
Alexander Belopolsky40018472011-02-26 01:02:56 +00003593PyObject *
3594PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003595 const char *encoding,
3596 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003597{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003598 if (!PyUnicode_Check(unicode)) {
3599 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003600 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003601 }
3602
Serhiy Storchaka00939072016-10-27 21:05:49 +03003603 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3604 "PyUnicode_AsDecodedObject() is deprecated; "
3605 "use PyCodec_Decode() to decode from str", 1) < 0)
3606 return NULL;
3607
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003608 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003609 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003610
3611 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003612 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003613}
3614
Alexander Belopolsky40018472011-02-26 01:02:56 +00003615PyObject *
3616PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003617 const char *encoding,
3618 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003619{
3620 PyObject *v;
3621
3622 if (!PyUnicode_Check(unicode)) {
3623 PyErr_BadArgument();
3624 goto onError;
3625 }
3626
Serhiy Storchaka00939072016-10-27 21:05:49 +03003627 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3628 "PyUnicode_AsDecodedUnicode() is deprecated; "
3629 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3630 return NULL;
3631
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003632 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003633 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003634
3635 /* Decode via the codec registry */
3636 v = PyCodec_Decode(unicode, encoding, errors);
3637 if (v == NULL)
3638 goto onError;
3639 if (!PyUnicode_Check(v)) {
3640 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003641 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003642 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003643 encoding,
3644 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003645 Py_DECREF(v);
3646 goto onError;
3647 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003648 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003649
Benjamin Peterson29060642009-01-31 22:14:21 +00003650 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003651 return NULL;
3652}
3653
Alexander Belopolsky40018472011-02-26 01:02:56 +00003654PyObject *
3655PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003656 Py_ssize_t size,
3657 const char *encoding,
3658 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659{
3660 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003661
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003662 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3666 Py_DECREF(unicode);
3667 return v;
3668}
3669
Alexander Belopolsky40018472011-02-26 01:02:56 +00003670PyObject *
3671PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003672 const char *encoding,
3673 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003674{
3675 PyObject *v;
3676
3677 if (!PyUnicode_Check(unicode)) {
3678 PyErr_BadArgument();
3679 goto onError;
3680 }
3681
Serhiy Storchaka00939072016-10-27 21:05:49 +03003682 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3683 "PyUnicode_AsEncodedObject() is deprecated; "
3684 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3685 "or PyCodec_Encode() for generic encoding", 1) < 0)
3686 return NULL;
3687
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003688 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003689 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003690
3691 /* Encode via the codec registry */
3692 v = PyCodec_Encode(unicode, encoding, errors);
3693 if (v == NULL)
3694 goto onError;
3695 return v;
3696
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003698 return NULL;
3699}
3700
Victor Stinner1b579672011-12-17 05:47:23 +01003701
Victor Stinner2cba6b82018-01-10 22:46:15 +01003702static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003703unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003704 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003705{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003706 Py_ssize_t wlen;
3707 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3708 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003709 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003710 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003711
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003712 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003713 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003714 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003715 return NULL;
3716 }
3717
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003718 char *str;
3719 size_t error_pos;
3720 const char *reason;
3721 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003722 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003723 PyMem_Free(wstr);
3724
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003725 if (res != 0) {
3726 if (res == -2) {
3727 PyObject *exc;
3728 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3729 "locale", unicode,
3730 (Py_ssize_t)error_pos,
3731 (Py_ssize_t)(error_pos+1),
3732 reason);
3733 if (exc != NULL) {
3734 PyCodec_StrictErrors(exc);
3735 Py_DECREF(exc);
3736 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003737 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003738 else if (res == -3) {
3739 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3740 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003741 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003742 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003743 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003744 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003745 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003746
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003747 PyObject *bytes = PyBytes_FromString(str);
3748 PyMem_RawFree(str);
3749 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003750}
3751
Victor Stinnerad158722010-10-27 00:25:46 +00003752PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003753PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3754{
Victor Stinner709d23d2019-05-02 14:56:30 -04003755 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3756 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003757}
3758
3759PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003760PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003761{
Victor Stinner81a7be32020-04-14 15:14:01 +02003762 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003763 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3764 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003765 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003766 fs_codec->error_handler,
3767 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003768 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003769#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003770 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003771 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003772 fs_codec->encoding,
3773 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003774 }
Victor Stinnerad158722010-10-27 00:25:46 +00003775#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003776 else {
3777 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3778 machinery is not ready and so cannot be used:
3779 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003780 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3781 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003782 assert(filesystem_errors != NULL);
3783 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3784 assert(errors != _Py_ERROR_UNKNOWN);
3785#ifdef _Py_FORCE_UTF8_FS_ENCODING
3786 return unicode_encode_utf8(unicode, errors, NULL);
3787#else
3788 return unicode_encode_locale(unicode, errors, 0);
3789#endif
3790 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003791}
3792
Alexander Belopolsky40018472011-02-26 01:02:56 +00003793PyObject *
3794PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003795 const char *encoding,
3796 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797{
3798 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003799 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003800
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801 if (!PyUnicode_Check(unicode)) {
3802 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003803 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 }
Fred Drakee4315f52000-05-09 19:53:39 +00003805
Victor Stinner22eb6892019-06-26 00:51:05 +02003806 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3807 return NULL;
3808 }
3809
Victor Stinner942889a2016-09-05 15:40:10 -07003810 if (encoding == NULL) {
3811 return _PyUnicode_AsUTF8String(unicode, errors);
3812 }
3813
Fred Drakee4315f52000-05-09 19:53:39 +00003814 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003815 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3816 char *lower = buflower;
3817
3818 /* Fast paths */
3819 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3820 lower += 3;
3821 if (*lower == '_') {
3822 /* Match "utf8" and "utf_8" */
3823 lower++;
3824 }
3825
3826 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003827 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003828 }
3829 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3830 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3831 }
3832 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3833 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3834 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003835 }
Victor Stinner942889a2016-09-05 15:40:10 -07003836 else {
3837 if (strcmp(lower, "ascii") == 0
3838 || strcmp(lower, "us_ascii") == 0) {
3839 return _PyUnicode_AsASCIIString(unicode, errors);
3840 }
Steve Dowercc16be82016-09-08 10:35:16 -07003841#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003842 else if (strcmp(lower, "mbcs") == 0) {
3843 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3844 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003845#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003846 else if (strcmp(lower, "latin1") == 0 ||
3847 strcmp(lower, "latin_1") == 0 ||
3848 strcmp(lower, "iso_8859_1") == 0 ||
3849 strcmp(lower, "iso8859_1") == 0) {
3850 return _PyUnicode_AsLatin1String(unicode, errors);
3851 }
3852 }
Victor Stinner37296e82010-06-10 13:36:23 +00003853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854
3855 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003856 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003858 return NULL;
3859
3860 /* The normal path */
3861 if (PyBytes_Check(v))
3862 return v;
3863
3864 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003865 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003866 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003867 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003868
3869 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003870 "encoder %s returned bytearray instead of bytes; "
3871 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003872 encoding);
3873 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003874 Py_DECREF(v);
3875 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003876 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003877
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003878 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3879 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003880 Py_DECREF(v);
3881 return b;
3882 }
3883
3884 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003885 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003886 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003887 encoding,
3888 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003889 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003890 return NULL;
3891}
3892
Alexander Belopolsky40018472011-02-26 01:02:56 +00003893PyObject *
3894PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003895 const char *encoding,
3896 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003897{
3898 PyObject *v;
3899
3900 if (!PyUnicode_Check(unicode)) {
3901 PyErr_BadArgument();
3902 goto onError;
3903 }
3904
Serhiy Storchaka00939072016-10-27 21:05:49 +03003905 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3906 "PyUnicode_AsEncodedUnicode() is deprecated; "
3907 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3908 return NULL;
3909
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003910 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003911 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003912
3913 /* Encode via the codec registry */
3914 v = PyCodec_Encode(unicode, encoding, errors);
3915 if (v == NULL)
3916 goto onError;
3917 if (!PyUnicode_Check(v)) {
3918 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003919 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003920 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003921 encoding,
3922 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003923 Py_DECREF(v);
3924 goto onError;
3925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003927
Benjamin Peterson29060642009-01-31 22:14:21 +00003928 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003929 return NULL;
3930}
3931
Victor Stinner2cba6b82018-01-10 22:46:15 +01003932static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003933unicode_decode_locale(const char *str, Py_ssize_t len,
3934 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003935{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003936 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3937 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003938 return NULL;
3939 }
3940
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003941 wchar_t *wstr;
3942 size_t wlen;
3943 const char *reason;
3944 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003945 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003946 if (res != 0) {
3947 if (res == -2) {
3948 PyObject *exc;
3949 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3950 "locale", str, len,
3951 (Py_ssize_t)wlen,
3952 (Py_ssize_t)(wlen + 1),
3953 reason);
3954 if (exc != NULL) {
3955 PyCodec_StrictErrors(exc);
3956 Py_DECREF(exc);
3957 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003958 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003959 else if (res == -3) {
3960 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3961 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003962 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003963 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003964 }
Victor Stinner2f197072011-12-17 07:08:30 +01003965 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003966 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003967
3968 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3969 PyMem_RawFree(wstr);
3970 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003971}
3972
3973PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003974PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3975 const char *errors)
3976{
Victor Stinner709d23d2019-05-02 14:56:30 -04003977 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3978 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003979}
3980
3981PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003982PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003983{
3984 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003985 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3986 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003987}
3988
3989
3990PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003991PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003992 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003993 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3994}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003995
Christian Heimes5894ba72007-11-04 11:43:14 +00003996PyObject*
3997PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3998{
Victor Stinner81a7be32020-04-14 15:14:01 +02003999 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004000 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4001 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004002 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004003 fs_codec->error_handler,
4004 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004005 NULL);
4006 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004007#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004008 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004009 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004010 fs_codec->encoding,
4011 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004012 }
Victor Stinnerad158722010-10-27 00:25:46 +00004013#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004014 else {
4015 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4016 machinery is not ready and so cannot be used:
4017 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004018 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4019 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004020 assert(filesystem_errors != NULL);
4021 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4022 assert(errors != _Py_ERROR_UNKNOWN);
4023#ifdef _Py_FORCE_UTF8_FS_ENCODING
4024 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4025#else
4026 return unicode_decode_locale(s, size, errors, 0);
4027#endif
4028 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004029}
4030
Martin v. Löwis011e8422009-05-05 04:43:17 +00004031
4032int
4033PyUnicode_FSConverter(PyObject* arg, void* addr)
4034{
Brett Cannonec6ce872016-09-06 15:50:29 -07004035 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004036 PyObject *output = NULL;
4037 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004038 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004039 if (arg == NULL) {
4040 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004041 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004042 return 1;
4043 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004044 path = PyOS_FSPath(arg);
4045 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004046 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004047 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004048 if (PyBytes_Check(path)) {
4049 output = path;
4050 }
4051 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4052 output = PyUnicode_EncodeFSDefault(path);
4053 Py_DECREF(path);
4054 if (!output) {
4055 return 0;
4056 }
4057 assert(PyBytes_Check(output));
4058 }
4059
Victor Stinner0ea2a462010-04-30 00:22:08 +00004060 size = PyBytes_GET_SIZE(output);
4061 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004062 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004063 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004064 Py_DECREF(output);
4065 return 0;
4066 }
4067 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004068 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004069}
4070
4071
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004072int
4073PyUnicode_FSDecoder(PyObject* arg, void* addr)
4074{
Brett Cannona5711202016-09-06 19:36:01 -07004075 int is_buffer = 0;
4076 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004077 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004078 if (arg == NULL) {
4079 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004080 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004081 return 1;
4082 }
Brett Cannona5711202016-09-06 19:36:01 -07004083
4084 is_buffer = PyObject_CheckBuffer(arg);
4085 if (!is_buffer) {
4086 path = PyOS_FSPath(arg);
4087 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004088 return 0;
4089 }
Brett Cannona5711202016-09-06 19:36:01 -07004090 }
4091 else {
4092 path = arg;
4093 Py_INCREF(arg);
4094 }
4095
4096 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004097 output = path;
4098 }
4099 else if (PyBytes_Check(path) || is_buffer) {
4100 PyObject *path_bytes = NULL;
4101
4102 if (!PyBytes_Check(path) &&
4103 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004104 "path should be string, bytes, or os.PathLike, not %.200s",
4105 Py_TYPE(arg)->tp_name)) {
4106 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004107 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004108 }
4109 path_bytes = PyBytes_FromObject(path);
4110 Py_DECREF(path);
4111 if (!path_bytes) {
4112 return 0;
4113 }
4114 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4115 PyBytes_GET_SIZE(path_bytes));
4116 Py_DECREF(path_bytes);
4117 if (!output) {
4118 return 0;
4119 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004120 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004121 else {
4122 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004123 "path should be string, bytes, or os.PathLike, not %.200s",
4124 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004125 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004126 return 0;
4127 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004128 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004129 Py_DECREF(output);
4130 return 0;
4131 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004132 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004133 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004134 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004135 Py_DECREF(output);
4136 return 0;
4137 }
4138 *(PyObject**)addr = output;
4139 return Py_CLEANUP_SUPPORTED;
4140}
4141
4142
Inada Naoki02a4d572020-02-27 13:48:59 +09004143static int unicode_fill_utf8(PyObject *unicode);
4144
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004145const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004146PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004147{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004148 if (!PyUnicode_Check(unicode)) {
4149 PyErr_BadArgument();
4150 return NULL;
4151 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004152 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004153 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004154
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004155 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004156 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004157 return NULL;
4158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004159 }
4160
4161 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004162 *psize = PyUnicode_UTF8_LENGTH(unicode);
4163 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004164}
4165
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004166const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004167PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004168{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004169 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4170}
4171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004172Py_UNICODE *
4173PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4174{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004175 if (!PyUnicode_Check(unicode)) {
4176 PyErr_BadArgument();
4177 return NULL;
4178 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004179 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4180 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004181 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004182 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004183 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004184
Serhiy Storchakac46db922018-10-23 22:58:24 +03004185 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4186 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4187 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004188 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004189 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004190 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4191 if (w == NULL) {
4192 PyErr_NoMemory();
4193 return NULL;
4194 }
4195 unicode_copy_as_widechar(unicode, w, wlen + 1);
4196 _PyUnicode_WSTR(unicode) = w;
4197 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4198 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004199 }
4200 }
4201 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004202 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004203 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004204}
4205
Inada Naoki2c4928d2020-06-17 20:09:44 +09004206/* Deprecated APIs */
4207
4208_Py_COMP_DIAG_PUSH
4209_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4210
Alexander Belopolsky40018472011-02-26 01:02:56 +00004211Py_UNICODE *
4212PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004214 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215}
4216
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004217const Py_UNICODE *
4218_PyUnicode_AsUnicode(PyObject *unicode)
4219{
4220 Py_ssize_t size;
4221 const Py_UNICODE *wstr;
4222
4223 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4224 if (wstr && wcslen(wstr) != (size_t)size) {
4225 PyErr_SetString(PyExc_ValueError, "embedded null character");
4226 return NULL;
4227 }
4228 return wstr;
4229}
4230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004231
Alexander Belopolsky40018472011-02-26 01:02:56 +00004232Py_ssize_t
4233PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234{
4235 if (!PyUnicode_Check(unicode)) {
4236 PyErr_BadArgument();
4237 goto onError;
4238 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004239 if (_PyUnicode_WSTR(unicode) == NULL) {
4240 if (PyUnicode_AsUnicode(unicode) == NULL)
4241 goto onError;
4242 }
4243 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004244
Benjamin Peterson29060642009-01-31 22:14:21 +00004245 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246 return -1;
4247}
4248
Inada Naoki2c4928d2020-06-17 20:09:44 +09004249_Py_COMP_DIAG_POP
4250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004251Py_ssize_t
4252PyUnicode_GetLength(PyObject *unicode)
4253{
Victor Stinner07621332012-06-16 04:53:46 +02004254 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004255 PyErr_BadArgument();
4256 return -1;
4257 }
Victor Stinner07621332012-06-16 04:53:46 +02004258 if (PyUnicode_READY(unicode) == -1)
4259 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004260 return PyUnicode_GET_LENGTH(unicode);
4261}
4262
4263Py_UCS4
4264PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4265{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004266 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004267 int kind;
4268
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004269 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004270 PyErr_BadArgument();
4271 return (Py_UCS4)-1;
4272 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004273 if (PyUnicode_READY(unicode) == -1) {
4274 return (Py_UCS4)-1;
4275 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004276 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004277 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004278 return (Py_UCS4)-1;
4279 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004280 data = PyUnicode_DATA(unicode);
4281 kind = PyUnicode_KIND(unicode);
4282 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004283}
4284
4285int
4286PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4287{
4288 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004289 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004290 return -1;
4291 }
Victor Stinner488fa492011-12-12 00:01:39 +01004292 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004293 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004294 PyErr_SetString(PyExc_IndexError, "string index out of range");
4295 return -1;
4296 }
Victor Stinner488fa492011-12-12 00:01:39 +01004297 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004298 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004299 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4300 PyErr_SetString(PyExc_ValueError, "character out of range");
4301 return -1;
4302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004303 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4304 index, ch);
4305 return 0;
4306}
4307
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4313
Victor Stinner554f3f02010-06-16 23:33:54 +00004314/* create or adjust a UnicodeDecodeError */
4315static void
4316make_decode_exception(PyObject **exceptionObject,
4317 const char *encoding,
4318 const char *input, Py_ssize_t length,
4319 Py_ssize_t startpos, Py_ssize_t endpos,
4320 const char *reason)
4321{
4322 if (*exceptionObject == NULL) {
4323 *exceptionObject = PyUnicodeDecodeError_Create(
4324 encoding, input, length, startpos, endpos, reason);
4325 }
4326 else {
4327 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4328 goto onError;
4329 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4330 goto onError;
4331 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4332 goto onError;
4333 }
4334 return;
4335
4336onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004337 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004338}
4339
Steve Dowercc16be82016-09-08 10:35:16 -07004340#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004341static int
4342widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4343{
4344 if (newsize > *size) {
4345 wchar_t *newbuf = *buf;
4346 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4347 PyErr_NoMemory();
4348 return -1;
4349 }
4350 *buf = newbuf;
4351 }
4352 *size = newsize;
4353 return 0;
4354}
4355
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356/* error handling callback helper:
4357 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004358 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004359 and adjust various state variables.
4360 return 0 on success, -1 on error
4361*/
4362
Alexander Belopolsky40018472011-02-26 01:02:56 +00004363static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004364unicode_decode_call_errorhandler_wchar(
4365 const char *errors, PyObject **errorHandler,
4366 const char *encoding, const char *reason,
4367 const char **input, const char **inend, Py_ssize_t *startinpos,
4368 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004369 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004371 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372
4373 PyObject *restuple = NULL;
4374 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004375 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004376 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004377 Py_ssize_t requiredsize;
4378 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004379 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004380 wchar_t *repwstr;
4381 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004382
4383 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004384 *errorHandler = PyCodec_LookupError(errors);
4385 if (*errorHandler == NULL)
4386 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004387 }
4388
Victor Stinner554f3f02010-06-16 23:33:54 +00004389 make_decode_exception(exceptionObject,
4390 encoding,
4391 *input, *inend - *input,
4392 *startinpos, *endinpos,
4393 reason);
4394 if (*exceptionObject == NULL)
4395 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396
Petr Viktorinffd97532020-02-11 17:46:57 +01004397 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004401 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004402 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004404 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004405 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004406
4407 /* Copy back the bytes variables, which might have been modified by the
4408 callback */
4409 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4410 if (!inputobj)
4411 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004412 *input = PyBytes_AS_STRING(inputobj);
4413 insize = PyBytes_GET_SIZE(inputobj);
4414 *inend = *input + insize;
4415 /* we can DECREF safely, as the exception has another reference,
4416 so the object won't go away. */
4417 Py_DECREF(inputobj);
4418
4419 if (newpos<0)
4420 newpos = insize+newpos;
4421 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004422 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004423 goto onError;
4424 }
4425
4426 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4427 if (repwstr == NULL)
4428 goto onError;
4429 /* need more space? (at least enough for what we
4430 have+the replacement+the rest of the string (starting
4431 at the new input position), so we won't have to check space
4432 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004433 requiredsize = *outpos;
4434 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4435 goto overflow;
4436 requiredsize += repwlen;
4437 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4438 goto overflow;
4439 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004440 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004441 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004442 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004443 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004444 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004445 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004446 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004447 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004448 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004449 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004450 *endinpos = newpos;
4451 *inptr = *input + newpos;
4452
4453 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004454 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004455 return 0;
4456
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004457 overflow:
4458 PyErr_SetString(PyExc_OverflowError,
4459 "decoded result is too long for a Python string");
4460
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004461 onError:
4462 Py_XDECREF(restuple);
4463 return -1;
4464}
Steve Dowercc16be82016-09-08 10:35:16 -07004465#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004466
4467static int
4468unicode_decode_call_errorhandler_writer(
4469 const char *errors, PyObject **errorHandler,
4470 const char *encoding, const char *reason,
4471 const char **input, const char **inend, Py_ssize_t *startinpos,
4472 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4473 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4474{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004475 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004476
4477 PyObject *restuple = NULL;
4478 PyObject *repunicode = NULL;
4479 Py_ssize_t insize;
4480 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004481 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004482 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004483 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004484 int need_to_grow = 0;
4485 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004486
4487 if (*errorHandler == NULL) {
4488 *errorHandler = PyCodec_LookupError(errors);
4489 if (*errorHandler == NULL)
4490 goto onError;
4491 }
4492
4493 make_decode_exception(exceptionObject,
4494 encoding,
4495 *input, *inend - *input,
4496 *startinpos, *endinpos,
4497 reason);
4498 if (*exceptionObject == NULL)
4499 goto onError;
4500
Petr Viktorinffd97532020-02-11 17:46:57 +01004501 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004502 if (restuple == NULL)
4503 goto onError;
4504 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004505 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004506 goto onError;
4507 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004508 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004509 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004510
4511 /* Copy back the bytes variables, which might have been modified by the
4512 callback */
4513 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4514 if (!inputobj)
4515 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004516 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004517 *input = PyBytes_AS_STRING(inputobj);
4518 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004519 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004520 /* we can DECREF safely, as the exception has another reference,
4521 so the object won't go away. */
4522 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004523
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004525 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004526 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004527 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004529 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530
Victor Stinner170ca6f2013-04-18 00:25:28 +02004531 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004532 if (replen > 1) {
4533 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004534 need_to_grow = 1;
4535 }
4536 new_inptr = *input + newpos;
4537 if (*inend - new_inptr > remain) {
4538 /* We don't know the decoding algorithm here so we make the worst
4539 assumption that one byte decodes to one unicode character.
4540 If unfortunately one byte could decode to more unicode characters,
4541 the decoder may write out-of-bound then. Is it possible for the
4542 algorithms using this function? */
4543 writer->min_length += *inend - new_inptr - remain;
4544 need_to_grow = 1;
4545 }
4546 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004547 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004548 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004549 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4550 goto onError;
4551 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004552 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004553 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004556 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004559 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004560 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561
Benjamin Peterson29060642009-01-31 22:14:21 +00004562 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004564 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565}
4566
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004567/* --- UTF-7 Codec -------------------------------------------------------- */
4568
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569/* See RFC2152 for details. We encode conservatively and decode liberally. */
4570
4571/* Three simple macros defining base-64. */
4572
/* True if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  Note: c is evaluated more than once. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))
4580
/* Map a base-64 character to its 6-bit value: A-Z -> 0..25,
   a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, anything else -> 63.
   The caller must ensure c is a base-64 character ('/' is the only valid
   input that reaches the final branch).  c is evaluated more than once. */

#define FROM_BASE64(c)                                      \
    (((c) == '+') ? 62 :                                    \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     63)
4588
/* Base-64 character for the bottom 6 bits of n; higher bits of n are
   masked off and ignored. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
4593
/* DECODE_DIRECT: true if this byte, encountered in a UTF-7 string outside
 * a base64 section, should be decoded as itself.  Decoding is permissive:
 * every value up to 127 decodes to itself except '+', which begins a
 * base64 string.  c is evaluated more than once. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4601
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  Each entry is one of:
 *   0 : "Set D"       alphanumeric and '(),-./:?
 *   1 : "Set O"       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"  ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes
 *       0-8 11-12 14-31 127)
 *
 * One row per 16 code points; the comment above each row names them.
 */

static char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp ! " # $ % & ' ( ) * + , - . / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @ A B C D E F G H I J K L M N O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P Q R S T U V W X Y Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: ` a b c d e f g h i j k l m n o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p q r s t u v w x y z { | } ~ del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
4635
/* ENCODE_DIRECT: true if character c should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself (directO),
 * and also on whether we are encoding whitespace as itself (directWS).
 * RFC2152 makes it clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * Only characters in the range 1..127 can be direct: NUL and everything
 * from 128 up are rejected before utf7_category is indexed.  "Set D"
 * characters (category 0) are always direct.
 *
 * All three parameters are parenthesized in the expansion so that
 * compound flag expressions (e.g. `a || b`) keep their meaning next to
 * the `&&` used here.  c is evaluated more than once, so it must be free
 * of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647
Alexander Belopolsky40018472011-02-26 01:02:56 +00004648PyObject *
4649PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004650 Py_ssize_t size,
4651 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004653 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4654}
4655
Antoine Pitrou244651a2009-05-04 18:56:13 +00004656/* The decoder. The only state we preserve is our read position,
4657 * i.e. how many characters we have consumed. So if we end in the
4658 * middle of a shift sequence we have to back off the read position
4659 * and the output to the beginning of the sequence, otherwise we lose
4660 * all the shift state (seen bits, number of bits seen, high
4661 * surrogate). */
4662
Alexander Belopolsky40018472011-02-26 01:02:56 +00004663PyObject *
4664PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004665 Py_ssize_t size,
4666 const char *errors,
4667 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004668{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004669 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004670 Py_ssize_t startinpos;
4671 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004672 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004673 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004674 const char *errmsg = "";
4675 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004676 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004677 unsigned int base64bits = 0;
4678 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004679 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680 PyObject *errorHandler = NULL;
4681 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004682
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004683 if (size == 0) {
4684 if (consumed)
4685 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004686 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004687 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004688
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004689 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004690 _PyUnicodeWriter_Init(&writer);
4691 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004692
4693 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004694 e = s + size;
4695
4696 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004697 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004698 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004699 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004700
Antoine Pitrou244651a2009-05-04 18:56:13 +00004701 if (inShift) { /* in a base-64 section */
4702 if (IS_BASE64(ch)) { /* consume a base-64 character */
4703 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4704 base64bits += 6;
4705 s++;
4706 if (base64bits >= 16) {
4707 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004708 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004709 base64bits -= 16;
4710 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004711 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004712 if (surrogate) {
4713 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004714 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4715 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004716 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004717 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004718 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004719 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004720 }
4721 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004722 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004723 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004724 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004725 }
4726 }
Victor Stinner551ac952011-11-29 22:58:13 +01004727 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004728 /* first surrogate */
4729 surrogate = outCh;
4730 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004731 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004732 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004733 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004734 }
4735 }
4736 }
4737 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004738 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004739 if (base64bits > 0) { /* left-over bits */
4740 if (base64bits >= 6) {
4741 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004742 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004743 errmsg = "partial character in shift sequence";
4744 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004746 else {
4747 /* Some bits remain; they should be zero */
4748 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004749 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004750 errmsg = "non-zero padding bits in shift sequence";
4751 goto utf7Error;
4752 }
4753 }
4754 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004755 if (surrogate && DECODE_DIRECT(ch)) {
4756 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4757 goto onError;
4758 }
4759 surrogate = 0;
4760 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004761 /* '-' is absorbed; other terminating
4762 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004763 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004764 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765 }
4766 }
4767 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004769 s++; /* consume '+' */
4770 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004771 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004772 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004773 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004774 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004775 else if (s < e && !IS_BASE64(*s)) {
4776 s++;
4777 errmsg = "ill-formed sequence";
4778 goto utf7Error;
4779 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004780 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004781 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004782 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004783 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004784 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004785 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004786 }
4787 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004788 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004789 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004790 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004791 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004792 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004793 else {
4794 startinpos = s-starts;
4795 s++;
4796 errmsg = "unexpected special character";
4797 goto utf7Error;
4798 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004799 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004800utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004802 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004803 errors, &errorHandler,
4804 "utf7", errmsg,
4805 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004806 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004807 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004808 }
4809
Antoine Pitrou244651a2009-05-04 18:56:13 +00004810 /* end of string */
4811
4812 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4813 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004814 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004815 if (surrogate ||
4816 (base64bits >= 6) ||
4817 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004818 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004819 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004820 errors, &errorHandler,
4821 "utf7", "unterminated shift sequence",
4822 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004823 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004824 goto onError;
4825 if (s < e)
4826 goto restart;
4827 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004828 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004829
4830 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004831 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004832 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004833 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004834 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004835 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004836 writer.kind, writer.data, shiftOutStart);
4837 Py_XDECREF(errorHandler);
4838 Py_XDECREF(exc);
4839 _PyUnicodeWriter_Dealloc(&writer);
4840 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004841 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004842 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004843 }
4844 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004845 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004846 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004847 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004848
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 Py_XDECREF(errorHandler);
4850 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004851 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004852
Benjamin Peterson29060642009-01-31 22:14:21 +00004853 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004854 Py_XDECREF(errorHandler);
4855 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004856 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004857 return NULL;
4858}
4859
4860
Alexander Belopolsky40018472011-02-26 01:02:56 +00004861PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004862_PyUnicode_EncodeUTF7(PyObject *str,
4863 int base64SetO,
4864 int base64WhiteSpace,
4865 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004866{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004867 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004868 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004869 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004870 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004871 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004872 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004873 unsigned int base64bits = 0;
4874 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004875 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004876 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004877
Benjamin Petersonbac79492012-01-14 13:34:47 -05004878 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004879 return NULL;
4880 kind = PyUnicode_KIND(str);
4881 data = PyUnicode_DATA(str);
4882 len = PyUnicode_GET_LENGTH(str);
4883
4884 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004885 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004886
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004887 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004888 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004889 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004890 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004891 if (v == NULL)
4892 return NULL;
4893
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004894 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004895 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004896 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004897
Antoine Pitrou244651a2009-05-04 18:56:13 +00004898 if (inShift) {
4899 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4900 /* shifting out */
4901 if (base64bits) { /* output remaining bits */
4902 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4903 base64buffer = 0;
4904 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004905 }
4906 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004907 /* Characters not in the BASE64 set implicitly unshift the sequence
4908 so no '-' is required, except if the character is itself a '-' */
4909 if (IS_BASE64(ch) || ch == '-') {
4910 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004911 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004912 *out++ = (char) ch;
4913 }
4914 else {
4915 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004916 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004917 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004918 else { /* not in a shift sequence */
4919 if (ch == '+') {
4920 *out++ = '+';
4921 *out++ = '-';
4922 }
4923 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4924 *out++ = (char) ch;
4925 }
4926 else {
4927 *out++ = '+';
4928 inShift = 1;
4929 goto encode_char;
4930 }
4931 }
4932 continue;
4933encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004934 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004935 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004936
Antoine Pitrou244651a2009-05-04 18:56:13 +00004937 /* code first surrogate */
4938 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004939 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004940 while (base64bits >= 6) {
4941 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4942 base64bits -= 6;
4943 }
4944 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004945 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004946 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004947 base64bits += 16;
4948 base64buffer = (base64buffer << 16) | ch;
4949 while (base64bits >= 6) {
4950 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4951 base64bits -= 6;
4952 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004953 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004954 if (base64bits)
4955 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4956 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004957 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004958 if (_PyBytes_Resize(&v, out - start) < 0)
4959 return NULL;
4960 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004961}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004962PyObject *
4963PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4964 Py_ssize_t size,
4965 int base64SetO,
4966 int base64WhiteSpace,
4967 const char *errors)
4968{
4969 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004970 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004971 if (tmp == NULL)
4972 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004973 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004974 base64WhiteSpace, errors);
4975 Py_DECREF(tmp);
4976 return result;
4977}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004978
Antoine Pitrou244651a2009-05-04 18:56:13 +00004979#undef IS_BASE64
4980#undef FROM_BASE64
4981#undef TO_BASE64
4982#undef DECODE_DIRECT
4983#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004984
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985/* --- UTF-8 Codec -------------------------------------------------------- */
4986
Alexander Belopolsky40018472011-02-26 01:02:56 +00004987PyObject *
4988PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004989 Py_ssize_t size,
4990 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991{
Walter Dörwald69652032004-09-07 20:24:22 +00004992 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4993}
4994
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995#include "stringlib/asciilib.h"
4996#include "stringlib/codecs.h"
4997#include "stringlib/undef.h"
4998
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004999#include "stringlib/ucs1lib.h"
5000#include "stringlib/codecs.h"
5001#include "stringlib/undef.h"
5002
5003#include "stringlib/ucs2lib.h"
5004#include "stringlib/codecs.h"
5005#include "stringlib/undef.h"
5006
5007#include "stringlib/ucs4lib.h"
5008#include "stringlib/codecs.h"
5009#include "stringlib/undef.h"
5010
Antoine Pitrouab868312009-01-10 15:40:25 +00005011/* Mask to quickly check whether a C 'long' contains a
5012 non-ASCII, UTF8-encoded char. */
5013#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02005014# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00005015#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02005016# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00005017#else
5018# error C 'long' size should be either 4 or 8!
5019#endif
5020
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021static Py_ssize_t
5022ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005023{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005025 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005026
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005027 /*
5028 * Issue #17237: m68k is a bit different from most architectures in
5029 * that objects do not use "natural alignment" - for example, int and
5030 * long are only aligned at 2-byte boundaries. Therefore the assert()
5031 * won't work; also, tests have shown that skipping the "optimised
5032 * version" will even speed up m68k.
5033 */
5034#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005035#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005036 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
5037 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005038 /* Fast path, see in STRINGLIB(utf8_decode) for
5039 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005040 /* Help allocation */
5041 const char *_p = p;
5042 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043 while (_p < aligned_end) {
5044 unsigned long value = *(const unsigned long *) _p;
5045 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005047 *((unsigned long *)q) = value;
5048 _p += SIZEOF_LONG;
5049 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005050 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005051 p = _p;
5052 while (p < end) {
5053 if ((unsigned char)*p & 0x80)
5054 break;
5055 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005057 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005059#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005060#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005061 while (p < end) {
5062 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5063 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005064 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005065 /* Help allocation */
5066 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005067 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06005068 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005069 if (value & ASCII_CHAR_MASK)
5070 break;
5071 _p += SIZEOF_LONG;
5072 }
5073 p = _p;
5074 if (_p == end)
5075 break;
5076 }
5077 if ((unsigned char)*p & 0x80)
5078 break;
5079 ++p;
5080 }
5081 memcpy(dest, start, p - start);
5082 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083}
Antoine Pitrouab868312009-01-10 15:40:25 +00005084
Victor Stinner709d23d2019-05-02 14:56:30 -04005085static PyObject *
5086unicode_decode_utf8(const char *s, Py_ssize_t size,
5087 _Py_error_handler error_handler, const char *errors,
5088 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005089{
Victor Stinner785938e2011-12-11 20:09:03 +01005090 if (size == 0) {
5091 if (consumed)
5092 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005093 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005094 }
5095
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005096 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5097 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005098 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005099 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005100 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005101 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005102 }
5103
Inada Naoki770847a2019-06-24 12:30:24 +09005104 const char *starts = s;
5105 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005106
Inada Naoki770847a2019-06-24 12:30:24 +09005107 // fast path: try ASCII string.
5108 PyObject *u = PyUnicode_New(size, 127);
5109 if (u == NULL) {
5110 return NULL;
5111 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005112 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005113 if (s == end) {
5114 return u;
5115 }
5116
5117 // Use _PyUnicodeWriter after fast path is failed.
5118 _PyUnicodeWriter writer;
5119 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5120 writer.pos = s - starts;
5121
5122 Py_ssize_t startinpos, endinpos;
5123 const char *errmsg = "";
5124 PyObject *error_handler_obj = NULL;
5125 PyObject *exc = NULL;
5126
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005127 while (s < end) {
5128 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005129 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005130
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005131 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005132 if (PyUnicode_IS_ASCII(writer.buffer))
5133 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005134 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005135 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005136 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005137 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005138 } else {
5139 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005140 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005141 }
5142
5143 switch (ch) {
5144 case 0:
5145 if (s == end || consumed)
5146 goto End;
5147 errmsg = "unexpected end of data";
5148 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005149 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005150 break;
5151 case 1:
5152 errmsg = "invalid start byte";
5153 startinpos = s - starts;
5154 endinpos = startinpos + 1;
5155 break;
5156 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005157 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5158 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5159 {
5160 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005161 goto End;
5162 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005163 /* fall through */
5164 case 3:
5165 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005166 errmsg = "invalid continuation byte";
5167 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005168 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005169 break;
5170 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005171 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005172 goto onError;
5173 continue;
5174 }
5175
Victor Stinner1d65d912015-10-05 13:43:50 +02005176 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005177 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005178
5179 switch (error_handler) {
5180 case _Py_ERROR_IGNORE:
5181 s += (endinpos - startinpos);
5182 break;
5183
5184 case _Py_ERROR_REPLACE:
5185 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5186 goto onError;
5187 s += (endinpos - startinpos);
5188 break;
5189
5190 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005191 {
5192 Py_ssize_t i;
5193
Victor Stinner1d65d912015-10-05 13:43:50 +02005194 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5195 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005196 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005197 ch = (Py_UCS4)(unsigned char)(starts[i]);
5198 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5199 ch + 0xdc00);
5200 writer.pos++;
5201 }
5202 s += (endinpos - startinpos);
5203 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005204 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005205
5206 default:
5207 if (unicode_decode_call_errorhandler_writer(
5208 errors, &error_handler_obj,
5209 "utf-8", errmsg,
5210 &starts, &end, &startinpos, &endinpos, &exc, &s,
5211 &writer))
5212 goto onError;
5213 }
Victor Stinner785938e2011-12-11 20:09:03 +01005214 }
5215
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005216End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005217 if (consumed)
5218 *consumed = s - starts;
5219
Victor Stinner1d65d912015-10-05 13:43:50 +02005220 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005221 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005222 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005223
5224onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005225 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005226 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005227 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005228 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005229}
5230
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005231
Victor Stinner709d23d2019-05-02 14:56:30 -04005232PyObject *
5233PyUnicode_DecodeUTF8Stateful(const char *s,
5234 Py_ssize_t size,
5235 const char *errors,
5236 Py_ssize_t *consumed)
5237{
5238 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5239}
5240
5241
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005242/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5243 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005244
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005245 On success, write a pointer to a newly allocated wide character string into
5246 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5247 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005248
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005249 On memory allocation failure, return -1.
5250
5251 On decoding error (if surrogateescape is zero), return -2. If wlen is
5252 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5253 is not NULL, write the decoding error message into *reason. */
5254int
5255_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005256 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005257{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005258 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005259 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005260 wchar_t *unicode;
5261 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005262
Victor Stinner3d4226a2018-08-29 22:21:32 +02005263 int surrogateescape = 0;
5264 int surrogatepass = 0;
5265 switch (errors)
5266 {
5267 case _Py_ERROR_STRICT:
5268 break;
5269 case _Py_ERROR_SURROGATEESCAPE:
5270 surrogateescape = 1;
5271 break;
5272 case _Py_ERROR_SURROGATEPASS:
5273 surrogatepass = 1;
5274 break;
5275 default:
5276 return -3;
5277 }
5278
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005279 /* Note: size will always be longer than the resulting Unicode
5280 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005281 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005282 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005283 }
5284
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005285 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005286 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005287 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005288 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005289
5290 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005291 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005292 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005293 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005294 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005295#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005296 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005297#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005298 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005299#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005300 if (ch > 0xFF) {
5301#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005302 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005303#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005304 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005305 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005306 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5307 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5308#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005309 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005310 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005311 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005312 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005313 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005314
5315 if (surrogateescape) {
5316 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5317 }
5318 else {
5319 /* Is it a valid three-byte code? */
5320 if (surrogatepass
5321 && (e - s) >= 3
5322 && (s[0] & 0xf0) == 0xe0
5323 && (s[1] & 0xc0) == 0x80
5324 && (s[2] & 0xc0) == 0x80)
5325 {
5326 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5327 s += 3;
5328 unicode[outpos++] = ch;
5329 }
5330 else {
5331 PyMem_RawFree(unicode );
5332 if (reason != NULL) {
5333 switch (ch) {
5334 case 0:
5335 *reason = "unexpected end of data";
5336 break;
5337 case 1:
5338 *reason = "invalid start byte";
5339 break;
5340 /* 2, 3, 4 */
5341 default:
5342 *reason = "invalid continuation byte";
5343 break;
5344 }
5345 }
5346 if (wlen != NULL) {
5347 *wlen = s - orig_s;
5348 }
5349 return -2;
5350 }
5351 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005352 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005353 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005354 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005355 if (wlen) {
5356 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005357 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005358 *wstr = unicode;
5359 return 0;
5360}
5361
Victor Stinner5f9cf232019-03-19 01:46:25 +01005362
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005363wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005364_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5365 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005366{
5367 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005368 int res = _Py_DecodeUTF8Ex(arg, arglen,
5369 &wstr, wlen,
5370 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005371 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005372 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5373 assert(res != -3);
5374 if (wlen) {
5375 *wlen = (size_t)res;
5376 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005377 return NULL;
5378 }
5379 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005380}
5381
Antoine Pitrouab868312009-01-10 15:40:25 +00005382
Victor Stinnere47e6982017-12-21 15:45:16 +01005383/* UTF-8 encoder using the surrogateescape error handler .
5384
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005385 On success, return 0 and write the newly allocated character string (use
5386 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005387
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005388 On encoding failure, return -2 and write the position of the invalid
5389 surrogate character into *error_pos (if error_pos is set) and the decoding
5390 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005391
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005392 On memory allocation failure, return -1. */
5393int
5394_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005395 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005396{
5397 const Py_ssize_t max_char_size = 4;
5398 Py_ssize_t len = wcslen(text);
5399
5400 assert(len >= 0);
5401
Victor Stinner3d4226a2018-08-29 22:21:32 +02005402 int surrogateescape = 0;
5403 int surrogatepass = 0;
5404 switch (errors)
5405 {
5406 case _Py_ERROR_STRICT:
5407 break;
5408 case _Py_ERROR_SURROGATEESCAPE:
5409 surrogateescape = 1;
5410 break;
5411 case _Py_ERROR_SURROGATEPASS:
5412 surrogatepass = 1;
5413 break;
5414 default:
5415 return -3;
5416 }
5417
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005418 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5419 return -1;
5420 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005421 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005422 if (raw_malloc) {
5423 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005424 }
5425 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005426 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005427 }
5428 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005429 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005430 }
5431
5432 char *p = bytes;
5433 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005434 for (i = 0; i < len; ) {
5435 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005436 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005437 i++;
5438#if Py_UNICODE_SIZE == 2
5439 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5440 && i < len
5441 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5442 {
5443 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5444 i++;
5445 }
5446#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005447
5448 if (ch < 0x80) {
5449 /* Encode ASCII */
5450 *p++ = (char) ch;
5451
5452 }
5453 else if (ch < 0x0800) {
5454 /* Encode Latin-1 */
5455 *p++ = (char)(0xc0 | (ch >> 6));
5456 *p++ = (char)(0x80 | (ch & 0x3f));
5457 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005458 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005459 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005460 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005461 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005462 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005463 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005464 if (reason != NULL) {
5465 *reason = "encoding error";
5466 }
5467 if (raw_malloc) {
5468 PyMem_RawFree(bytes);
5469 }
5470 else {
5471 PyMem_Free(bytes);
5472 }
5473 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005474 }
5475 *p++ = (char)(ch & 0xff);
5476 }
5477 else if (ch < 0x10000) {
5478 *p++ = (char)(0xe0 | (ch >> 12));
5479 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5480 *p++ = (char)(0x80 | (ch & 0x3f));
5481 }
5482 else { /* ch >= 0x10000 */
5483 assert(ch <= MAX_UNICODE);
5484 /* Encode UCS4 Unicode ordinals */
5485 *p++ = (char)(0xf0 | (ch >> 18));
5486 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5487 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5488 *p++ = (char)(0x80 | (ch & 0x3f));
5489 }
5490 }
5491 *p++ = '\0';
5492
5493 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005494 char *bytes2;
5495 if (raw_malloc) {
5496 bytes2 = PyMem_RawRealloc(bytes, final_size);
5497 }
5498 else {
5499 bytes2 = PyMem_Realloc(bytes, final_size);
5500 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005501 if (bytes2 == NULL) {
5502 if (error_pos != NULL) {
5503 *error_pos = (size_t)-1;
5504 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005505 if (raw_malloc) {
5506 PyMem_RawFree(bytes);
5507 }
5508 else {
5509 PyMem_Free(bytes);
5510 }
5511 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005512 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005513 *str = bytes2;
5514 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005515}
5516
5517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005518/* Primary internal function which creates utf8 encoded bytes objects.
5519
5520 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005521 and allocate exactly as much space needed at the end. Else allocate the
5522 maximum possible needed (4 result bytes per Unicode character), and return
5523 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005524*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005525static PyObject *
5526unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5527 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005529 if (!PyUnicode_Check(unicode)) {
5530 PyErr_BadArgument();
5531 return NULL;
5532 }
5533
5534 if (PyUnicode_READY(unicode) == -1)
5535 return NULL;
5536
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005537 if (PyUnicode_UTF8(unicode))
5538 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5539 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005540
Inada Naoki02a4d572020-02-27 13:48:59 +09005541 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005542 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005543 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5544
5545 _PyBytesWriter writer;
5546 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005547
Benjamin Petersonead6b532011-12-20 17:23:42 -06005548 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005549 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005550 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005551 case PyUnicode_1BYTE_KIND:
5552 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5553 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005554 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5555 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005556 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005557 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5558 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005559 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005560 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5561 break;
Tim Peters602f7402002-04-27 18:03:26 +00005562 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005563
5564 if (end == NULL) {
5565 _PyBytesWriter_Dealloc(&writer);
5566 return NULL;
5567 }
5568 return _PyBytesWriter_Finish(&writer, end);
5569}
5570
5571static int
5572unicode_fill_utf8(PyObject *unicode)
5573{
5574 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5575 assert(!PyUnicode_IS_ASCII(unicode));
5576
5577 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005578 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005579 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5580
5581 _PyBytesWriter writer;
5582 char *end;
5583
5584 switch (kind) {
5585 default:
5586 Py_UNREACHABLE();
5587 case PyUnicode_1BYTE_KIND:
5588 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5589 _Py_ERROR_STRICT, NULL);
5590 break;
5591 case PyUnicode_2BYTE_KIND:
5592 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5593 _Py_ERROR_STRICT, NULL);
5594 break;
5595 case PyUnicode_4BYTE_KIND:
5596 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5597 _Py_ERROR_STRICT, NULL);
5598 break;
5599 }
5600 if (end == NULL) {
5601 _PyBytesWriter_Dealloc(&writer);
5602 return -1;
5603 }
5604
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005605 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005606 PyBytes_AS_STRING(writer.buffer);
5607 Py_ssize_t len = end - start;
5608
5609 char *cache = PyObject_MALLOC(len + 1);
5610 if (cache == NULL) {
5611 _PyBytesWriter_Dealloc(&writer);
5612 PyErr_NoMemory();
5613 return -1;
5614 }
5615 _PyUnicode_UTF8(unicode) = cache;
5616 _PyUnicode_UTF8_LENGTH(unicode) = len;
5617 memcpy(cache, start, len);
5618 cache[len] = '\0';
5619 _PyBytesWriter_Dealloc(&writer);
5620 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621}
5622
Alexander Belopolsky40018472011-02-26 01:02:56 +00005623PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005624_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5625{
5626 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5627}
5628
5629
5630PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5632 Py_ssize_t size,
5633 const char *errors)
5634{
5635 PyObject *v, *unicode;
5636
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005637 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005638 if (unicode == NULL)
5639 return NULL;
5640 v = _PyUnicode_AsUTF8String(unicode, errors);
5641 Py_DECREF(unicode);
5642 return v;
5643}
5644
5645PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005646PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005648 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649}
5650
Walter Dörwald41980ca2007-08-16 21:55:45 +00005651/* --- UTF-32 Codec ------------------------------------------------------- */
5652
5653PyObject *
5654PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 Py_ssize_t size,
5656 const char *errors,
5657 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005658{
5659 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5660}
5661
5662PyObject *
5663PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 Py_ssize_t size,
5665 const char *errors,
5666 int *byteorder,
5667 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005668{
5669 const char *starts = s;
5670 Py_ssize_t startinpos;
5671 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005672 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005673 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005674 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005675 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005676 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005677 PyObject *errorHandler = NULL;
5678 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005679
Andy Lestere6be9b52020-02-11 20:28:35 -06005680 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005681 e = q + size;
5682
5683 if (byteorder)
5684 bo = *byteorder;
5685
5686 /* Check for BOM marks (U+FEFF) in the input and adjust current
5687 byte order setting accordingly. In native mode, the leading BOM
5688 mark is skipped, in all other modes, it is copied to the output
5689 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005690 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005691 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005692 if (bom == 0x0000FEFF) {
5693 bo = -1;
5694 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005696 else if (bom == 0xFFFE0000) {
5697 bo = 1;
5698 q += 4;
5699 }
5700 if (byteorder)
5701 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005702 }
5703
Victor Stinnere64322e2012-10-30 23:12:47 +01005704 if (q == e) {
5705 if (consumed)
5706 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005707 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005708 }
5709
Victor Stinnere64322e2012-10-30 23:12:47 +01005710#ifdef WORDS_BIGENDIAN
5711 le = bo < 0;
5712#else
5713 le = bo <= 0;
5714#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005715 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005716
Victor Stinner8f674cc2013-04-17 23:02:17 +02005717 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005718 writer.min_length = (e - q + 3) / 4;
5719 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005720 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005721
Victor Stinnere64322e2012-10-30 23:12:47 +01005722 while (1) {
5723 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005724 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005725
Victor Stinnere64322e2012-10-30 23:12:47 +01005726 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005727 enum PyUnicode_Kind kind = writer.kind;
5728 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005729 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005730 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005731 if (le) {
5732 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005733 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005734 if (ch > maxch)
5735 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005736 if (kind != PyUnicode_1BYTE_KIND &&
5737 Py_UNICODE_IS_SURROGATE(ch))
5738 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005739 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005740 q += 4;
5741 } while (q <= last);
5742 }
5743 else {
5744 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005745 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005746 if (ch > maxch)
5747 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005748 if (kind != PyUnicode_1BYTE_KIND &&
5749 Py_UNICODE_IS_SURROGATE(ch))
5750 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005751 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005752 q += 4;
5753 } while (q <= last);
5754 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005755 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005756 }
5757
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005758 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005759 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005760 startinpos = ((const char *)q) - starts;
5761 endinpos = startinpos + 4;
5762 }
5763 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005764 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005766 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005768 startinpos = ((const char *)q) - starts;
5769 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005771 else {
5772 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005773 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005774 goto onError;
5775 q += 4;
5776 continue;
5777 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005778 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005779 startinpos = ((const char *)q) - starts;
5780 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005782
5783 /* The remaining input chars are ignored if the callback
5784 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005785 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005787 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005789 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005791 }
5792
Walter Dörwald41980ca2007-08-16 21:55:45 +00005793 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005795
Walter Dörwald41980ca2007-08-16 21:55:45 +00005796 Py_XDECREF(errorHandler);
5797 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005798 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005799
Benjamin Peterson29060642009-01-31 22:14:21 +00005800 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005801 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005802 Py_XDECREF(errorHandler);
5803 Py_XDECREF(exc);
5804 return NULL;
5805}
5806
5807PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005808_PyUnicode_EncodeUTF32(PyObject *str,
5809 const char *errors,
5810 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005811{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005812 enum PyUnicode_Kind kind;
5813 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005814 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005815 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005816 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005817#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005818 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005819#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005820 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005821#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005822 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005823 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005824 PyObject *errorHandler = NULL;
5825 PyObject *exc = NULL;
5826 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005827
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005828 if (!PyUnicode_Check(str)) {
5829 PyErr_BadArgument();
5830 return NULL;
5831 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005832 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005833 return NULL;
5834 kind = PyUnicode_KIND(str);
5835 data = PyUnicode_DATA(str);
5836 len = PyUnicode_GET_LENGTH(str);
5837
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005838 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005839 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005840 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005841 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005842 if (v == NULL)
5843 return NULL;
5844
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005845 /* output buffer is 4-bytes aligned */
5846 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005847 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005848 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005849 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005850 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005851 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005852
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005853 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005854 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005855 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005856 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005857 else
5858 encoding = "utf-32";
5859
5860 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005861 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5862 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005863 }
5864
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005865 pos = 0;
5866 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005867 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005868
5869 if (kind == PyUnicode_2BYTE_KIND) {
5870 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5871 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005872 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005873 else {
5874 assert(kind == PyUnicode_4BYTE_KIND);
5875 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5876 &out, native_ordering);
5877 }
5878 if (pos == len)
5879 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005880
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005881 rep = unicode_encode_call_errorhandler(
5882 errors, &errorHandler,
5883 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005884 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005885 if (!rep)
5886 goto error;
5887
5888 if (PyBytes_Check(rep)) {
5889 repsize = PyBytes_GET_SIZE(rep);
5890 if (repsize & 3) {
5891 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005892 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005893 "surrogates not allowed");
5894 goto error;
5895 }
5896 moreunits = repsize / 4;
5897 }
5898 else {
5899 assert(PyUnicode_Check(rep));
5900 if (PyUnicode_READY(rep) < 0)
5901 goto error;
5902 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5903 if (!PyUnicode_IS_ASCII(rep)) {
5904 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005905 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005906 "surrogates not allowed");
5907 goto error;
5908 }
5909 }
5910
5911 /* four bytes are reserved for each surrogate */
5912 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005913 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005914 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005915 /* integer overflow */
5916 PyErr_NoMemory();
5917 goto error;
5918 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005919 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005920 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005921 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005922 }
5923
5924 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005925 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005926 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005927 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005928 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005929 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5930 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005931 }
5932
5933 Py_CLEAR(rep);
5934 }
5935
5936 /* Cut back to size actually needed. This is necessary for, for example,
5937 encoding of a string containing isolated surrogates and the 'ignore'
5938 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005939 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005940 if (nsize != PyBytes_GET_SIZE(v))
5941 _PyBytes_Resize(&v, nsize);
5942 Py_XDECREF(errorHandler);
5943 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005944 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005945 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005946 error:
5947 Py_XDECREF(rep);
5948 Py_XDECREF(errorHandler);
5949 Py_XDECREF(exc);
5950 Py_XDECREF(v);
5951 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005952}
5953
Alexander Belopolsky40018472011-02-26 01:02:56 +00005954PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005955PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5956 Py_ssize_t size,
5957 const char *errors,
5958 int byteorder)
5959{
5960 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005961 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005962 if (tmp == NULL)
5963 return NULL;
5964 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5965 Py_DECREF(tmp);
5966 return result;
5967}
5968
5969PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005970PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005971{
Victor Stinnerb960b342011-11-20 19:12:52 +01005972 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005973}
5974
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975/* --- UTF-16 Codec ------------------------------------------------------- */
5976
Tim Peters772747b2001-08-09 22:21:55 +00005977PyObject *
5978PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 Py_ssize_t size,
5980 const char *errors,
5981 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982{
Walter Dörwald69652032004-09-07 20:24:22 +00005983 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5984}
5985
5986PyObject *
5987PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 Py_ssize_t size,
5989 const char *errors,
5990 int *byteorder,
5991 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005992{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005993 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005994 Py_ssize_t startinpos;
5995 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005996 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005997 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005998 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005999 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00006000 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006001 PyObject *errorHandler = NULL;
6002 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006003 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004
Andy Lestere6be9b52020-02-11 20:28:35 -06006005 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006006 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007
6008 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00006009 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006011 /* Check for BOM marks (U+FEFF) in the input and adjust current
6012 byte order setting accordingly. In native mode, the leading BOM
6013 mark is skipped, in all other modes, it is copied to the output
6014 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006015 if (bo == 0 && size >= 2) {
6016 const Py_UCS4 bom = (q[1] << 8) | q[0];
6017 if (bom == 0xFEFF) {
6018 q += 2;
6019 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006021 else if (bom == 0xFFFE) {
6022 q += 2;
6023 bo = 1;
6024 }
6025 if (byteorder)
6026 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028
Antoine Pitrou63065d72012-05-15 23:48:04 +02006029 if (q == e) {
6030 if (consumed)
6031 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006032 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006033 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006034
Christian Heimes743e0cd2012-10-17 23:52:17 +02006035#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006036 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006037 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006038#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006039 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006040 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006041#endif
Tim Peters772747b2001-08-09 22:21:55 +00006042
Antoine Pitrou63065d72012-05-15 23:48:04 +02006043 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006044 character count normally. Error handler will take care of
6045 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006046 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006047 writer.min_length = (e - q + 1) / 2;
6048 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006049 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006050
Antoine Pitrou63065d72012-05-15 23:48:04 +02006051 while (1) {
6052 Py_UCS4 ch = 0;
6053 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006054 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006055 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006056 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006057 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006058 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006059 native_ordering);
6060 else
6061 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006062 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006063 native_ordering);
6064 } else if (kind == PyUnicode_2BYTE_KIND) {
6065 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006066 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006067 native_ordering);
6068 } else {
6069 assert(kind == PyUnicode_4BYTE_KIND);
6070 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006071 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006072 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006073 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006074 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006075
Antoine Pitrou63065d72012-05-15 23:48:04 +02006076 switch (ch)
6077 {
6078 case 0:
6079 /* remaining byte at the end? (size should be even) */
6080 if (q == e || consumed)
6081 goto End;
6082 errmsg = "truncated data";
6083 startinpos = ((const char *)q) - starts;
6084 endinpos = ((const char *)e) - starts;
6085 break;
6086 /* The remaining input chars are ignored if the callback
6087 chooses to skip the input */
6088 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006089 q -= 2;
6090 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006091 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006092 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006093 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006094 endinpos = ((const char *)e) - starts;
6095 break;
6096 case 2:
6097 errmsg = "illegal encoding";
6098 startinpos = ((const char *)q) - 2 - starts;
6099 endinpos = startinpos + 2;
6100 break;
6101 case 3:
6102 errmsg = "illegal UTF-16 surrogate";
6103 startinpos = ((const char *)q) - 4 - starts;
6104 endinpos = startinpos + 2;
6105 break;
6106 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006107 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006108 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 continue;
6110 }
6111
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006112 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006113 errors,
6114 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006115 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006116 &starts,
6117 (const char **)&e,
6118 &startinpos,
6119 &endinpos,
6120 &exc,
6121 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006122 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 }
6125
Antoine Pitrou63065d72012-05-15 23:48:04 +02006126End:
Walter Dörwald69652032004-09-07 20:24:22 +00006127 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006129
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006130 Py_XDECREF(errorHandler);
6131 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006132 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006135 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136 Py_XDECREF(errorHandler);
6137 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 return NULL;
6139}
6140
Tim Peters772747b2001-08-09 22:21:55 +00006141PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142_PyUnicode_EncodeUTF16(PyObject *str,
6143 const char *errors,
6144 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006146 enum PyUnicode_Kind kind;
6147 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006149 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006150 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006151 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006152#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006153 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006154#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006155 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006156#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006157 const char *encoding;
6158 Py_ssize_t nsize, pos;
6159 PyObject *errorHandler = NULL;
6160 PyObject *exc = NULL;
6161 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006162
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163 if (!PyUnicode_Check(str)) {
6164 PyErr_BadArgument();
6165 return NULL;
6166 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006167 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006168 return NULL;
6169 kind = PyUnicode_KIND(str);
6170 data = PyUnicode_DATA(str);
6171 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006172
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006174 if (kind == PyUnicode_4BYTE_KIND) {
6175 const Py_UCS4 *in = (const Py_UCS4 *)data;
6176 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006177 while (in < end) {
6178 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006180 }
6181 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006182 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006183 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006185 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006186 nsize = len + pairs + (byteorder == 0);
6187 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006188 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006192 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006193 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006194 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006195 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006196 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006197 }
6198 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006199 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006200 }
Tim Peters772747b2001-08-09 22:21:55 +00006201
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006202 if (kind == PyUnicode_1BYTE_KIND) {
6203 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6204 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006205 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006206
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006207 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006208 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006209 }
6210 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006211 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006212 }
6213 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006214 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006215 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006216
6217 pos = 0;
6218 while (pos < len) {
6219 Py_ssize_t repsize, moreunits;
6220
6221 if (kind == PyUnicode_2BYTE_KIND) {
6222 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6223 &out, native_ordering);
6224 }
6225 else {
6226 assert(kind == PyUnicode_4BYTE_KIND);
6227 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6228 &out, native_ordering);
6229 }
6230 if (pos == len)
6231 break;
6232
6233 rep = unicode_encode_call_errorhandler(
6234 errors, &errorHandler,
6235 encoding, "surrogates not allowed",
6236 str, &exc, pos, pos + 1, &pos);
6237 if (!rep)
6238 goto error;
6239
6240 if (PyBytes_Check(rep)) {
6241 repsize = PyBytes_GET_SIZE(rep);
6242 if (repsize & 1) {
6243 raise_encode_exception(&exc, encoding,
6244 str, pos - 1, pos,
6245 "surrogates not allowed");
6246 goto error;
6247 }
6248 moreunits = repsize / 2;
6249 }
6250 else {
6251 assert(PyUnicode_Check(rep));
6252 if (PyUnicode_READY(rep) < 0)
6253 goto error;
6254 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6255 if (!PyUnicode_IS_ASCII(rep)) {
6256 raise_encode_exception(&exc, encoding,
6257 str, pos - 1, pos,
6258 "surrogates not allowed");
6259 goto error;
6260 }
6261 }
6262
6263 /* two bytes are reserved for each surrogate */
6264 if (moreunits > 1) {
6265 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006266 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006267 /* integer overflow */
6268 PyErr_NoMemory();
6269 goto error;
6270 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006271 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006272 goto error;
6273 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6274 }
6275
6276 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006277 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006278 out += moreunits;
6279 } else /* rep is unicode */ {
6280 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6281 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6282 &out, native_ordering);
6283 }
6284
6285 Py_CLEAR(rep);
6286 }
6287
6288 /* Cut back to size actually needed. This is necessary for, for example,
6289 encoding of a string containing isolated surrogates and the 'ignore' handler
6290 is used. */
6291 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6292 if (nsize != PyBytes_GET_SIZE(v))
6293 _PyBytes_Resize(&v, nsize);
6294 Py_XDECREF(errorHandler);
6295 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006296 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006297 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006298 error:
6299 Py_XDECREF(rep);
6300 Py_XDECREF(errorHandler);
6301 Py_XDECREF(exc);
6302 Py_XDECREF(v);
6303 return NULL;
6304#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305}
6306
Alexander Belopolsky40018472011-02-26 01:02:56 +00006307PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006308PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6309 Py_ssize_t size,
6310 const char *errors,
6311 int byteorder)
6312{
6313 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006314 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006315 if (tmp == NULL)
6316 return NULL;
6317 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6318 Py_DECREF(tmp);
6319 return result;
6320}
6321
6322PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006323PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006325 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326}
6327
6328/* --- Unicode Escape Codec ----------------------------------------------- */
6329
Fredrik Lundh06d12682001-01-24 07:59:11 +00006330static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006331
Alexander Belopolsky40018472011-02-26 01:02:56 +00006332PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006333_PyUnicode_DecodeUnicodeEscape(const char *s,
6334 Py_ssize_t size,
6335 const char *errors,
6336 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006339 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006341 PyObject *errorHandler = NULL;
6342 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006343
Eric V. Smith42454af2016-10-31 09:22:08 -04006344 // so we can remember if we've seen an invalid escape char or not
6345 *first_invalid_escape = NULL;
6346
Victor Stinner62ec3312016-09-06 17:04:34 -07006347 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006348 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006349 }
6350 /* Escaped strings will always be longer than the resulting
6351 Unicode string, so we start with size here and then reduce the
6352 length after conversion to the true value.
6353 (but if the error callback returns a long replacement string
6354 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006355 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006356 writer.min_length = size;
6357 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6358 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006359 }
6360
Guido van Rossumd57fd912000-03-10 22:53:23 +00006361 end = s + size;
6362 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006363 unsigned char c = (unsigned char) *s++;
6364 Py_UCS4 ch;
6365 int count;
6366 Py_ssize_t startinpos;
6367 Py_ssize_t endinpos;
6368 const char *message;
6369
6370#define WRITE_ASCII_CHAR(ch) \
6371 do { \
6372 assert(ch <= 127); \
6373 assert(writer.pos < writer.size); \
6374 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6375 } while(0)
6376
6377#define WRITE_CHAR(ch) \
6378 do { \
6379 if (ch <= writer.maxchar) { \
6380 assert(writer.pos < writer.size); \
6381 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6382 } \
6383 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6384 goto onError; \
6385 } \
6386 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387
6388 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006389 if (c != '\\') {
6390 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 continue;
6392 }
6393
Victor Stinner62ec3312016-09-06 17:04:34 -07006394 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006396 if (s >= end) {
6397 message = "\\ at end of string";
6398 goto error;
6399 }
6400 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006401
Victor Stinner62ec3312016-09-06 17:04:34 -07006402 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006403 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006406 case '\n': continue;
6407 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6408 case '\'': WRITE_ASCII_CHAR('\''); continue;
6409 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6410 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006411 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6413 case 't': WRITE_ASCII_CHAR('\t'); continue;
6414 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6415 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006416 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006417 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006418 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 case '0': case '1': case '2': case '3':
6423 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006425 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 ch = (ch<<3) + *s++ - '0';
6427 if (s < end && '0' <= *s && *s <= '7') {
6428 ch = (ch<<3) + *s++ - '0';
6429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006431 WRITE_CHAR(ch);
6432 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 /* hex escapes */
6435 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006437 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006438 message = "truncated \\xXX escape";
6439 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006443 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006444 message = "truncated \\uXXXX escape";
6445 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006448 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006449 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006450 message = "truncated \\UXXXXXXXX escape";
6451 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006452 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006453 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006454 ch <<= 4;
6455 if (c >= '0' && c <= '9') {
6456 ch += c - '0';
6457 }
6458 else if (c >= 'a' && c <= 'f') {
6459 ch += c - ('a' - 10);
6460 }
6461 else if (c >= 'A' && c <= 'F') {
6462 ch += c - ('A' - 10);
6463 }
6464 else {
6465 break;
6466 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006467 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006468 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006469 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006470 }
6471
6472 /* when we get here, ch is a 32-bit unicode character */
6473 if (ch > MAX_UNICODE) {
6474 message = "illegal Unicode character";
6475 goto error;
6476 }
6477
6478 WRITE_CHAR(ch);
6479 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006480
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006482 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006483 if (ucnhash_CAPI == NULL) {
6484 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006485 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6486 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006487 if (ucnhash_CAPI == NULL) {
6488 PyErr_SetString(
6489 PyExc_UnicodeError,
6490 "\\N escapes not supported (can't load unicodedata module)"
6491 );
6492 goto onError;
6493 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006494 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006495
6496 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006497 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006498 const char *start = ++s;
6499 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006500 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006501 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006502 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006503 namelen = s - start;
6504 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006505 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006506 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006507 ch = 0xffffffff; /* in case 'getcode' messes up */
6508 if (namelen <= INT_MAX &&
6509 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6510 &ch, 0)) {
6511 assert(ch <= MAX_UNICODE);
6512 WRITE_CHAR(ch);
6513 continue;
6514 }
6515 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006516 }
6517 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006518 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006519
6520 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006521 if (*first_invalid_escape == NULL) {
6522 *first_invalid_escape = s-1; /* Back up one char, since we've
6523 already incremented s. */
6524 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006525 WRITE_ASCII_CHAR('\\');
6526 WRITE_CHAR(c);
6527 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006529
6530 error:
6531 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006532 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006533 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006534 errors, &errorHandler,
6535 "unicodeescape", message,
6536 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006537 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006538 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006539 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006540 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006541
6542#undef WRITE_ASCII_CHAR
6543#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006545
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006546 Py_XDECREF(errorHandler);
6547 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006548 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006549
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006551 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006552 Py_XDECREF(errorHandler);
6553 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 return NULL;
6555}
6556
Eric V. Smith42454af2016-10-31 09:22:08 -04006557PyObject *
6558PyUnicode_DecodeUnicodeEscape(const char *s,
6559 Py_ssize_t size,
6560 const char *errors)
6561{
6562 const char *first_invalid_escape;
6563 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6564 &first_invalid_escape);
6565 if (result == NULL)
6566 return NULL;
6567 if (first_invalid_escape != NULL) {
6568 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6569 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006570 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006571 Py_DECREF(result);
6572 return NULL;
6573 }
6574 }
6575 return result;
6576}
6577
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006578/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579
Alexander Belopolsky40018472011-02-26 01:02:56 +00006580PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006581PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006583 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006584 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006586 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006587 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006588 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589
Ezio Melottie7f90372012-10-05 03:33:31 +03006590 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006591 escape.
6592
Ezio Melottie7f90372012-10-05 03:33:31 +03006593 For UCS1 strings it's '\xxx', 4 bytes per source character.
6594 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6595 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006596 */
6597
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006598 if (!PyUnicode_Check(unicode)) {
6599 PyErr_BadArgument();
6600 return NULL;
6601 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006602 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006603 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006604 }
Victor Stinner358af132015-10-12 22:36:57 +02006605
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006606 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006607 if (len == 0) {
6608 return PyBytes_FromStringAndSize(NULL, 0);
6609 }
6610
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006611 kind = PyUnicode_KIND(unicode);
6612 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006613 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6614 bytes, and 1 byte characters 4. */
6615 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006616 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006617 return PyErr_NoMemory();
6618 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006619 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006620 if (repr == NULL) {
6621 return NULL;
6622 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006623
Victor Stinner62ec3312016-09-06 17:04:34 -07006624 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006625 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006626 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006627
Victor Stinner62ec3312016-09-06 17:04:34 -07006628 /* U+0000-U+00ff range */
6629 if (ch < 0x100) {
6630 if (ch >= ' ' && ch < 127) {
6631 if (ch != '\\') {
6632 /* Copy printable US ASCII as-is */
6633 *p++ = (char) ch;
6634 }
6635 /* Escape backslashes */
6636 else {
6637 *p++ = '\\';
6638 *p++ = '\\';
6639 }
6640 }
Victor Stinner358af132015-10-12 22:36:57 +02006641
Victor Stinner62ec3312016-09-06 17:04:34 -07006642 /* Map special whitespace to '\t', \n', '\r' */
6643 else if (ch == '\t') {
6644 *p++ = '\\';
6645 *p++ = 't';
6646 }
6647 else if (ch == '\n') {
6648 *p++ = '\\';
6649 *p++ = 'n';
6650 }
6651 else if (ch == '\r') {
6652 *p++ = '\\';
6653 *p++ = 'r';
6654 }
6655
6656 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6657 else {
6658 *p++ = '\\';
6659 *p++ = 'x';
6660 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6661 *p++ = Py_hexdigits[ch & 0x000F];
6662 }
Tim Petersced69f82003-09-16 20:30:58 +00006663 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006664 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006665 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 *p++ = '\\';
6667 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006668 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6669 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6670 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6671 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006673 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6674 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006675
Victor Stinner62ec3312016-09-06 17:04:34 -07006676 /* Make sure that the first two digits are zero */
6677 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006678 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006679 *p++ = 'U';
6680 *p++ = '0';
6681 *p++ = '0';
6682 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6683 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6684 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6685 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6686 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6687 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
Victor Stinner62ec3312016-09-06 17:04:34 -07006691 assert(p - PyBytes_AS_STRING(repr) > 0);
6692 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6693 return NULL;
6694 }
6695 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696}
6697
Alexander Belopolsky40018472011-02-26 01:02:56 +00006698PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006699PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6700 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006702 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006703 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006704 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006706 }
6707
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006708 result = PyUnicode_AsUnicodeEscapeString(tmp);
6709 Py_DECREF(tmp);
6710 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711}
6712
6713/* --- Raw Unicode Escape Codec ------------------------------------------- */
6714
Alexander Belopolsky40018472011-02-26 01:02:56 +00006715PyObject *
6716PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006717 Py_ssize_t size,
6718 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006720 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006721 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 PyObject *errorHandler = NULL;
6724 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006725
Victor Stinner62ec3312016-09-06 17:04:34 -07006726 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006727 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006728 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006729
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 /* Escaped strings will always be longer than the resulting
6731 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732 length after conversion to the true value. (But decoding error
6733 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006734 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006735 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006736 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6737 goto onError;
6738 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006739
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740 end = s + size;
6741 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006742 unsigned char c = (unsigned char) *s++;
6743 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006744 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006745 Py_ssize_t startinpos;
6746 Py_ssize_t endinpos;
6747 const char *message;
6748
6749#define WRITE_CHAR(ch) \
6750 do { \
6751 if (ch <= writer.maxchar) { \
6752 assert(writer.pos < writer.size); \
6753 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6754 } \
6755 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6756 goto onError; \
6757 } \
6758 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006761 if (c != '\\' || s >= end) {
6762 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006764 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006765
Victor Stinner62ec3312016-09-06 17:04:34 -07006766 c = (unsigned char) *s++;
6767 if (c == 'u') {
6768 count = 4;
6769 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006771 else if (c == 'U') {
6772 count = 8;
6773 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006774 }
6775 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006776 assert(writer.pos < writer.size);
6777 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6778 WRITE_CHAR(c);
6779 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006780 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006781 startinpos = s - starts - 2;
6782
6783 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6784 for (ch = 0; count && s < end; ++s, --count) {
6785 c = (unsigned char)*s;
6786 ch <<= 4;
6787 if (c >= '0' && c <= '9') {
6788 ch += c - '0';
6789 }
6790 else if (c >= 'a' && c <= 'f') {
6791 ch += c - ('a' - 10);
6792 }
6793 else if (c >= 'A' && c <= 'F') {
6794 ch += c - ('A' - 10);
6795 }
6796 else {
6797 break;
6798 }
6799 }
6800 if (!count) {
6801 if (ch <= MAX_UNICODE) {
6802 WRITE_CHAR(ch);
6803 continue;
6804 }
6805 message = "\\Uxxxxxxxx out of range";
6806 }
6807
6808 endinpos = s-starts;
6809 writer.min_length = end - s + writer.pos;
6810 if (unicode_decode_call_errorhandler_writer(
6811 errors, &errorHandler,
6812 "rawunicodeescape", message,
6813 &starts, &end, &startinpos, &endinpos, &exc, &s,
6814 &writer)) {
6815 goto onError;
6816 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006817 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006818
6819#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006821 Py_XDECREF(errorHandler);
6822 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006823 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006824
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006826 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006827 Py_XDECREF(errorHandler);
6828 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006830
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831}
6832
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006833
Alexander Belopolsky40018472011-02-26 01:02:56 +00006834PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006835PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836{
Victor Stinner62ec3312016-09-06 17:04:34 -07006837 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006839 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006840 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006841 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006842 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006844 if (!PyUnicode_Check(unicode)) {
6845 PyErr_BadArgument();
6846 return NULL;
6847 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006848 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006849 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006850 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006851 kind = PyUnicode_KIND(unicode);
6852 data = PyUnicode_DATA(unicode);
6853 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006854 if (kind == PyUnicode_1BYTE_KIND) {
6855 return PyBytes_FromStringAndSize(data, len);
6856 }
Victor Stinner0e368262011-11-10 20:12:49 +01006857
Victor Stinner62ec3312016-09-06 17:04:34 -07006858 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6859 bytes, and 1 byte characters 4. */
6860 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006861
Victor Stinner62ec3312016-09-06 17:04:34 -07006862 if (len > PY_SSIZE_T_MAX / expandsize) {
6863 return PyErr_NoMemory();
6864 }
6865 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6866 if (repr == NULL) {
6867 return NULL;
6868 }
6869 if (len == 0) {
6870 return repr;
6871 }
6872
6873 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006874 for (pos = 0; pos < len; pos++) {
6875 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006876
Victor Stinner62ec3312016-09-06 17:04:34 -07006877 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6878 if (ch < 0x100) {
6879 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006880 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006881 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006882 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 *p++ = '\\';
6884 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006885 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6886 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6887 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6888 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006890 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6891 else {
6892 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6893 *p++ = '\\';
6894 *p++ = 'U';
6895 *p++ = '0';
6896 *p++ = '0';
6897 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6898 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6899 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6900 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6901 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6902 *p++ = Py_hexdigits[ch & 15];
6903 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006905
Victor Stinner62ec3312016-09-06 17:04:34 -07006906 assert(p > PyBytes_AS_STRING(repr));
6907 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6908 return NULL;
6909 }
6910 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911}
6912
Alexander Belopolsky40018472011-02-26 01:02:56 +00006913PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006914PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6915 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006917 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006918 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006919 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006920 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006921 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6922 Py_DECREF(tmp);
6923 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924}
6925
6926/* --- Latin-1 Codec ------------------------------------------------------ */
6927
Alexander Belopolsky40018472011-02-26 01:02:56 +00006928PyObject *
6929PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006930 Py_ssize_t size,
6931 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006934 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935}
6936
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006937/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006938static void
6939make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006940 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006941 PyObject *unicode,
6942 Py_ssize_t startpos, Py_ssize_t endpos,
6943 const char *reason)
6944{
6945 if (*exceptionObject == NULL) {
6946 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006947 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006948 encoding, unicode, startpos, endpos, reason);
6949 }
6950 else {
6951 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6952 goto onError;
6953 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6954 goto onError;
6955 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6956 goto onError;
6957 return;
6958 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006959 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006960 }
6961}
6962
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006963/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006964static void
6965raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006966 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006967 PyObject *unicode,
6968 Py_ssize_t startpos, Py_ssize_t endpos,
6969 const char *reason)
6970{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006971 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006972 encoding, unicode, startpos, endpos, reason);
6973 if (*exceptionObject != NULL)
6974 PyCodec_StrictErrors(*exceptionObject);
6975}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006976
6977/* error handling callback helper:
6978 build arguments, call the callback and check the arguments,
6979 put the result into newpos and return the replacement string, which
6980 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006981static PyObject *
6982unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006983 PyObject **errorHandler,
6984 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006985 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006986 Py_ssize_t startpos, Py_ssize_t endpos,
6987 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006988{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006989 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006990 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006991 PyObject *restuple;
6992 PyObject *resunicode;
6993
6994 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006995 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006996 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006998 }
6999
Benjamin Petersonbac79492012-01-14 13:34:47 -05007000 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007001 return NULL;
7002 len = PyUnicode_GET_LENGTH(unicode);
7003
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007004 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007005 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007006 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007008
Petr Viktorinffd97532020-02-11 17:46:57 +01007009 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007010 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007012 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007013 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 Py_DECREF(restuple);
7015 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007016 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007017 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 &resunicode, newpos)) {
7019 Py_DECREF(restuple);
7020 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007021 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007022 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7023 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7024 Py_DECREF(restuple);
7025 return NULL;
7026 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007027 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007028 *newpos = len + *newpos;
7029 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007030 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007031 Py_DECREF(restuple);
7032 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007033 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007034 Py_INCREF(resunicode);
7035 Py_DECREF(restuple);
7036 return resunicode;
7037}
7038
Alexander Belopolsky40018472011-02-26 01:02:56 +00007039static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007040unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007041 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007042 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007043{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007044 /* input state */
7045 Py_ssize_t pos=0, size;
7046 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007047 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007048 /* pointer into the output */
7049 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007050 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7051 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007052 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007053 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007054 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007055 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007056 /* output object */
7057 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007058
Benjamin Petersonbac79492012-01-14 13:34:47 -05007059 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007060 return NULL;
7061 size = PyUnicode_GET_LENGTH(unicode);
7062 kind = PyUnicode_KIND(unicode);
7063 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007064 /* allocate enough for a simple encoding without
7065 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007066 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007067 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007068
7069 _PyBytesWriter_Init(&writer);
7070 str = _PyBytesWriter_Alloc(&writer, size);
7071 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007072 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007073
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007074 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007075 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007076
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007078 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007079 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007080 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007081 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007082 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007084 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007086 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007087 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007089
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007090 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007092
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007093 /* Only overallocate the buffer if it's not the last write */
7094 writer.overallocate = (collend < size);
7095
Benjamin Peterson29060642009-01-31 22:14:21 +00007096 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007097 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007098 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007099
7100 switch (error_handler) {
7101 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007102 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007103 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007104
7105 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007106 memset(str, '?', collend - collstart);
7107 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007108 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007109 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007110 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007111 break;
Victor Stinner50149202015-09-22 00:26:54 +02007112
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007113 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007114 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007115 writer.min_size -= (collend - collstart);
7116 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007117 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007118 if (str == NULL)
7119 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007120 pos = collend;
7121 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007122
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007123 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007124 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007125 writer.min_size -= (collend - collstart);
7126 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007127 unicode, collstart, collend);
7128 if (str == NULL)
7129 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007130 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 break;
Victor Stinner50149202015-09-22 00:26:54 +02007132
Victor Stinnerc3713e92015-09-29 12:32:13 +02007133 case _Py_ERROR_SURROGATEESCAPE:
7134 for (i = collstart; i < collend; ++i) {
7135 ch = PyUnicode_READ(kind, data, i);
7136 if (ch < 0xdc80 || 0xdcff < ch) {
7137 /* Not a UTF-8b surrogate */
7138 break;
7139 }
7140 *str++ = (char)(ch - 0xdc00);
7141 ++pos;
7142 }
7143 if (i >= collend)
7144 break;
7145 collstart = pos;
7146 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007147 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007148
Benjamin Peterson29060642009-01-31 22:14:21 +00007149 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007150 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7151 encoding, reason, unicode, &exc,
7152 collstart, collend, &newpos);
7153 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007155
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007156 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007157 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007158
Victor Stinner6bd525b2015-10-09 13:10:05 +02007159 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007160 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007161 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007162 PyBytes_AS_STRING(rep),
7163 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007164 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007165 else {
7166 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007167
Victor Stinner6bd525b2015-10-09 13:10:05 +02007168 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007170
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007171 if (limit == 256 ?
7172 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7173 !PyUnicode_IS_ASCII(rep))
7174 {
7175 /* Not all characters are smaller than limit */
7176 raise_encode_exception(&exc, encoding, unicode,
7177 collstart, collend, reason);
7178 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007179 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007180 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7181 str = _PyBytesWriter_WriteBytes(&writer, str,
7182 PyUnicode_DATA(rep),
7183 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007185 if (str == NULL)
7186 goto onError;
7187
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007188 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007189 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007190 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007191
7192 /* If overallocation was disabled, ensure that it was the last
7193 write. Otherwise, we missed an optimization */
7194 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007195 }
7196 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007197
Victor Stinner50149202015-09-22 00:26:54 +02007198 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007200 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007201
7202 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007203 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007204 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007205 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007206 Py_XDECREF(exc);
7207 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007208}
7209
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007210/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007211PyObject *
7212PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007213 Py_ssize_t size,
7214 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007216 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007217 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007218 if (unicode == NULL)
7219 return NULL;
7220 result = unicode_encode_ucs1(unicode, errors, 256);
7221 Py_DECREF(unicode);
7222 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223}
7224
Alexander Belopolsky40018472011-02-26 01:02:56 +00007225PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007226_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227{
7228 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 PyErr_BadArgument();
7230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007232 if (PyUnicode_READY(unicode) == -1)
7233 return NULL;
7234 /* Fast path: if it is a one-byte string, construct
7235 bytes object directly. */
7236 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7237 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7238 PyUnicode_GET_LENGTH(unicode));
7239 /* Non-Latin-1 characters present. Defer to above function to
7240 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007241 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007242}
7243
7244PyObject*
7245PyUnicode_AsLatin1String(PyObject *unicode)
7246{
7247 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248}
7249
7250/* --- 7-bit ASCII Codec -------------------------------------------------- */
7251
Alexander Belopolsky40018472011-02-26 01:02:56 +00007252PyObject *
7253PyUnicode_DecodeASCII(const char *s,
7254 Py_ssize_t size,
7255 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007257 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007258 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007259 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007260 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007261 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007262
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007264 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007265
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007267 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007268 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007270
Inada Naoki770847a2019-06-24 12:30:24 +09007271 // Shortcut for simple case
7272 PyObject *u = PyUnicode_New(size, 127);
7273 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007274 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007275 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007276 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007277 if (outpos == size) {
7278 return u;
7279 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007280
Inada Naoki770847a2019-06-24 12:30:24 +09007281 _PyUnicodeWriter writer;
7282 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007283 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007284
Inada Naoki770847a2019-06-24 12:30:24 +09007285 s += outpos;
7286 int kind = writer.kind;
7287 void *data = writer.data;
7288 Py_ssize_t startinpos, endinpos;
7289
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007290 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007291 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007292 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007293 PyUnicode_WRITE(kind, data, writer.pos, c);
7294 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007296 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007298
7299 /* byte outsize range 0x00..0x7f: call the error handler */
7300
7301 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007302 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007303
7304 switch (error_handler)
7305 {
7306 case _Py_ERROR_REPLACE:
7307 case _Py_ERROR_SURROGATEESCAPE:
7308 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007309 but we may switch to UCS2 at the first write */
7310 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7311 goto onError;
7312 kind = writer.kind;
7313 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007314
7315 if (error_handler == _Py_ERROR_REPLACE)
7316 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7317 else
7318 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7319 writer.pos++;
7320 ++s;
7321 break;
7322
7323 case _Py_ERROR_IGNORE:
7324 ++s;
7325 break;
7326
7327 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 startinpos = s-starts;
7329 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007330 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007331 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 "ascii", "ordinal not in range(128)",
7333 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007334 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007336 kind = writer.kind;
7337 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007340 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007341 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007342 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007343
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007345 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007346 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007347 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348 return NULL;
7349}
7350
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007351/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007352PyObject *
7353PyUnicode_EncodeASCII(const Py_UNICODE *p,
7354 Py_ssize_t size,
7355 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007357 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007358 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007359 if (unicode == NULL)
7360 return NULL;
7361 result = unicode_encode_ucs1(unicode, errors, 128);
7362 Py_DECREF(unicode);
7363 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364}
7365
Alexander Belopolsky40018472011-02-26 01:02:56 +00007366PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007367_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368{
7369 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 PyErr_BadArgument();
7371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007373 if (PyUnicode_READY(unicode) == -1)
7374 return NULL;
7375 /* Fast path: if it is an ASCII-only string, construct bytes object
7376 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007377 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007378 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7379 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007380 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007381}
7382
7383PyObject *
7384PyUnicode_AsASCIIString(PyObject *unicode)
7385{
7386 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387}
7388
Steve Dowercc16be82016-09-08 10:35:16 -07007389#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007390
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007391/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007392
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007393#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007394#define NEED_RETRY
7395#endif
7396
Steve Dower7ebdda02019-08-21 16:22:33 -07007397/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7398 transcoding from UTF-16), but INT_MAX / 4 performs better in
7399 both cases also and avoids partial characters overrunning the
7400 length limit in MultiByteToWideChar on Windows */
7401#define DECODING_CHUNK_SIZE (INT_MAX/4)
7402
Victor Stinner3a50e702011-10-18 21:21:00 +02007403#ifndef WC_ERR_INVALID_CHARS
7404# define WC_ERR_INVALID_CHARS 0x0080
7405#endif
7406
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007407static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007408code_page_name(UINT code_page, PyObject **obj)
7409{
7410 *obj = NULL;
7411 if (code_page == CP_ACP)
7412 return "mbcs";
7413 if (code_page == CP_UTF7)
7414 return "CP_UTF7";
7415 if (code_page == CP_UTF8)
7416 return "CP_UTF8";
7417
7418 *obj = PyBytes_FromFormat("cp%u", code_page);
7419 if (*obj == NULL)
7420 return NULL;
7421 return PyBytes_AS_STRING(*obj);
7422}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007423
Victor Stinner3a50e702011-10-18 21:21:00 +02007424static DWORD
7425decode_code_page_flags(UINT code_page)
7426{
7427 if (code_page == CP_UTF7) {
7428 /* The CP_UTF7 decoder only supports flags=0 */
7429 return 0;
7430 }
7431 else
7432 return MB_ERR_INVALID_CHARS;
7433}
7434
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007435/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 * Decode a byte string from a Windows code page into unicode object in strict
7437 * mode.
7438 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007439 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7440 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007441 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007442static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007443decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007444 wchar_t **buf,
7445 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007446 const char *in,
7447 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007449 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007450 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007451 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007452
7453 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007455 while ((outsize = MultiByteToWideChar(code_page, flags,
7456 in, insize, NULL, 0)) <= 0)
7457 {
7458 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7459 goto error;
7460 }
7461 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7462 flags = 0;
7463 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007464
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007465 /* Extend a wchar_t* buffer */
7466 Py_ssize_t n = *bufsize; /* Get the current length */
7467 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7468 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007469 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007470 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007471
7472 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7474 if (outsize <= 0)
7475 goto error;
7476 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007477
Victor Stinner3a50e702011-10-18 21:21:00 +02007478error:
7479 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7480 return -2;
7481 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007482 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007483}
7484
Victor Stinner3a50e702011-10-18 21:21:00 +02007485/*
7486 * Decode a byte string from a code page into unicode object with an error
7487 * handler.
7488 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007489 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 * UnicodeDecodeError exception and returns -1 on error.
7491 */
7492static int
7493decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007494 wchar_t **buf,
7495 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007496 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007497 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007498{
7499 const char *startin = in;
7500 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007501 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 /* Ideally, we should get reason from FormatMessage. This is the Windows
7503 2000 English version of the message. */
7504 const char *reason = "No mapping for the Unicode character exists "
7505 "in the target code page.";
7506 /* each step cannot decode more than 1 character, but a character can be
7507 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007508 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007509 int insize;
7510 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 PyObject *errorHandler = NULL;
7512 PyObject *exc = NULL;
7513 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007514 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 DWORD err;
7516 int ret = -1;
7517
7518 assert(size > 0);
7519
7520 encoding = code_page_name(code_page, &encoding_obj);
7521 if (encoding == NULL)
7522 return -1;
7523
Victor Stinner7d00cc12014-03-17 23:08:06 +01007524 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7526 UnicodeDecodeError. */
7527 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7528 if (exc != NULL) {
7529 PyCodec_StrictErrors(exc);
7530 Py_CLEAR(exc);
7531 }
7532 goto error;
7533 }
7534
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007535 /* Extend a wchar_t* buffer */
7536 Py_ssize_t n = *bufsize; /* Get the current length */
7537 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7538 PyErr_NoMemory();
7539 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007540 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007541 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7542 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007543 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007544 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007545
7546 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 while (in < endin)
7548 {
7549 /* Decode a character */
7550 insize = 1;
7551 do
7552 {
7553 outsize = MultiByteToWideChar(code_page, flags,
7554 in, insize,
7555 buffer, Py_ARRAY_LENGTH(buffer));
7556 if (outsize > 0)
7557 break;
7558 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007559 if (err == ERROR_INVALID_FLAGS && flags) {
7560 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7561 flags = 0;
7562 continue;
7563 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007564 if (err != ERROR_NO_UNICODE_TRANSLATION
7565 && err != ERROR_INSUFFICIENT_BUFFER)
7566 {
7567 PyErr_SetFromWindowsErr(0);
7568 goto error;
7569 }
7570 insize++;
7571 }
7572 /* 4=maximum length of a UTF-8 sequence */
7573 while (insize <= 4 && (in + insize) <= endin);
7574
7575 if (outsize <= 0) {
7576 Py_ssize_t startinpos, endinpos, outpos;
7577
Victor Stinner7d00cc12014-03-17 23:08:06 +01007578 /* last character in partial decode? */
7579 if (in + insize >= endin && !final)
7580 break;
7581
Victor Stinner3a50e702011-10-18 21:21:00 +02007582 startinpos = in - startin;
7583 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007584 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007585 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 errors, &errorHandler,
7587 encoding, reason,
7588 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007589 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007590 {
7591 goto error;
7592 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007593 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 }
7595 else {
7596 in += insize;
7597 memcpy(out, buffer, outsize * sizeof(wchar_t));
7598 out += outsize;
7599 }
7600 }
7601
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007602 /* Shrink the buffer */
7603 assert(out - *buf <= *bufsize);
7604 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007605 /* (in - startin) <= size and size is an int */
7606 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007607
7608error:
7609 Py_XDECREF(encoding_obj);
7610 Py_XDECREF(errorHandler);
7611 Py_XDECREF(exc);
7612 return ret;
7613}
7614
Victor Stinner3a50e702011-10-18 21:21:00 +02007615static PyObject *
7616decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007617 const char *s, Py_ssize_t size,
7618 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007619{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007620 wchar_t *buf = NULL;
7621 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007622 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007623
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 if (code_page < 0) {
7625 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7626 return NULL;
7627 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007628 if (size < 0) {
7629 PyErr_BadInternalCall();
7630 return NULL;
7631 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007632
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007633 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007635
Victor Stinner76a31a62011-11-04 00:05:13 +01007636 do
7637 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007638#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007639 if (size > DECODING_CHUNK_SIZE) {
7640 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007641 final = 0;
7642 done = 0;
7643 }
7644 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007645#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007646 {
7647 chunk_size = (int)size;
7648 final = (consumed == NULL);
7649 done = 1;
7650 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007651
Victor Stinner76a31a62011-11-04 00:05:13 +01007652 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007653 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007654 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007655 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007656 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007657
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007658 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007659 s, chunk_size);
7660 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007661 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007662 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007663 errors, final);
7664 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007665
7666 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007667 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007668 return NULL;
7669 }
7670
7671 if (consumed)
7672 *consumed += converted;
7673
7674 s += converted;
7675 size -= converted;
7676 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007677
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007678 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7679 PyMem_Free(buf);
7680 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007681}
7682
Alexander Belopolsky40018472011-02-26 01:02:56 +00007683PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007684PyUnicode_DecodeCodePageStateful(int code_page,
7685 const char *s,
7686 Py_ssize_t size,
7687 const char *errors,
7688 Py_ssize_t *consumed)
7689{
7690 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7691}
7692
7693PyObject *
7694PyUnicode_DecodeMBCSStateful(const char *s,
7695 Py_ssize_t size,
7696 const char *errors,
7697 Py_ssize_t *consumed)
7698{
7699 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7700}
7701
7702PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007703PyUnicode_DecodeMBCS(const char *s,
7704 Py_ssize_t size,
7705 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007706{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007707 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7708}
7709
Victor Stinner3a50e702011-10-18 21:21:00 +02007710static DWORD
7711encode_code_page_flags(UINT code_page, const char *errors)
7712{
7713 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007714 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007715 }
7716 else if (code_page == CP_UTF7) {
7717 /* CP_UTF7 only supports flags=0 */
7718 return 0;
7719 }
7720 else {
7721 if (errors != NULL && strcmp(errors, "replace") == 0)
7722 return 0;
7723 else
7724 return WC_NO_BEST_FIT_CHARS;
7725 }
7726}
7727
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007728/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007729 * Encode a Unicode string to a Windows code page into a byte string in strict
7730 * mode.
7731 *
7732 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007733 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007734 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007735static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007736encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007737 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007738 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007739{
Victor Stinner554f3f02010-06-16 23:33:54 +00007740 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007741 BOOL *pusedDefaultChar = &usedDefaultChar;
7742 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007743 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007744 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007745 const DWORD flags = encode_code_page_flags(code_page, NULL);
7746 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007747 /* Create a substring so that we can get the UTF-16 representation
7748 of just the slice under consideration. */
7749 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007750
Martin v. Löwis3d325192011-11-04 18:23:06 +01007751 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007752
Victor Stinner3a50e702011-10-18 21:21:00 +02007753 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007754 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007755 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007756 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007757
Victor Stinner2fc507f2011-11-04 20:06:39 +01007758 substring = PyUnicode_Substring(unicode, offset, offset+len);
7759 if (substring == NULL)
7760 return -1;
7761 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7762 if (p == NULL) {
7763 Py_DECREF(substring);
7764 return -1;
7765 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007766 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007767
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007768 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007769 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007770 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007771 NULL, 0,
7772 NULL, pusedDefaultChar);
7773 if (outsize <= 0)
7774 goto error;
7775 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007776 if (pusedDefaultChar && *pusedDefaultChar) {
7777 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007778 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007779 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007780
Victor Stinner3a50e702011-10-18 21:21:00 +02007781 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007783 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007784 if (*outbytes == NULL) {
7785 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007787 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007788 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007789 }
7790 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007792 const Py_ssize_t n = PyBytes_Size(*outbytes);
7793 if (outsize > PY_SSIZE_T_MAX - n) {
7794 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007795 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007797 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007798 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7799 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007800 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007801 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007802 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007803 }
7804
7805 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007806 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007807 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007808 out, outsize,
7809 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007810 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007811 if (outsize <= 0)
7812 goto error;
7813 if (pusedDefaultChar && *pusedDefaultChar)
7814 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007815 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007816
Victor Stinner3a50e702011-10-18 21:21:00 +02007817error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007818 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007819 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7820 return -2;
7821 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007822 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007823}
7824
Victor Stinner3a50e702011-10-18 21:21:00 +02007825/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007826 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007827 * error handler.
7828 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007829 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007830 * -1 on other error.
7831 */
7832static int
7833encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007834 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007835 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007836{
Victor Stinner3a50e702011-10-18 21:21:00 +02007837 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007838 Py_ssize_t pos = unicode_offset;
7839 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007840 /* Ideally, we should get reason from FormatMessage. This is the Windows
7841 2000 English version of the message. */
7842 const char *reason = "invalid character";
7843 /* 4=maximum length of a UTF-8 sequence */
7844 char buffer[4];
7845 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7846 Py_ssize_t outsize;
7847 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007848 PyObject *errorHandler = NULL;
7849 PyObject *exc = NULL;
7850 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007851 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007852 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007853 PyObject *rep;
7854 int ret = -1;
7855
7856 assert(insize > 0);
7857
7858 encoding = code_page_name(code_page, &encoding_obj);
7859 if (encoding == NULL)
7860 return -1;
7861
7862 if (errors == NULL || strcmp(errors, "strict") == 0) {
7863 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7864 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007865 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007866 if (exc != NULL) {
7867 PyCodec_StrictErrors(exc);
7868 Py_DECREF(exc);
7869 }
7870 Py_XDECREF(encoding_obj);
7871 return -1;
7872 }
7873
7874 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7875 pusedDefaultChar = &usedDefaultChar;
7876 else
7877 pusedDefaultChar = NULL;
7878
7879 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7880 PyErr_NoMemory();
7881 goto error;
7882 }
7883 outsize = insize * Py_ARRAY_LENGTH(buffer);
7884
7885 if (*outbytes == NULL) {
7886 /* Create string object */
7887 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7888 if (*outbytes == NULL)
7889 goto error;
7890 out = PyBytes_AS_STRING(*outbytes);
7891 }
7892 else {
7893 /* Extend string object */
7894 Py_ssize_t n = PyBytes_Size(*outbytes);
7895 if (n > PY_SSIZE_T_MAX - outsize) {
7896 PyErr_NoMemory();
7897 goto error;
7898 }
7899 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7900 goto error;
7901 out = PyBytes_AS_STRING(*outbytes) + n;
7902 }
7903
7904 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007905 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007906 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007907 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7908 wchar_t chars[2];
7909 int charsize;
7910 if (ch < 0x10000) {
7911 chars[0] = (wchar_t)ch;
7912 charsize = 1;
7913 }
7914 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007915 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7916 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007917 charsize = 2;
7918 }
7919
Victor Stinner3a50e702011-10-18 21:21:00 +02007920 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007921 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007922 buffer, Py_ARRAY_LENGTH(buffer),
7923 NULL, pusedDefaultChar);
7924 if (outsize > 0) {
7925 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7926 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007927 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007928 memcpy(out, buffer, outsize);
7929 out += outsize;
7930 continue;
7931 }
7932 }
7933 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7934 PyErr_SetFromWindowsErr(0);
7935 goto error;
7936 }
7937
Victor Stinner3a50e702011-10-18 21:21:00 +02007938 rep = unicode_encode_call_errorhandler(
7939 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007940 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007941 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007942 if (rep == NULL)
7943 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007944 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007945
7946 if (PyBytes_Check(rep)) {
7947 outsize = PyBytes_GET_SIZE(rep);
7948 if (outsize != 1) {
7949 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7950 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7951 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7952 Py_DECREF(rep);
7953 goto error;
7954 }
7955 out = PyBytes_AS_STRING(*outbytes) + offset;
7956 }
7957 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7958 out += outsize;
7959 }
7960 else {
7961 Py_ssize_t i;
7962 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007963 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007964
Benjamin Petersonbac79492012-01-14 13:34:47 -05007965 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007966 Py_DECREF(rep);
7967 goto error;
7968 }
7969
7970 outsize = PyUnicode_GET_LENGTH(rep);
7971 if (outsize != 1) {
7972 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7973 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7974 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7975 Py_DECREF(rep);
7976 goto error;
7977 }
7978 out = PyBytes_AS_STRING(*outbytes) + offset;
7979 }
7980 kind = PyUnicode_KIND(rep);
7981 data = PyUnicode_DATA(rep);
7982 for (i=0; i < outsize; i++) {
7983 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7984 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007985 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007986 encoding, unicode,
7987 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007988 "unable to encode error handler result to ASCII");
7989 Py_DECREF(rep);
7990 goto error;
7991 }
7992 *out = (unsigned char)ch;
7993 out++;
7994 }
7995 }
7996 Py_DECREF(rep);
7997 }
7998 /* write a NUL byte */
7999 *out = 0;
8000 outsize = out - PyBytes_AS_STRING(*outbytes);
8001 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8002 if (_PyBytes_Resize(outbytes, outsize) < 0)
8003 goto error;
8004 ret = 0;
8005
8006error:
8007 Py_XDECREF(encoding_obj);
8008 Py_XDECREF(errorHandler);
8009 Py_XDECREF(exc);
8010 return ret;
8011}
8012
Victor Stinner3a50e702011-10-18 21:21:00 +02008013static PyObject *
8014encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008015 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008016 const char *errors)
8017{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008018 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008019 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008020 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008021 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008022
Victor Stinner29dacf22015-01-26 16:41:32 +01008023 if (!PyUnicode_Check(unicode)) {
8024 PyErr_BadArgument();
8025 return NULL;
8026 }
8027
Benjamin Petersonbac79492012-01-14 13:34:47 -05008028 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008029 return NULL;
8030 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008031
Victor Stinner3a50e702011-10-18 21:21:00 +02008032 if (code_page < 0) {
8033 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8034 return NULL;
8035 }
8036
Martin v. Löwis3d325192011-11-04 18:23:06 +01008037 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008038 return PyBytes_FromStringAndSize(NULL, 0);
8039
Victor Stinner7581cef2011-11-03 22:32:33 +01008040 offset = 0;
8041 do
8042 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008043#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008044 if (len > DECODING_CHUNK_SIZE) {
8045 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008046 done = 0;
8047 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008048 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008049#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008050 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008051 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008052 done = 1;
8053 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008054
Victor Stinner76a31a62011-11-04 00:05:13 +01008055 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008056 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008057 errors);
8058 if (ret == -2)
8059 ret = encode_code_page_errors(code_page, &outbytes,
8060 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008061 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008062 if (ret < 0) {
8063 Py_XDECREF(outbytes);
8064 return NULL;
8065 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008066
Victor Stinner7581cef2011-11-03 22:32:33 +01008067 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008068 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008069 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008070
Victor Stinner3a50e702011-10-18 21:21:00 +02008071 return outbytes;
8072}
8073
8074PyObject *
8075PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8076 Py_ssize_t size,
8077 const char *errors)
8078{
Victor Stinner7581cef2011-11-03 22:32:33 +01008079 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008080 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008081 if (unicode == NULL)
8082 return NULL;
8083 res = encode_code_page(CP_ACP, unicode, errors);
8084 Py_DECREF(unicode);
8085 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008086}
8087
8088PyObject *
8089PyUnicode_EncodeCodePage(int code_page,
8090 PyObject *unicode,
8091 const char *errors)
8092{
Victor Stinner7581cef2011-11-03 22:32:33 +01008093 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008094}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008095
Alexander Belopolsky40018472011-02-26 01:02:56 +00008096PyObject *
8097PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008098{
Victor Stinner7581cef2011-11-03 22:32:33 +01008099 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008100}
8101
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008102#undef NEED_RETRY
8103
Steve Dowercc16be82016-09-08 10:35:16 -07008104#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008105
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106/* --- Character Mapping Codec -------------------------------------------- */
8107
Victor Stinnerfb161b12013-04-18 01:44:27 +02008108static int
8109charmap_decode_string(const char *s,
8110 Py_ssize_t size,
8111 PyObject *mapping,
8112 const char *errors,
8113 _PyUnicodeWriter *writer)
8114{
8115 const char *starts = s;
8116 const char *e;
8117 Py_ssize_t startinpos, endinpos;
8118 PyObject *errorHandler = NULL, *exc = NULL;
8119 Py_ssize_t maplen;
8120 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008121 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008122 Py_UCS4 x;
8123 unsigned char ch;
8124
8125 if (PyUnicode_READY(mapping) == -1)
8126 return -1;
8127
8128 maplen = PyUnicode_GET_LENGTH(mapping);
8129 mapdata = PyUnicode_DATA(mapping);
8130 mapkind = PyUnicode_KIND(mapping);
8131
8132 e = s + size;
8133
8134 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8135 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8136 * is disabled in encoding aliases, latin1 is preferred because
8137 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008138 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008139 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8140 Py_UCS4 maxchar = writer->maxchar;
8141
8142 assert (writer->kind == PyUnicode_1BYTE_KIND);
8143 while (s < e) {
8144 ch = *s;
8145 x = mapdata_ucs1[ch];
8146 if (x > maxchar) {
8147 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8148 goto onError;
8149 maxchar = writer->maxchar;
8150 outdata = (Py_UCS1 *)writer->data;
8151 }
8152 outdata[writer->pos] = x;
8153 writer->pos++;
8154 ++s;
8155 }
8156 return 0;
8157 }
8158
8159 while (s < e) {
8160 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8161 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008162 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008163 if (outkind == PyUnicode_1BYTE_KIND) {
8164 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8165 Py_UCS4 maxchar = writer->maxchar;
8166 while (s < e) {
8167 ch = *s;
8168 x = mapdata_ucs2[ch];
8169 if (x > maxchar)
8170 goto Error;
8171 outdata[writer->pos] = x;
8172 writer->pos++;
8173 ++s;
8174 }
8175 break;
8176 }
8177 else if (outkind == PyUnicode_2BYTE_KIND) {
8178 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8179 while (s < e) {
8180 ch = *s;
8181 x = mapdata_ucs2[ch];
8182 if (x == 0xFFFE)
8183 goto Error;
8184 outdata[writer->pos] = x;
8185 writer->pos++;
8186 ++s;
8187 }
8188 break;
8189 }
8190 }
8191 ch = *s;
8192
8193 if (ch < maplen)
8194 x = PyUnicode_READ(mapkind, mapdata, ch);
8195 else
8196 x = 0xfffe; /* invalid value */
8197Error:
8198 if (x == 0xfffe)
8199 {
8200 /* undefined mapping */
8201 startinpos = s-starts;
8202 endinpos = startinpos+1;
8203 if (unicode_decode_call_errorhandler_writer(
8204 errors, &errorHandler,
8205 "charmap", "character maps to <undefined>",
8206 &starts, &e, &startinpos, &endinpos, &exc, &s,
8207 writer)) {
8208 goto onError;
8209 }
8210 continue;
8211 }
8212
8213 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8214 goto onError;
8215 ++s;
8216 }
8217 Py_XDECREF(errorHandler);
8218 Py_XDECREF(exc);
8219 return 0;
8220
8221onError:
8222 Py_XDECREF(errorHandler);
8223 Py_XDECREF(exc);
8224 return -1;
8225}
8226
8227static int
8228charmap_decode_mapping(const char *s,
8229 Py_ssize_t size,
8230 PyObject *mapping,
8231 const char *errors,
8232 _PyUnicodeWriter *writer)
8233{
8234 const char *starts = s;
8235 const char *e;
8236 Py_ssize_t startinpos, endinpos;
8237 PyObject *errorHandler = NULL, *exc = NULL;
8238 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008239 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008240
8241 e = s + size;
8242
8243 while (s < e) {
8244 ch = *s;
8245
8246 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8247 key = PyLong_FromLong((long)ch);
8248 if (key == NULL)
8249 goto onError;
8250
8251 item = PyObject_GetItem(mapping, key);
8252 Py_DECREF(key);
8253 if (item == NULL) {
8254 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8255 /* No mapping found means: mapping is undefined. */
8256 PyErr_Clear();
8257 goto Undefined;
8258 } else
8259 goto onError;
8260 }
8261
8262 /* Apply mapping */
8263 if (item == Py_None)
8264 goto Undefined;
8265 if (PyLong_Check(item)) {
8266 long value = PyLong_AS_LONG(item);
8267 if (value == 0xFFFE)
8268 goto Undefined;
8269 if (value < 0 || value > MAX_UNICODE) {
8270 PyErr_Format(PyExc_TypeError,
8271 "character mapping must be in range(0x%lx)",
8272 (unsigned long)MAX_UNICODE + 1);
8273 goto onError;
8274 }
8275
8276 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8277 goto onError;
8278 }
8279 else if (PyUnicode_Check(item)) {
8280 if (PyUnicode_READY(item) == -1)
8281 goto onError;
8282 if (PyUnicode_GET_LENGTH(item) == 1) {
8283 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8284 if (value == 0xFFFE)
8285 goto Undefined;
8286 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8287 goto onError;
8288 }
8289 else {
8290 writer->overallocate = 1;
8291 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8292 goto onError;
8293 }
8294 }
8295 else {
8296 /* wrong return value */
8297 PyErr_SetString(PyExc_TypeError,
8298 "character mapping must return integer, None or str");
8299 goto onError;
8300 }
8301 Py_CLEAR(item);
8302 ++s;
8303 continue;
8304
8305Undefined:
8306 /* undefined mapping */
8307 Py_CLEAR(item);
8308 startinpos = s-starts;
8309 endinpos = startinpos+1;
8310 if (unicode_decode_call_errorhandler_writer(
8311 errors, &errorHandler,
8312 "charmap", "character maps to <undefined>",
8313 &starts, &e, &startinpos, &endinpos, &exc, &s,
8314 writer)) {
8315 goto onError;
8316 }
8317 }
8318 Py_XDECREF(errorHandler);
8319 Py_XDECREF(exc);
8320 return 0;
8321
8322onError:
8323 Py_XDECREF(item);
8324 Py_XDECREF(errorHandler);
8325 Py_XDECREF(exc);
8326 return -1;
8327}
8328
Alexander Belopolsky40018472011-02-26 01:02:56 +00008329PyObject *
8330PyUnicode_DecodeCharmap(const char *s,
8331 Py_ssize_t size,
8332 PyObject *mapping,
8333 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008335 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008336
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 /* Default to Latin-1 */
8338 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008342 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008343 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008344 writer.min_length = size;
8345 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008346 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008347
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008348 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008349 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8350 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008351 }
8352 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008353 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8354 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008356 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008357
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008359 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360 return NULL;
8361}
8362
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008363/* Charmap encoding: the lookup table */
8364
Alexander Belopolsky40018472011-02-26 01:02:56 +00008365struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 PyObject_HEAD
8367 unsigned char level1[32];
8368 int count2, count3;
8369 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008370};
8371
8372static PyObject*
8373encoding_map_size(PyObject *obj, PyObject* args)
8374{
8375 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008376 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008378}
8379
8380static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008381 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 PyDoc_STR("Return the size (in bytes) of this object") },
8383 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008384};
8385
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008386static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008387 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 "EncodingMap", /*tp_name*/
8389 sizeof(struct encoding_map), /*tp_basicsize*/
8390 0, /*tp_itemsize*/
8391 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008392 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008393 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 0, /*tp_getattr*/
8395 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008396 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 0, /*tp_repr*/
8398 0, /*tp_as_number*/
8399 0, /*tp_as_sequence*/
8400 0, /*tp_as_mapping*/
8401 0, /*tp_hash*/
8402 0, /*tp_call*/
8403 0, /*tp_str*/
8404 0, /*tp_getattro*/
8405 0, /*tp_setattro*/
8406 0, /*tp_as_buffer*/
8407 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8408 0, /*tp_doc*/
8409 0, /*tp_traverse*/
8410 0, /*tp_clear*/
8411 0, /*tp_richcompare*/
8412 0, /*tp_weaklistoffset*/
8413 0, /*tp_iter*/
8414 0, /*tp_iternext*/
8415 encoding_map_methods, /*tp_methods*/
8416 0, /*tp_members*/
8417 0, /*tp_getset*/
8418 0, /*tp_base*/
8419 0, /*tp_dict*/
8420 0, /*tp_descr_get*/
8421 0, /*tp_descr_set*/
8422 0, /*tp_dictoffset*/
8423 0, /*tp_init*/
8424 0, /*tp_alloc*/
8425 0, /*tp_new*/
8426 0, /*tp_free*/
8427 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008428};
8429
8430PyObject*
8431PyUnicode_BuildEncodingMap(PyObject* string)
8432{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008433 PyObject *result;
8434 struct encoding_map *mresult;
8435 int i;
8436 int need_dict = 0;
8437 unsigned char level1[32];
8438 unsigned char level2[512];
8439 unsigned char *mlevel1, *mlevel2, *mlevel3;
8440 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008441 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008442 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008443 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008445
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008446 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008447 PyErr_BadArgument();
8448 return NULL;
8449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450 kind = PyUnicode_KIND(string);
8451 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008452 length = PyUnicode_GET_LENGTH(string);
8453 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008454 memset(level1, 0xFF, sizeof level1);
8455 memset(level2, 0xFF, sizeof level2);
8456
8457 /* If there isn't a one-to-one mapping of NULL to \0,
8458 or if there are non-BMP characters, we need to use
8459 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008461 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008462 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008463 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 ch = PyUnicode_READ(kind, data, i);
8465 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008466 need_dict = 1;
8467 break;
8468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008470 /* unmapped character */
8471 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 l1 = ch >> 11;
8473 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008474 if (level1[l1] == 0xFF)
8475 level1[l1] = count2++;
8476 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008477 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008478 }
8479
8480 if (count2 >= 0xFF || count3 >= 0xFF)
8481 need_dict = 1;
8482
8483 if (need_dict) {
8484 PyObject *result = PyDict_New();
8485 PyObject *key, *value;
8486 if (!result)
8487 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008488 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008490 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008491 if (!key || !value)
8492 goto failed1;
8493 if (PyDict_SetItem(result, key, value) == -1)
8494 goto failed1;
8495 Py_DECREF(key);
8496 Py_DECREF(value);
8497 }
8498 return result;
8499 failed1:
8500 Py_XDECREF(key);
8501 Py_XDECREF(value);
8502 Py_DECREF(result);
8503 return NULL;
8504 }
8505
8506 /* Create a three-level trie */
8507 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8508 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008509 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008510 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008511 }
8512
8513 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008514 mresult = (struct encoding_map*)result;
8515 mresult->count2 = count2;
8516 mresult->count3 = count3;
8517 mlevel1 = mresult->level1;
8518 mlevel2 = mresult->level23;
8519 mlevel3 = mresult->level23 + 16*count2;
8520 memcpy(mlevel1, level1, 32);
8521 memset(mlevel2, 0xFF, 16*count2);
8522 memset(mlevel3, 0, 128*count3);
8523 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008524 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008525 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008526 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8527 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008528 /* unmapped character */
8529 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008530 o1 = ch>>11;
8531 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008532 i2 = 16*mlevel1[o1] + o2;
8533 if (mlevel2[i2] == 0xFF)
8534 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008535 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008536 i3 = 128*mlevel2[i2] + o3;
8537 mlevel3[i3] = i;
8538 }
8539 return result;
8540}
8541
8542static int
Victor Stinner22168992011-11-20 17:09:18 +01008543encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008544{
8545 struct encoding_map *map = (struct encoding_map*)mapping;
8546 int l1 = c>>11;
8547 int l2 = (c>>7) & 0xF;
8548 int l3 = c & 0x7F;
8549 int i;
8550
Victor Stinner22168992011-11-20 17:09:18 +01008551 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008553 if (c == 0)
8554 return 0;
8555 /* level 1*/
8556 i = map->level1[l1];
8557 if (i == 0xFF) {
8558 return -1;
8559 }
8560 /* level 2*/
8561 i = map->level23[16*i+l2];
8562 if (i == 0xFF) {
8563 return -1;
8564 }
8565 /* level 3 */
8566 i = map->level23[16*map->count2 + 128*i + l3];
8567 if (i == 0) {
8568 return -1;
8569 }
8570 return i;
8571}
8572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573/* Lookup the character ch in the mapping. If the character
8574 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008575 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008576static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008577charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578{
Christian Heimes217cfd12007-12-02 14:31:20 +00008579 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008580 PyObject *x;
8581
8582 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008584 x = PyObject_GetItem(mapping, w);
8585 Py_DECREF(w);
8586 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8588 /* No mapping found means: mapping is undefined. */
8589 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008590 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 } else
8592 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008594 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008596 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 long value = PyLong_AS_LONG(x);
8598 if (value < 0 || value > 255) {
8599 PyErr_SetString(PyExc_TypeError,
8600 "character mapping must be in range(256)");
8601 Py_DECREF(x);
8602 return NULL;
8603 }
8604 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008606 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 /* wrong return value */
8610 PyErr_Format(PyExc_TypeError,
8611 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008612 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 Py_DECREF(x);
8614 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615 }
8616}
8617
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008618static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008619charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008620{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008621 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8622 /* exponentially overallocate to minimize reallocations */
8623 if (requiredsize < 2*outsize)
8624 requiredsize = 2*outsize;
8625 if (_PyBytes_Resize(outobj, requiredsize))
8626 return -1;
8627 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008628}
8629
/* Outcome of trying to encode a single character. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or a resize of the output failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008634 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635 space is available. Return a new reference to the object that
8636 was put in the output buffer, or Py_None, if the mapping was undefined
8637 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008638 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008639static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008640charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008641 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008643 PyObject *rep;
8644 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008645 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646
Andy Lesterdffe4c02020-03-04 07:15:20 -06008647 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008648 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008650 if (res == -1)
8651 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 if (outsize<requiredsize)
8653 if (charmapencode_resize(outobj, outpos, requiredsize))
8654 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008655 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 outstart[(*outpos)++] = (char)res;
8657 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008658 }
8659
8660 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008663 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 Py_DECREF(rep);
8665 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008666 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 if (PyLong_Check(rep)) {
8668 Py_ssize_t requiredsize = *outpos+1;
8669 if (outsize<requiredsize)
8670 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8671 Py_DECREF(rep);
8672 return enc_EXCEPTION;
8673 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008674 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008676 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 else {
8678 const char *repchars = PyBytes_AS_STRING(rep);
8679 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8680 Py_ssize_t requiredsize = *outpos+repsize;
8681 if (outsize<requiredsize)
8682 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8683 Py_DECREF(rep);
8684 return enc_EXCEPTION;
8685 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008686 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 memcpy(outstart + *outpos, repchars, repsize);
8688 *outpos += repsize;
8689 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008691 Py_DECREF(rep);
8692 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693}
8694
8695/* handle an error in PyUnicode_EncodeCharmap
8696 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008697static int
8698charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008699 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008701 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008702 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008703{
8704 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008705 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008706 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008707 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008708 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008709 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008711 Py_ssize_t collstartpos = *inpos;
8712 Py_ssize_t collendpos = *inpos+1;
8713 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008714 const char *encoding = "charmap";
8715 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008716 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008717 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008718 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719
Benjamin Petersonbac79492012-01-14 13:34:47 -05008720 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008721 return -1;
8722 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008723 /* find all unencodable characters */
8724 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008725 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008726 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008727 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008728 val = encoding_map_lookup(ch, mapping);
8729 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 break;
8731 ++collendpos;
8732 continue;
8733 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008734
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008735 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8736 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 if (rep==NULL)
8738 return -1;
8739 else if (rep!=Py_None) {
8740 Py_DECREF(rep);
8741 break;
8742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008743 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008745 }
8746 /* cache callback name lookup
8747 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008748 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008749 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008750
8751 switch (*error_handler) {
8752 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008753 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008754 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008755
8756 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 x = charmapencode_output('?', mapping, res, respos);
8759 if (x==enc_EXCEPTION) {
8760 return -1;
8761 }
8762 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008763 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 return -1;
8765 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008766 }
8767 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008768 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008769 *inpos = collendpos;
8770 break;
Victor Stinner50149202015-09-22 00:26:54 +02008771
8772 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008773 /* generate replacement (temporarily (mis)uses p) */
8774 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 char buffer[2+29+1+1];
8776 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008777 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 for (cp = buffer; *cp; ++cp) {
8779 x = charmapencode_output(*cp, mapping, res, respos);
8780 if (x==enc_EXCEPTION)
8781 return -1;
8782 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008783 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 return -1;
8785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008786 }
8787 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008788 *inpos = collendpos;
8789 break;
Victor Stinner50149202015-09-22 00:26:54 +02008790
Benjamin Peterson14339b62009-01-31 16:36:08 +00008791 default:
Victor Stinner50149202015-09-22 00:26:54 +02008792 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008793 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008794 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008795 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008797 if (PyBytes_Check(repunicode)) {
8798 /* Directly copy bytes result to output. */
8799 Py_ssize_t outsize = PyBytes_Size(*res);
8800 Py_ssize_t requiredsize;
8801 repsize = PyBytes_Size(repunicode);
8802 requiredsize = *respos + repsize;
8803 if (requiredsize > outsize)
8804 /* Make room for all additional bytes. */
8805 if (charmapencode_resize(res, respos, requiredsize)) {
8806 Py_DECREF(repunicode);
8807 return -1;
8808 }
8809 memcpy(PyBytes_AsString(*res) + *respos,
8810 PyBytes_AsString(repunicode), repsize);
8811 *respos += repsize;
8812 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008813 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008814 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008815 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008816 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008817 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008818 Py_DECREF(repunicode);
8819 return -1;
8820 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008821 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008822 data = PyUnicode_DATA(repunicode);
8823 kind = PyUnicode_KIND(repunicode);
8824 for (index = 0; index < repsize; index++) {
8825 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8826 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008828 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 return -1;
8830 }
8831 else if (x==enc_FAILED) {
8832 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008833 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 return -1;
8835 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008836 }
8837 *inpos = newpos;
8838 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008839 }
8840 return 0;
8841}
8842
Alexander Belopolsky40018472011-02-26 01:02:56 +00008843PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008844_PyUnicode_EncodeCharmap(PyObject *unicode,
8845 PyObject *mapping,
8846 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008848 /* output object */
8849 PyObject *res = NULL;
8850 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008851 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008852 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008853 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008854 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008855 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008856 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008857 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008858 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008859 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860
Benjamin Petersonbac79492012-01-14 13:34:47 -05008861 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008862 return NULL;
8863 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008864 data = PyUnicode_DATA(unicode);
8865 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008866
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867 /* Default to Latin-1 */
8868 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008869 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008871 /* allocate enough for a simple encoding without
8872 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008873 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008874 if (res == NULL)
8875 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008876 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008877 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008879 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008880 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008882 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 if (x==enc_EXCEPTION) /* error */
8884 goto onError;
8885 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008886 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008888 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 &res, &respos)) {
8890 goto onError;
8891 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008892 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 else
8894 /* done with this character => adjust input position */
8895 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008898 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008899 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008900 if (_PyBytes_Resize(&res, respos) < 0)
8901 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008902
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008903 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008904 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008905 return res;
8906
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008908 Py_XDECREF(res);
8909 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008910 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911 return NULL;
8912}
8913
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008914/* Deprecated */
8915PyObject *
8916PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8917 Py_ssize_t size,
8918 PyObject *mapping,
8919 const char *errors)
8920{
8921 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008922 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008923 if (unicode == NULL)
8924 return NULL;
8925 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8926 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008927 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008928}
8929
Alexander Belopolsky40018472011-02-26 01:02:56 +00008930PyObject *
8931PyUnicode_AsCharmapString(PyObject *unicode,
8932 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933{
8934 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 PyErr_BadArgument();
8936 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008938 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939}
8940
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008941/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008942static void
8943make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008945 Py_ssize_t startpos, Py_ssize_t endpos,
8946 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008948 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 *exceptionObject = _PyUnicodeTranslateError_Create(
8950 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 }
8952 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8954 goto onError;
8955 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8956 goto onError;
8957 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8958 goto onError;
8959 return;
8960 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008961 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 }
8963}
8964
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008965/* error handling callback helper:
8966 build arguments, call the callback and check the arguments,
8967 put the result into newpos and return the replacement string, which
8968 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008969static PyObject *
8970unicode_translate_call_errorhandler(const char *errors,
8971 PyObject **errorHandler,
8972 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008974 Py_ssize_t startpos, Py_ssize_t endpos,
8975 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008976{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008977 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008978
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008979 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008980 PyObject *restuple;
8981 PyObject *resunicode;
8982
8983 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008985 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008986 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008987 }
8988
8989 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008991 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008993
Petr Viktorinffd97532020-02-11 17:46:57 +01008994 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008995 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008996 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008997 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008998 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 Py_DECREF(restuple);
9000 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009001 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009002 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 &resunicode, &i_newpos)) {
9004 Py_DECREF(restuple);
9005 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009006 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009007 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009009 else
9010 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009012 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 Py_DECREF(restuple);
9014 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009015 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009016 Py_INCREF(resunicode);
9017 Py_DECREF(restuple);
9018 return resunicode;
9019}
9020
9021/* Lookup the character ch in the mapping and put the result in result,
9022 which must be decrefed by the caller.
9023 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009024static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009026{
Christian Heimes217cfd12007-12-02 14:31:20 +00009027 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009028 PyObject *x;
9029
9030 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009032 x = PyObject_GetItem(mapping, w);
9033 Py_DECREF(w);
9034 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009035 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9036 /* No mapping found means: use 1:1 mapping. */
9037 PyErr_Clear();
9038 *result = NULL;
9039 return 0;
9040 } else
9041 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009042 }
9043 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009044 *result = x;
9045 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009046 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009047 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009048 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009049 if (value < 0 || value > MAX_UNICODE) {
9050 PyErr_Format(PyExc_ValueError,
9051 "character mapping must be in range(0x%x)",
9052 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009053 Py_DECREF(x);
9054 return -1;
9055 }
9056 *result = x;
9057 return 0;
9058 }
9059 else if (PyUnicode_Check(x)) {
9060 *result = x;
9061 return 0;
9062 }
9063 else {
9064 /* wrong return value */
9065 PyErr_SetString(PyExc_TypeError,
9066 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009067 Py_DECREF(x);
9068 return -1;
9069 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009070}
Victor Stinner1194ea02014-04-04 19:37:40 +02009071
9072/* lookup the character, write the result into the writer.
9073 Return 1 if the result was written into the writer, return 0 if the mapping
9074 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009075static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009076charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9077 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009078{
Victor Stinner1194ea02014-04-04 19:37:40 +02009079 PyObject *item;
9080
9081 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009083
9084 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009086 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009089 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009090 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009091
9092 if (item == Py_None) {
9093 Py_DECREF(item);
9094 return 0;
9095 }
9096
9097 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009098 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9099 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9100 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009101 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9102 Py_DECREF(item);
9103 return -1;
9104 }
9105 Py_DECREF(item);
9106 return 1;
9107 }
9108
9109 if (!PyUnicode_Check(item)) {
9110 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009112 }
9113
9114 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9115 Py_DECREF(item);
9116 return -1;
9117 }
9118
9119 Py_DECREF(item);
9120 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009121}
9122
Victor Stinner89a76ab2014-04-05 11:44:04 +02009123static int
9124unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9125 Py_UCS1 *translate)
9126{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009127 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009128 int ret = 0;
9129
Victor Stinner89a76ab2014-04-05 11:44:04 +02009130 if (charmaptranslate_lookup(ch, mapping, &item)) {
9131 return -1;
9132 }
9133
9134 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009135 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009136 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009137 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009138 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009139 /* not found => default to 1:1 mapping */
9140 translate[ch] = ch;
9141 return 1;
9142 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009143 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009144 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009145 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9146 used it */
9147 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009148 /* invalid character or character outside ASCII:
9149 skip the fast translate */
9150 goto exit;
9151 }
9152 translate[ch] = (Py_UCS1)replace;
9153 }
9154 else if (PyUnicode_Check(item)) {
9155 Py_UCS4 replace;
9156
9157 if (PyUnicode_READY(item) == -1) {
9158 Py_DECREF(item);
9159 return -1;
9160 }
9161 if (PyUnicode_GET_LENGTH(item) != 1)
9162 goto exit;
9163
9164 replace = PyUnicode_READ_CHAR(item, 0);
9165 if (replace > 127)
9166 goto exit;
9167 translate[ch] = (Py_UCS1)replace;
9168 }
9169 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009170 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009171 goto exit;
9172 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009173 ret = 1;
9174
Benjamin Peterson1365de72014-04-07 20:15:41 -04009175 exit:
9176 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009177 return ret;
9178}
9179
9180/* Fast path for ascii => ascii translation. Return 1 if the whole string
9181 was translated into writer, return 0 if the input string was partially
9182 translated into writer, raise an exception and return -1 on error. */
9183static int
9184unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009185 _PyUnicodeWriter *writer, int ignore,
9186 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009187{
Victor Stinner872b2912014-04-05 14:27:07 +02009188 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009189 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009190 const Py_UCS1 *in, *end;
9191 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009192 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009193
Victor Stinner89a76ab2014-04-05 11:44:04 +02009194 len = PyUnicode_GET_LENGTH(input);
9195
Victor Stinner872b2912014-04-05 14:27:07 +02009196 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009197
9198 in = PyUnicode_1BYTE_DATA(input);
9199 end = in + len;
9200
9201 assert(PyUnicode_IS_ASCII(writer->buffer));
9202 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9203 out = PyUnicode_1BYTE_DATA(writer->buffer);
9204
Victor Stinner872b2912014-04-05 14:27:07 +02009205 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009206 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009207 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009208 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009209 int translate = unicode_fast_translate_lookup(mapping, ch,
9210 ascii_table);
9211 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009212 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009213 if (translate == 0)
9214 goto exit;
9215 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009216 }
Victor Stinner872b2912014-04-05 14:27:07 +02009217 if (ch2 == 0xfe) {
9218 if (ignore)
9219 continue;
9220 goto exit;
9221 }
9222 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009223 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009224 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009225 }
Victor Stinner872b2912014-04-05 14:27:07 +02009226 res = 1;
9227
9228exit:
9229 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009230 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009231 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009232}
9233
Victor Stinner3222da22015-10-01 22:07:32 +02009234static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009235_PyUnicode_TranslateCharmap(PyObject *input,
9236 PyObject *mapping,
9237 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009240 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 Py_ssize_t size, i;
9242 int kind;
9243 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009244 _PyUnicodeWriter writer;
9245 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009246 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009247 PyObject *errorHandler = NULL;
9248 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009249 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009250 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009251
Guido van Rossumd57fd912000-03-10 22:53:23 +00009252 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009253 PyErr_BadArgument();
9254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 if (PyUnicode_READY(input) == -1)
9258 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009259 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009260 kind = PyUnicode_KIND(input);
9261 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009263 if (size == 0)
9264 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009266 /* allocate enough for a simple 1:1 translation without
9267 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009268 _PyUnicodeWriter_Init(&writer);
9269 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009270 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009271
Victor Stinner872b2912014-04-05 14:27:07 +02009272 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9273
Victor Stinner33798672016-03-01 21:59:58 +01009274 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009275 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009276 if (PyUnicode_IS_ASCII(input)) {
9277 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9278 if (res < 0) {
9279 _PyUnicodeWriter_Dealloc(&writer);
9280 return NULL;
9281 }
9282 if (res == 1)
9283 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009284 }
Victor Stinner33798672016-03-01 21:59:58 +01009285 else {
9286 i = 0;
9287 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009290 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009291 int translate;
9292 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9293 Py_ssize_t newpos;
9294 /* startpos for collecting untranslatable chars */
9295 Py_ssize_t collstart;
9296 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009297 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298
Victor Stinner1194ea02014-04-04 19:37:40 +02009299 ch = PyUnicode_READ(kind, data, i);
9300 translate = charmaptranslate_output(ch, mapping, &writer);
9301 if (translate < 0)
9302 goto onError;
9303
9304 if (translate != 0) {
9305 /* it worked => adjust input pointer */
9306 ++i;
9307 continue;
9308 }
9309
9310 /* untranslatable character */
9311 collstart = i;
9312 collend = i+1;
9313
9314 /* find all untranslatable characters */
9315 while (collend < size) {
9316 PyObject *x;
9317 ch = PyUnicode_READ(kind, data, collend);
9318 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009319 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009320 Py_XDECREF(x);
9321 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009322 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009323 ++collend;
9324 }
9325
9326 if (ignore) {
9327 i = collend;
9328 }
9329 else {
9330 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9331 reason, input, &exc,
9332 collstart, collend, &newpos);
9333 if (repunicode == NULL)
9334 goto onError;
9335 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009336 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009337 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009338 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009339 Py_DECREF(repunicode);
9340 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009341 }
9342 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009343 Py_XDECREF(exc);
9344 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009345 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346
Benjamin Peterson29060642009-01-31 22:14:21 +00009347 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009348 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009349 Py_XDECREF(exc);
9350 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009351 return NULL;
9352}
9353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354/* Deprecated. Use PyUnicode_Translate instead. */
9355PyObject *
9356PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9357 Py_ssize_t size,
9358 PyObject *mapping,
9359 const char *errors)
9360{
Christian Heimes5f520f42012-09-11 14:03:25 +02009361 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009362 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 if (!unicode)
9364 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009365 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9366 Py_DECREF(unicode);
9367 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009368}
9369
Alexander Belopolsky40018472011-02-26 01:02:56 +00009370PyObject *
9371PyUnicode_Translate(PyObject *str,
9372 PyObject *mapping,
9373 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009375 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009376 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009377 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378}
Tim Petersced69f82003-09-16 20:30:58 +00009379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380PyObject *
9381_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9382{
9383 if (!PyUnicode_Check(unicode)) {
9384 PyErr_BadInternalCall();
9385 return NULL;
9386 }
9387 if (PyUnicode_READY(unicode) == -1)
9388 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009389 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 /* If the string is already ASCII, just return the same string */
9391 Py_INCREF(unicode);
9392 return unicode;
9393 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009394
9395 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9396 PyObject *result = PyUnicode_New(len, 127);
9397 if (result == NULL) {
9398 return NULL;
9399 }
9400
9401 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9402 int kind = PyUnicode_KIND(unicode);
9403 const void *data = PyUnicode_DATA(unicode);
9404 Py_ssize_t i;
9405 for (i = 0; i < len; ++i) {
9406 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9407 if (ch < 127) {
9408 out[i] = ch;
9409 }
9410 else if (Py_UNICODE_ISSPACE(ch)) {
9411 out[i] = ' ';
9412 }
9413 else {
9414 int decimal = Py_UNICODE_TODECIMAL(ch);
9415 if (decimal < 0) {
9416 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009417 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009418 _PyUnicode_LENGTH(result) = i + 1;
9419 break;
9420 }
9421 out[i] = '0' + decimal;
9422 }
9423 }
9424
INADA Naoki16dfca42018-07-14 12:06:43 +09009425 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009426 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427}
9428
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009429PyObject *
9430PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9431 Py_ssize_t length)
9432{
Victor Stinnerf0124502011-11-21 23:12:56 +01009433 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009434 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009435 Py_UCS4 maxchar;
9436 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009437 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009438
Victor Stinner99d7ad02012-02-22 13:37:39 +01009439 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009440 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009441 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009442 if (ch > 127) {
9443 int decimal = Py_UNICODE_TODECIMAL(ch);
9444 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009445 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009446 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009447 }
9448 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009449
9450 /* Copy to a new string */
9451 decimal = PyUnicode_New(length, maxchar);
9452 if (decimal == NULL)
9453 return decimal;
9454 kind = PyUnicode_KIND(decimal);
9455 data = PyUnicode_DATA(decimal);
9456 /* Iterate over code points */
9457 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009458 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009459 if (ch > 127) {
9460 int decimal = Py_UNICODE_TODECIMAL(ch);
9461 if (decimal >= 0)
9462 ch = '0' + decimal;
9463 }
9464 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009466 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009467}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009468/* --- Decimal Encoder ---------------------------------------------------- */
9469
Alexander Belopolsky40018472011-02-26 01:02:56 +00009470int
9471PyUnicode_EncodeDecimal(Py_UNICODE *s,
9472 Py_ssize_t length,
9473 char *output,
9474 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009475{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009476 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009477 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009478 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009479 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009480
9481 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 PyErr_BadArgument();
9483 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009484 }
9485
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009486 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009487 if (unicode == NULL)
9488 return -1;
9489
Victor Stinner42bf7752011-11-21 22:52:58 +01009490 kind = PyUnicode_KIND(unicode);
9491 data = PyUnicode_DATA(unicode);
9492
Victor Stinnerb84d7232011-11-22 01:50:07 +01009493 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009494 PyObject *exc;
9495 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009496 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009497 Py_ssize_t startpos;
9498
9499 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009500
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009502 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009503 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009505 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009506 decimal = Py_UNICODE_TODECIMAL(ch);
9507 if (decimal >= 0) {
9508 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009509 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 continue;
9511 }
9512 if (0 < ch && ch < 256) {
9513 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009514 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009515 continue;
9516 }
Victor Stinner6345be92011-11-25 20:09:01 +01009517
Victor Stinner42bf7752011-11-21 22:52:58 +01009518 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009519 exc = NULL;
9520 raise_encode_exception(&exc, "decimal", unicode,
9521 startpos, startpos+1,
9522 "invalid decimal Unicode string");
9523 Py_XDECREF(exc);
9524 Py_DECREF(unicode);
9525 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009526 }
9527 /* 0-terminate the output string */
9528 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009529 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009530 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009531}
9532
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533/* --- Helpers ------------------------------------------------------------ */
9534
/* helper macro to fixup start/end slice values

   Normalizes `start` and `end` in place against a sequence of length `len`:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` has `len` added to it and, if still
       negative, is clamped to 0.
   A `start` greater than `len` is left untouched.

   The expansion is a bare sequence of if statements (it is not wrapped in
   do { } while (0)) and the arguments are expanded several times without
   parentheses, so `start` and `end` must be plain assignable variables and
   `len` a side-effect-free expression. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
9549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009551any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009553 Py_ssize_t end,
9554 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009556 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009557 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 Py_ssize_t len1, len2, result;
9559
9560 kind1 = PyUnicode_KIND(s1);
9561 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009562 if (kind1 < kind2)
9563 return -1;
9564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 len1 = PyUnicode_GET_LENGTH(s1);
9566 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009567 ADJUST_INDICES(start, end, len1);
9568 if (end - start < len2)
9569 return -1;
9570
9571 buf1 = PyUnicode_DATA(s1);
9572 buf2 = PyUnicode_DATA(s2);
9573 if (len2 == 1) {
9574 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9575 result = findchar((const char *)buf1 + kind1*start,
9576 kind1, end - start, ch, direction);
9577 if (result == -1)
9578 return -1;
9579 else
9580 return start + result;
9581 }
9582
9583 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009584 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009585 if (!buf2)
9586 return -2;
9587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588
Victor Stinner794d5672011-10-10 03:21:36 +02009589 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009590 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009591 case PyUnicode_1BYTE_KIND:
9592 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9593 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9594 else
9595 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9596 break;
9597 case PyUnicode_2BYTE_KIND:
9598 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9599 break;
9600 case PyUnicode_4BYTE_KIND:
9601 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9602 break;
9603 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009604 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009605 }
9606 }
9607 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009608 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009609 case PyUnicode_1BYTE_KIND:
9610 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9611 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9612 else
9613 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9614 break;
9615 case PyUnicode_2BYTE_KIND:
9616 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9617 break;
9618 case PyUnicode_4BYTE_KIND:
9619 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9620 break;
9621 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009622 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 }
9625
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009626 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009627 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009628 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629
9630 return result;
9631}
9632
Victor Stinner59423e32018-11-26 13:40:01 +01009633/* _PyUnicode_InsertThousandsGrouping() helper functions */
9634#include "stringlib/localeutil.h"
9635
9636/**
9637 * InsertThousandsGrouping:
9638 * @writer: Unicode writer.
9639 * @n_buffer: Number of characters in @buffer.
9640 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9641 * @d_pos: Start of digits string.
9642 * @n_digits: The number of digits in the string, in which we want
9643 * to put the grouping chars.
9644 * @min_width: The minimum width of the digits in the output string.
9645 * Output will be zero-padded on the left to fill.
9646 * @grouping: see definition in localeconv().
9647 * @thousands_sep: see definition in localeconv().
9648 *
9649 * There are 2 modes: counting and filling. If @writer is NULL,
9650 * we are in counting mode, else filling mode.
9651 * If counting, the required buffer size is returned.
9652 * If filling, we know the buffer will be large enough, so we don't
9653 * need to pass in the buffer size.
9654 * Inserts thousand grouping characters (as defined by grouping and
9655 * thousands_sep) into @writer.
9656 *
9657 * Return value: -1 on error, number of characters otherwise.
9658 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009660_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009661 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009662 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009663 PyObject *digits,
9664 Py_ssize_t d_pos,
9665 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009666 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009667 const char *grouping,
9668 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009669 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670{
Xtreak3f7983a2019-01-07 20:39:14 +05309671 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009672 if (writer) {
9673 assert(digits != NULL);
9674 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009675 }
9676 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009677 assert(digits == NULL);
9678 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009679 }
Victor Stinner59423e32018-11-26 13:40:01 +01009680 assert(0 <= d_pos);
9681 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009682 assert(grouping != NULL);
9683
9684 if (digits != NULL) {
9685 if (PyUnicode_READY(digits) == -1) {
9686 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009687 }
Victor Stinner59423e32018-11-26 13:40:01 +01009688 }
9689 if (PyUnicode_READY(thousands_sep) == -1) {
9690 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009691 }
9692
Victor Stinner59423e32018-11-26 13:40:01 +01009693 Py_ssize_t count = 0;
9694 Py_ssize_t n_zeros;
9695 int loop_broken = 0;
9696 int use_separator = 0; /* First time through, don't append the
9697 separator. They only go between
9698 groups. */
9699 Py_ssize_t buffer_pos;
9700 Py_ssize_t digits_pos;
9701 Py_ssize_t len;
9702 Py_ssize_t n_chars;
9703 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9704 be looked at */
9705 /* A generator that returns all of the grouping widths, until it
9706 returns 0. */
9707 GroupGenerator groupgen;
9708 GroupGenerator_init(&groupgen, grouping);
9709 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9710
9711 /* if digits are not grouped, thousands separator
9712 should be an empty string */
9713 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9714
9715 digits_pos = d_pos + n_digits;
9716 if (writer) {
9717 buffer_pos = writer->pos + n_buffer;
9718 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9719 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 }
Victor Stinner59423e32018-11-26 13:40:01 +01009721 else {
9722 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009723 }
Victor Stinner59423e32018-11-26 13:40:01 +01009724
9725 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009726 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009727 }
Victor Stinner59423e32018-11-26 13:40:01 +01009728
9729 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9730 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9731 n_zeros = Py_MAX(0, len - remaining);
9732 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9733
9734 /* Use n_zero zero's and n_chars chars */
9735
9736 /* Count only, don't do anything. */
9737 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9738
9739 /* Copy into the writer. */
9740 InsertThousandsGrouping_fill(writer, &buffer_pos,
9741 digits, &digits_pos,
9742 n_chars, n_zeros,
9743 use_separator ? thousands_sep : NULL,
9744 thousands_sep_len, maxchar);
9745
9746 /* Use a separator next time. */
9747 use_separator = 1;
9748
9749 remaining -= n_chars;
9750 min_width -= len;
9751
9752 if (remaining <= 0 && min_width <= 0) {
9753 loop_broken = 1;
9754 break;
9755 }
9756 min_width -= thousands_sep_len;
9757 }
9758 if (!loop_broken) {
9759 /* We left the loop without using a break statement. */
9760
9761 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9762 n_zeros = Py_MAX(0, len - remaining);
9763 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9764
9765 /* Use n_zero zero's and n_chars chars */
9766 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9767
9768 /* Copy into the writer. */
9769 InsertThousandsGrouping_fill(writer, &buffer_pos,
9770 digits, &digits_pos,
9771 n_chars, n_zeros,
9772 use_separator ? thousands_sep : NULL,
9773 thousands_sep_len, maxchar);
9774 }
9775 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009776}
9777
9778
Alexander Belopolsky40018472011-02-26 01:02:56 +00009779Py_ssize_t
9780PyUnicode_Count(PyObject *str,
9781 PyObject *substr,
9782 Py_ssize_t start,
9783 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009784{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009785 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009786 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009787 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009788 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009789
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009790 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009791 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009792
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009793 kind1 = PyUnicode_KIND(str);
9794 kind2 = PyUnicode_KIND(substr);
9795 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009796 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009797
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009798 len1 = PyUnicode_GET_LENGTH(str);
9799 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009800 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009801 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009802 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009803
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009804 buf1 = PyUnicode_DATA(str);
9805 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009806 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009807 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009808 if (!buf2)
9809 goto onError;
9810 }
9811
9812 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009814 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009815 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009816 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009817 buf2, len2, PY_SSIZE_T_MAX
9818 );
9819 else
9820 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009821 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009822 buf2, len2, PY_SSIZE_T_MAX
9823 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 break;
9825 case PyUnicode_2BYTE_KIND:
9826 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009827 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 buf2, len2, PY_SSIZE_T_MAX
9829 );
9830 break;
9831 case PyUnicode_4BYTE_KIND:
9832 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009833 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 buf2, len2, PY_SSIZE_T_MAX
9835 );
9836 break;
9837 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009838 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009840
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009841 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009842 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009843 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009847 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9848 if (kind2 != kind1)
9849 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851}
9852
Alexander Belopolsky40018472011-02-26 01:02:56 +00009853Py_ssize_t
9854PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009855 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009856 Py_ssize_t start,
9857 Py_ssize_t end,
9858 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009860 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009861 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009862
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009863 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864}
9865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866Py_ssize_t
9867PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9868 Py_ssize_t start, Py_ssize_t end,
9869 int direction)
9870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009872 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 if (PyUnicode_READY(str) == -1)
9874 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009875 len = PyUnicode_GET_LENGTH(str);
9876 ADJUST_INDICES(start, end, len);
9877 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009878 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009879 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009880 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9881 kind, end-start, ch, direction);
9882 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009884 else
9885 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886}
9887
Alexander Belopolsky40018472011-02-26 01:02:56 +00009888static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009889tailmatch(PyObject *self,
9890 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009891 Py_ssize_t start,
9892 Py_ssize_t end,
9893 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 int kind_self;
9896 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009897 const void *data_self;
9898 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 Py_ssize_t offset;
9900 Py_ssize_t i;
9901 Py_ssize_t end_sub;
9902
9903 if (PyUnicode_READY(self) == -1 ||
9904 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009905 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9908 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009910 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009912 if (PyUnicode_GET_LENGTH(substring) == 0)
9913 return 1;
9914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 kind_self = PyUnicode_KIND(self);
9916 data_self = PyUnicode_DATA(self);
9917 kind_sub = PyUnicode_KIND(substring);
9918 data_sub = PyUnicode_DATA(substring);
9919 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9920
9921 if (direction > 0)
9922 offset = end;
9923 else
9924 offset = start;
9925
9926 if (PyUnicode_READ(kind_self, data_self, offset) ==
9927 PyUnicode_READ(kind_sub, data_sub, 0) &&
9928 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9929 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9930 /* If both are of the same kind, memcmp is sufficient */
9931 if (kind_self == kind_sub) {
9932 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009933 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 data_sub,
9935 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009936 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009938 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 else {
9940 /* We do not need to compare 0 and len(substring)-1 because
9941 the if statement above ensured already that they are equal
9942 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 for (i = 1; i < end_sub; ++i) {
9944 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9945 PyUnicode_READ(kind_sub, data_sub, i))
9946 return 0;
9947 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009948 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950 }
9951
9952 return 0;
9953}
9954
Alexander Belopolsky40018472011-02-26 01:02:56 +00009955Py_ssize_t
9956PyUnicode_Tailmatch(PyObject *str,
9957 PyObject *substr,
9958 Py_ssize_t start,
9959 Py_ssize_t end,
9960 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009962 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009963 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009964
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009965 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966}
9967
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009968static PyObject *
9969ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009971 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009972 const char *data = PyUnicode_DATA(self);
9973 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009974 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009975
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009976 res = PyUnicode_New(len, 127);
9977 if (res == NULL)
9978 return NULL;
9979 resdata = PyUnicode_DATA(res);
9980 if (lower)
9981 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009983 _Py_bytes_upper(resdata, data, len);
9984 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985}
9986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009988handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009990 Py_ssize_t j;
9991 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009992 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009993 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009994
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009995 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9996
9997 where ! is a negation and \p{xxx} is a character with property xxx.
9998 */
9999 for (j = i - 1; j >= 0; j--) {
10000 c = PyUnicode_READ(kind, data, j);
10001 if (!_PyUnicode_IsCaseIgnorable(c))
10002 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010004 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10005 if (final_sigma) {
10006 for (j = i + 1; j < length; j++) {
10007 c = PyUnicode_READ(kind, data, j);
10008 if (!_PyUnicode_IsCaseIgnorable(c))
10009 break;
10010 }
10011 final_sigma = j == length || !_PyUnicode_IsCased(c);
10012 }
10013 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014}
10015
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010016static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010017lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010018 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010020 /* Obscure special case. */
10021 if (c == 0x3A3) {
10022 mapped[0] = handle_capital_sigma(kind, data, length, i);
10023 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010025 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026}
10027
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010028static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010029do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010031 Py_ssize_t i, k = 0;
10032 int n_res, j;
10033 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010034
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010035 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010036 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010037 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010038 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010039 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010041 for (i = 1; i < length; i++) {
10042 c = PyUnicode_READ(kind, data, i);
10043 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10044 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010045 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010046 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010047 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010048 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010049 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050}
10051
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010052static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010053do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010054 Py_ssize_t i, k = 0;
10055
10056 for (i = 0; i < length; i++) {
10057 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10058 int n_res, j;
10059 if (Py_UNICODE_ISUPPER(c)) {
10060 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10061 }
10062 else if (Py_UNICODE_ISLOWER(c)) {
10063 n_res = _PyUnicode_ToUpperFull(c, mapped);
10064 }
10065 else {
10066 n_res = 1;
10067 mapped[0] = c;
10068 }
10069 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010070 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010071 res[k++] = mapped[j];
10072 }
10073 }
10074 return k;
10075}
10076
10077static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010078do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010079 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010080{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010081 Py_ssize_t i, k = 0;
10082
10083 for (i = 0; i < length; i++) {
10084 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10085 int n_res, j;
10086 if (lower)
10087 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10088 else
10089 n_res = _PyUnicode_ToUpperFull(c, mapped);
10090 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010091 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010092 res[k++] = mapped[j];
10093 }
10094 }
10095 return k;
10096}
10097
10098static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010099do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010100{
10101 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10102}
10103
10104static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010105do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010106{
10107 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10108}
10109
Benjamin Petersone51757f2012-01-12 21:10:29 -050010110static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010111do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010112{
10113 Py_ssize_t i, k = 0;
10114
10115 for (i = 0; i < length; i++) {
10116 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10117 Py_UCS4 mapped[3];
10118 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10119 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010120 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010121 res[k++] = mapped[j];
10122 }
10123 }
10124 return k;
10125}
10126
10127static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010128do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010129{
10130 Py_ssize_t i, k = 0;
10131 int previous_is_cased;
10132
10133 previous_is_cased = 0;
10134 for (i = 0; i < length; i++) {
10135 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10136 Py_UCS4 mapped[3];
10137 int n_res, j;
10138
10139 if (previous_is_cased)
10140 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10141 else
10142 n_res = _PyUnicode_ToTitleFull(c, mapped);
10143
10144 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010145 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010146 res[k++] = mapped[j];
10147 }
10148
10149 previous_is_cased = _PyUnicode_IsCased(c);
10150 }
10151 return k;
10152}
10153
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010154static PyObject *
10155case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010156 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010157{
10158 PyObject *res = NULL;
10159 Py_ssize_t length, newlength = 0;
10160 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010161 const void *data;
10162 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010163 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10164
Benjamin Petersoneea48462012-01-16 14:28:50 -050010165 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010166
10167 kind = PyUnicode_KIND(self);
10168 data = PyUnicode_DATA(self);
10169 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010170 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010171 PyErr_SetString(PyExc_OverflowError, "string is too long");
10172 return NULL;
10173 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010174 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010175 if (tmp == NULL)
10176 return PyErr_NoMemory();
10177 newlength = perform(kind, data, length, tmp, &maxchar);
10178 res = PyUnicode_New(newlength, maxchar);
10179 if (res == NULL)
10180 goto leave;
10181 tmpend = tmp + newlength;
10182 outdata = PyUnicode_DATA(res);
10183 outkind = PyUnicode_KIND(res);
10184 switch (outkind) {
10185 case PyUnicode_1BYTE_KIND:
10186 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10187 break;
10188 case PyUnicode_2BYTE_KIND:
10189 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10190 break;
10191 case PyUnicode_4BYTE_KIND:
10192 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10193 break;
10194 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010195 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010196 }
10197 leave:
10198 PyMem_FREE(tmp);
10199 return res;
10200}
10201
Tim Peters8ce9f162004-08-27 01:49:32 +000010202PyObject *
10203PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010205 PyObject *res;
10206 PyObject *fseq;
10207 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010208 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010210 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010211 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010212 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010213 }
10214
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010215 /* NOTE: the following code can't call back into Python code,
10216 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010217 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010218
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010219 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010220 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010221 res = _PyUnicode_JoinArray(separator, items, seqlen);
10222 Py_DECREF(fseq);
10223 return res;
10224}
10225
10226PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010227_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010228{
10229 PyObject *res = NULL; /* the result */
10230 PyObject *sep = NULL;
10231 Py_ssize_t seplen;
10232 PyObject *item;
10233 Py_ssize_t sz, i, res_offset;
10234 Py_UCS4 maxchar;
10235 Py_UCS4 item_maxchar;
10236 int use_memcpy;
10237 unsigned char *res_data = NULL, *sep_data = NULL;
10238 PyObject *last_obj;
10239 unsigned int kind = 0;
10240
Tim Peters05eba1f2004-08-27 21:32:02 +000010241 /* If empty sequence, return u"". */
10242 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010243 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010244 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010245
Tim Peters05eba1f2004-08-27 21:32:02 +000010246 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010247 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010248 if (seqlen == 1) {
10249 if (PyUnicode_CheckExact(items[0])) {
10250 res = items[0];
10251 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010252 return res;
10253 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010254 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010255 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010256 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010257 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010258 /* Set up sep and seplen */
10259 if (separator == NULL) {
10260 /* fall back to a blank space separator */
10261 sep = PyUnicode_FromOrdinal(' ');
10262 if (!sep)
10263 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010264 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010265 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010266 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010267 else {
10268 if (!PyUnicode_Check(separator)) {
10269 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010270 "separator: expected str instance,"
10271 " %.80s found",
10272 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010273 goto onError;
10274 }
10275 if (PyUnicode_READY(separator))
10276 goto onError;
10277 sep = separator;
10278 seplen = PyUnicode_GET_LENGTH(separator);
10279 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10280 /* inc refcount to keep this code path symmetric with the
10281 above case of a blank separator */
10282 Py_INCREF(sep);
10283 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010284 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010285 }
10286
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010287 /* There are at least two things to join, or else we have a subclass
10288 * of str in the sequence.
10289 * Do a pre-pass to figure out the total amount of space we'll
10290 * need (sz), and see whether all argument are strings.
10291 */
10292 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010293#ifdef Py_DEBUG
10294 use_memcpy = 0;
10295#else
10296 use_memcpy = 1;
10297#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010298 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010299 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010300 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010301 if (!PyUnicode_Check(item)) {
10302 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010303 "sequence item %zd: expected str instance,"
10304 " %.80s found",
10305 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010306 goto onError;
10307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 if (PyUnicode_READY(item) == -1)
10309 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010310 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010312 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010313 if (i != 0) {
10314 add_sz += seplen;
10315 }
10316 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010317 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010318 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010319 goto onError;
10320 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010321 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010322 if (use_memcpy && last_obj != NULL) {
10323 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10324 use_memcpy = 0;
10325 }
10326 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010327 }
Tim Petersced69f82003-09-16 20:30:58 +000010328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010330 if (res == NULL)
10331 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010332
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010333 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010334#ifdef Py_DEBUG
10335 use_memcpy = 0;
10336#else
10337 if (use_memcpy) {
10338 res_data = PyUnicode_1BYTE_DATA(res);
10339 kind = PyUnicode_KIND(res);
10340 if (seplen != 0)
10341 sep_data = PyUnicode_1BYTE_DATA(sep);
10342 }
10343#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010344 if (use_memcpy) {
10345 for (i = 0; i < seqlen; ++i) {
10346 Py_ssize_t itemlen;
10347 item = items[i];
10348
10349 /* Copy item, and maybe the separator. */
10350 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010351 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010352 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010353 kind * seplen);
10354 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010355 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010356
10357 itemlen = PyUnicode_GET_LENGTH(item);
10358 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010359 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010360 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010361 kind * itemlen);
10362 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010363 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010364 }
10365 assert(res_data == PyUnicode_1BYTE_DATA(res)
10366 + kind * PyUnicode_GET_LENGTH(res));
10367 }
10368 else {
10369 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10370 Py_ssize_t itemlen;
10371 item = items[i];
10372
10373 /* Copy item, and maybe the separator. */
10374 if (i && seplen != 0) {
10375 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10376 res_offset += seplen;
10377 }
10378
10379 itemlen = PyUnicode_GET_LENGTH(item);
10380 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010381 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010382 res_offset += itemlen;
10383 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010384 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010385 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010386 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010389 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391
Benjamin Peterson29060642009-01-31 22:14:21 +000010392 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010394 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395 return NULL;
10396}
10397
Victor Stinnerd3f08822012-05-29 12:57:52 +020010398void
10399_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10400 Py_UCS4 fill_char)
10401{
10402 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010403 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010404 assert(PyUnicode_IS_READY(unicode));
10405 assert(unicode_modifiable(unicode));
10406 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10407 assert(start >= 0);
10408 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010409 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010410}
10411
Victor Stinner3fe55312012-01-04 00:33:50 +010010412Py_ssize_t
10413PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10414 Py_UCS4 fill_char)
10415{
10416 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010417
10418 if (!PyUnicode_Check(unicode)) {
10419 PyErr_BadInternalCall();
10420 return -1;
10421 }
10422 if (PyUnicode_READY(unicode) == -1)
10423 return -1;
10424 if (unicode_check_modifiable(unicode))
10425 return -1;
10426
Victor Stinnerd3f08822012-05-29 12:57:52 +020010427 if (start < 0) {
10428 PyErr_SetString(PyExc_IndexError, "string index out of range");
10429 return -1;
10430 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010431 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10432 PyErr_SetString(PyExc_ValueError,
10433 "fill character is bigger than "
10434 "the string maximum character");
10435 return -1;
10436 }
10437
10438 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10439 length = Py_MIN(maxlen, length);
10440 if (length <= 0)
10441 return 0;
10442
Victor Stinnerd3f08822012-05-29 12:57:52 +020010443 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010444 return length;
10445}
10446
Victor Stinner9310abb2011-10-05 00:59:23 +020010447static PyObject *
10448pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010449 Py_ssize_t left,
10450 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 PyObject *u;
10454 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010455 int kind;
10456 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010457
10458 if (left < 0)
10459 left = 0;
10460 if (right < 0)
10461 right = 0;
10462
Victor Stinnerc4b49542011-12-11 22:44:26 +010010463 if (left == 0 && right == 0)
10464 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10467 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010468 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10469 return NULL;
10470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010472 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010474 if (!u)
10475 return NULL;
10476
10477 kind = PyUnicode_KIND(u);
10478 data = PyUnicode_DATA(u);
10479 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010480 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010481 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010482 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010483 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010484 assert(_PyUnicode_CheckConsistency(u, 1));
10485 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486}
10487
Alexander Belopolsky40018472011-02-26 01:02:56 +000010488PyObject *
10489PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010493 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495
Benjamin Petersonead6b532011-12-20 17:23:42 -060010496 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010498 if (PyUnicode_IS_ASCII(string))
10499 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010500 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010501 PyUnicode_GET_LENGTH(string), keepends);
10502 else
10503 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010504 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010505 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 break;
10507 case PyUnicode_2BYTE_KIND:
10508 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010509 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 PyUnicode_GET_LENGTH(string), keepends);
10511 break;
10512 case PyUnicode_4BYTE_KIND:
10513 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010514 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 PyUnicode_GET_LENGTH(string), keepends);
10516 break;
10517 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010518 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521}
10522
Alexander Belopolsky40018472011-02-26 01:02:56 +000010523static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010524split(PyObject *self,
10525 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010526 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010528 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010529 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 Py_ssize_t len1, len2;
10531 PyObject* out;
10532
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010534 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 if (PyUnicode_READY(self) == -1)
10537 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010540 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010542 if (PyUnicode_IS_ASCII(self))
10543 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010544 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010545 PyUnicode_GET_LENGTH(self), maxcount
10546 );
10547 else
10548 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010549 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010550 PyUnicode_GET_LENGTH(self), maxcount
10551 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 case PyUnicode_2BYTE_KIND:
10553 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010554 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 PyUnicode_GET_LENGTH(self), maxcount
10556 );
10557 case PyUnicode_4BYTE_KIND:
10558 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010559 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 PyUnicode_GET_LENGTH(self), maxcount
10561 );
10562 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010563 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 }
10565
10566 if (PyUnicode_READY(substring) == -1)
10567 return NULL;
10568
10569 kind1 = PyUnicode_KIND(self);
10570 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 len1 = PyUnicode_GET_LENGTH(self);
10572 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010573 if (kind1 < kind2 || len1 < len2) {
10574 out = PyList_New(1);
10575 if (out == NULL)
10576 return NULL;
10577 Py_INCREF(self);
10578 PyList_SET_ITEM(out, 0, self);
10579 return out;
10580 }
10581 buf1 = PyUnicode_DATA(self);
10582 buf2 = PyUnicode_DATA(substring);
10583 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010584 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010585 if (!buf2)
10586 return NULL;
10587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010589 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010591 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10592 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010593 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010594 else
10595 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010596 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 break;
10598 case PyUnicode_2BYTE_KIND:
10599 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010600 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 break;
10602 case PyUnicode_4BYTE_KIND:
10603 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010604 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 break;
10606 default:
10607 out = NULL;
10608 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010609 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010610 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010611 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613}
10614
Alexander Belopolsky40018472011-02-26 01:02:56 +000010615static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010616rsplit(PyObject *self,
10617 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010618 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010619{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010620 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010621 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 Py_ssize_t len1, len2;
10623 PyObject* out;
10624
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010625 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010626 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 if (PyUnicode_READY(self) == -1)
10629 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010632 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010634 if (PyUnicode_IS_ASCII(self))
10635 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010636 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010637 PyUnicode_GET_LENGTH(self), maxcount
10638 );
10639 else
10640 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010641 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010642 PyUnicode_GET_LENGTH(self), maxcount
10643 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 case PyUnicode_2BYTE_KIND:
10645 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010646 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 PyUnicode_GET_LENGTH(self), maxcount
10648 );
10649 case PyUnicode_4BYTE_KIND:
10650 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010651 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 PyUnicode_GET_LENGTH(self), maxcount
10653 );
10654 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010655 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 }
10657
10658 if (PyUnicode_READY(substring) == -1)
10659 return NULL;
10660
10661 kind1 = PyUnicode_KIND(self);
10662 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 len1 = PyUnicode_GET_LENGTH(self);
10664 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010665 if (kind1 < kind2 || len1 < len2) {
10666 out = PyList_New(1);
10667 if (out == NULL)
10668 return NULL;
10669 Py_INCREF(self);
10670 PyList_SET_ITEM(out, 0, self);
10671 return out;
10672 }
10673 buf1 = PyUnicode_DATA(self);
10674 buf2 = PyUnicode_DATA(substring);
10675 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010676 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010677 if (!buf2)
10678 return NULL;
10679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010681 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010683 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10684 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010685 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010686 else
10687 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010688 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 break;
10690 case PyUnicode_2BYTE_KIND:
10691 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010692 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 break;
10694 case PyUnicode_4BYTE_KIND:
10695 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010696 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 break;
10698 default:
10699 out = NULL;
10700 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010701 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010702 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010703 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 return out;
10705}
10706
10707static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010708anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10709 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010711 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010713 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10714 return asciilib_find(buf1, len1, buf2, len2, offset);
10715 else
10716 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 case PyUnicode_2BYTE_KIND:
10718 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10719 case PyUnicode_4BYTE_KIND:
10720 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10721 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010722 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723}
10724
10725static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010726anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10727 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010729 switch (kind) {
10730 case PyUnicode_1BYTE_KIND:
10731 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10732 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10733 else
10734 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10735 case PyUnicode_2BYTE_KIND:
10736 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10737 case PyUnicode_4BYTE_KIND:
10738 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10739 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010740 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010741}
10742
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010743static void
10744replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10745 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10746{
10747 int kind = PyUnicode_KIND(u);
10748 void *data = PyUnicode_DATA(u);
10749 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10750 if (kind == PyUnicode_1BYTE_KIND) {
10751 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10752 (Py_UCS1 *)data + len,
10753 u1, u2, maxcount);
10754 }
10755 else if (kind == PyUnicode_2BYTE_KIND) {
10756 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10757 (Py_UCS2 *)data + len,
10758 u1, u2, maxcount);
10759 }
10760 else {
10761 assert(kind == PyUnicode_4BYTE_KIND);
10762 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10763 (Py_UCS4 *)data + len,
10764 u1, u2, maxcount);
10765 }
10766}
10767
Alexander Belopolsky40018472011-02-26 01:02:56 +000010768static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769replace(PyObject *self, PyObject *str1,
10770 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010773 const char *sbuf = PyUnicode_DATA(self);
10774 const void *buf1 = PyUnicode_DATA(str1);
10775 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 int srelease = 0, release1 = 0, release2 = 0;
10777 int skind = PyUnicode_KIND(self);
10778 int kind1 = PyUnicode_KIND(str1);
10779 int kind2 = PyUnicode_KIND(str2);
10780 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10781 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10782 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010783 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010784 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010786 if (slen < len1)
10787 goto nothing;
10788
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010790 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010791 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010792 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793
Victor Stinner59de0ee2011-10-07 10:01:28 +020010794 if (str1 == str2)
10795 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796
Victor Stinner49a0a212011-10-12 23:46:10 +020010797 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010798 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10799 if (maxchar < maxchar_str1)
10800 /* substring too wide to be present */
10801 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010802 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10803 /* Replacing str1 with str2 may cause a maxchar reduction in the
10804 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010805 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010806 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010809 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010811 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010813 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010814 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010815 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010816
Victor Stinner69ed0f42013-04-09 21:48:24 +020010817 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010818 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010819 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010820 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010821 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010823 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010825
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010826 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10827 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010828 }
10829 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 int rkind = skind;
10831 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010832 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 if (kind1 < rkind) {
10835 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010836 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837 if (!buf1) goto error;
10838 release1 = 1;
10839 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010840 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010841 if (i < 0)
10842 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 if (rkind > kind2) {
10844 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010845 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 if (!buf2) goto error;
10847 release2 = 1;
10848 }
10849 else if (rkind < kind2) {
10850 /* widen self and buf1 */
10851 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010852 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010853 assert(buf1 != PyUnicode_DATA(str1));
10854 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010855 buf1 = PyUnicode_DATA(str1);
10856 release1 = 0;
10857 }
10858 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 if (!sbuf) goto error;
10860 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010861 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862 if (!buf1) goto error;
10863 release1 = 1;
10864 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010865 u = PyUnicode_New(slen, maxchar);
10866 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010868 assert(PyUnicode_KIND(u) == rkind);
10869 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010870
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010871 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010872 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010873 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010875 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010876 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010877
10878 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010879 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010880 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010881 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010882 if (i == -1)
10883 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010884 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010885 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010886 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010887 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010890 }
10891 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010893 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 int rkind = skind;
10895 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010897 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010898 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010899 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900 if (!buf1) goto error;
10901 release1 = 1;
10902 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010903 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010904 if (n == 0)
10905 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010907 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010908 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909 if (!buf2) goto error;
10910 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010913 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010915 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 if (!sbuf) goto error;
10917 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010918 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010919 assert(buf1 != PyUnicode_DATA(str1));
10920 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010921 buf1 = PyUnicode_DATA(str1);
10922 release1 = 0;
10923 }
10924 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 if (!buf1) goto error;
10926 release1 = 1;
10927 }
10928 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10929 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010930 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 PyErr_SetString(PyExc_OverflowError,
10932 "replace string is too long");
10933 goto error;
10934 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010935 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010936 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020010937 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020010938 goto done;
10939 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010940 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 PyErr_SetString(PyExc_OverflowError,
10942 "replace string is too long");
10943 goto error;
10944 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010945 u = PyUnicode_New(new_size, maxchar);
10946 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010948 assert(PyUnicode_KIND(u) == rkind);
10949 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 ires = i = 0;
10951 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010952 while (n-- > 0) {
10953 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010954 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010955 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010956 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010957 if (j == -1)
10958 break;
10959 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010960 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010961 memcpy(res + rkind * ires,
10962 sbuf + rkind * i,
10963 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010965 }
10966 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010968 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010970 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010974 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010976 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010977 memcpy(res + rkind * ires,
10978 sbuf + rkind * i,
10979 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010980 }
10981 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010982 /* interleave */
10983 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010984 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010986 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010988 if (--n <= 0)
10989 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010990 memcpy(res + rkind * ires,
10991 sbuf + rkind * i,
10992 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 ires++;
10994 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010995 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010996 memcpy(res + rkind * ires,
10997 sbuf + rkind * i,
10998 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010999 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011000 }
11001
11002 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011003 unicode_adjust_maxchar(&u);
11004 if (u == NULL)
11005 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011007
11008 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011009 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11010 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11011 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011013 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011015 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011017 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011018 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011020
Benjamin Peterson29060642009-01-31 22:14:21 +000011021 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011022 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011023 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11024 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11025 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011027 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011029 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011031 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011032 return unicode_result_unchanged(self);
11033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011035 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11036 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11037 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11038 if (srelease)
11039 PyMem_FREE((void *)sbuf);
11040 if (release1)
11041 PyMem_FREE((void *)buf1);
11042 if (release2)
11043 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045}
11046
11047/* --- Unicode Object Methods --------------------------------------------- */
11048
INADA Naoki3ae20562017-01-16 20:41:20 +090011049/*[clinic input]
11050str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051
INADA Naoki3ae20562017-01-16 20:41:20 +090011052Return a version of the string where each word is titlecased.
11053
11054More specifically, words start with uppercased characters and all remaining
11055cased characters have lower case.
11056[clinic start generated code]*/
11057
11058static PyObject *
11059unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011060/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011062 if (PyUnicode_READY(self) == -1)
11063 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011064 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065}
11066
INADA Naoki3ae20562017-01-16 20:41:20 +090011067/*[clinic input]
11068str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
INADA Naoki3ae20562017-01-16 20:41:20 +090011070Return a capitalized version of the string.
11071
11072More specifically, make the first character have upper case and the rest lower
11073case.
11074[clinic start generated code]*/
11075
11076static PyObject *
11077unicode_capitalize_impl(PyObject *self)
11078/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011080 if (PyUnicode_READY(self) == -1)
11081 return NULL;
11082 if (PyUnicode_GET_LENGTH(self) == 0)
11083 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011084 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085}
11086
INADA Naoki3ae20562017-01-16 20:41:20 +090011087/*[clinic input]
11088str.casefold as unicode_casefold
11089
11090Return a version of the string suitable for caseless comparisons.
11091[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011092
11093static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011094unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011095/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011096{
11097 if (PyUnicode_READY(self) == -1)
11098 return NULL;
11099 if (PyUnicode_IS_ASCII(self))
11100 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011101 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011102}
11103
11104
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011105/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011106
11107static int
11108convert_uc(PyObject *obj, void *addr)
11109{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011111
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011112 if (!PyUnicode_Check(obj)) {
11113 PyErr_Format(PyExc_TypeError,
11114 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011115 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011116 return 0;
11117 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011118 if (PyUnicode_READY(obj) < 0)
11119 return 0;
11120 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011121 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011122 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011123 return 0;
11124 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011125 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011126 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011127}
11128
INADA Naoki3ae20562017-01-16 20:41:20 +090011129/*[clinic input]
11130str.center as unicode_center
11131
11132 width: Py_ssize_t
11133 fillchar: Py_UCS4 = ' '
11134 /
11135
11136Return a centered string of length width.
11137
11138Padding is done using the specified fill character (default is a space).
11139[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140
11141static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011142unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11143/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011144{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011145 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146
Benjamin Petersonbac79492012-01-14 13:34:47 -050011147 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148 return NULL;
11149
Victor Stinnerc4b49542011-12-11 22:44:26 +010011150 if (PyUnicode_GET_LENGTH(self) >= width)
11151 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152
Victor Stinnerc4b49542011-12-11 22:44:26 +010011153 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154 left = marg / 2 + (marg & width & 1);
11155
Victor Stinner9310abb2011-10-05 00:59:23 +020011156 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157}
11158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159/* This function assumes that str1 and str2 are readied by the caller. */
11160
Marc-André Lemburge5034372000-08-08 08:04:29 +000011161static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011162unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011163{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011164#define COMPARE(TYPE1, TYPE2) \
11165 do { \
11166 TYPE1* p1 = (TYPE1 *)data1; \
11167 TYPE2* p2 = (TYPE2 *)data2; \
11168 TYPE1* end = p1 + len; \
11169 Py_UCS4 c1, c2; \
11170 for (; p1 != end; p1++, p2++) { \
11171 c1 = *p1; \
11172 c2 = *p2; \
11173 if (c1 != c2) \
11174 return (c1 < c2) ? -1 : 1; \
11175 } \
11176 } \
11177 while (0)
11178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011180 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011181 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 kind1 = PyUnicode_KIND(str1);
11184 kind2 = PyUnicode_KIND(str2);
11185 data1 = PyUnicode_DATA(str1);
11186 data2 = PyUnicode_DATA(str2);
11187 len1 = PyUnicode_GET_LENGTH(str1);
11188 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011189 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011190
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011191 switch(kind1) {
11192 case PyUnicode_1BYTE_KIND:
11193 {
11194 switch(kind2) {
11195 case PyUnicode_1BYTE_KIND:
11196 {
11197 int cmp = memcmp(data1, data2, len);
11198 /* normalize result of memcmp() into the range [-1; 1] */
11199 if (cmp < 0)
11200 return -1;
11201 if (cmp > 0)
11202 return 1;
11203 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011204 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011205 case PyUnicode_2BYTE_KIND:
11206 COMPARE(Py_UCS1, Py_UCS2);
11207 break;
11208 case PyUnicode_4BYTE_KIND:
11209 COMPARE(Py_UCS1, Py_UCS4);
11210 break;
11211 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011212 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011213 }
11214 break;
11215 }
11216 case PyUnicode_2BYTE_KIND:
11217 {
11218 switch(kind2) {
11219 case PyUnicode_1BYTE_KIND:
11220 COMPARE(Py_UCS2, Py_UCS1);
11221 break;
11222 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011223 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011224 COMPARE(Py_UCS2, Py_UCS2);
11225 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011226 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011227 case PyUnicode_4BYTE_KIND:
11228 COMPARE(Py_UCS2, Py_UCS4);
11229 break;
11230 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011231 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011232 }
11233 break;
11234 }
11235 case PyUnicode_4BYTE_KIND:
11236 {
11237 switch(kind2) {
11238 case PyUnicode_1BYTE_KIND:
11239 COMPARE(Py_UCS4, Py_UCS1);
11240 break;
11241 case PyUnicode_2BYTE_KIND:
11242 COMPARE(Py_UCS4, Py_UCS2);
11243 break;
11244 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011245 {
11246#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11247 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11248 /* normalize result of wmemcmp() into the range [-1; 1] */
11249 if (cmp < 0)
11250 return -1;
11251 if (cmp > 0)
11252 return 1;
11253#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011254 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011255#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011256 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011257 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011258 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011259 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011260 }
11261 break;
11262 }
11263 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011264 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011265 }
11266
Victor Stinner770e19e2012-10-04 22:59:45 +020011267 if (len1 == len2)
11268 return 0;
11269 if (len1 < len2)
11270 return -1;
11271 else
11272 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011273
11274#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011275}
11276
Benjamin Peterson621b4302016-09-09 13:54:34 -070011277static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011278unicode_compare_eq(PyObject *str1, PyObject *str2)
11279{
11280 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011281 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011282 Py_ssize_t len;
11283 int cmp;
11284
Victor Stinnere5567ad2012-10-23 02:48:49 +020011285 len = PyUnicode_GET_LENGTH(str1);
11286 if (PyUnicode_GET_LENGTH(str2) != len)
11287 return 0;
11288 kind = PyUnicode_KIND(str1);
11289 if (PyUnicode_KIND(str2) != kind)
11290 return 0;
11291 data1 = PyUnicode_DATA(str1);
11292 data2 = PyUnicode_DATA(str2);
11293
11294 cmp = memcmp(data1, data2, len * kind);
11295 return (cmp == 0);
11296}
11297
11298
Alexander Belopolsky40018472011-02-26 01:02:56 +000011299int
11300PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11303 if (PyUnicode_READY(left) == -1 ||
11304 PyUnicode_READY(right) == -1)
11305 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011306
11307 /* a string is equal to itself */
11308 if (left == right)
11309 return 0;
11310
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011311 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011312 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011313 PyErr_Format(PyExc_TypeError,
11314 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011315 Py_TYPE(left)->tp_name,
11316 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 return -1;
11318}
11319
Martin v. Löwis5b222132007-06-10 09:51:05 +000011320int
11321PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011323 Py_ssize_t i;
11324 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011326 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327
Victor Stinner910337b2011-10-03 03:20:16 +020011328 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011329 if (!PyUnicode_IS_READY(uni)) {
11330 const wchar_t *ws = _PyUnicode_WSTR(uni);
11331 /* Compare Unicode string and source character set string */
11332 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11333 if (chr != ustr[i])
11334 return (chr < ustr[i]) ? -1 : 1;
11335 }
11336 /* This check keeps Python strings that end in '\0' from comparing equal
11337 to C strings identical up to that point. */
11338 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11339 return 1; /* uni is longer */
11340 if (ustr[i])
11341 return -1; /* str is longer */
11342 return 0;
11343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011345 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011346 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011347 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011348 size_t len, len2 = strlen(str);
11349 int cmp;
11350
11351 len = Py_MIN(len1, len2);
11352 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011353 if (cmp != 0) {
11354 if (cmp < 0)
11355 return -1;
11356 else
11357 return 1;
11358 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011359 if (len1 > len2)
11360 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011361 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011362 return -1; /* str is longer */
11363 return 0;
11364 }
11365 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011366 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011367 /* Compare Unicode string and source character set string */
11368 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011369 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011370 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11371 /* This check keeps Python strings that end in '\0' from comparing equal
11372 to C strings identical up to that point. */
11373 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11374 return 1; /* uni is longer */
11375 if (str[i])
11376 return -1; /* str is longer */
11377 return 0;
11378 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011379}
11380
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011381static int
11382non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11383{
11384 size_t i, len;
11385 const wchar_t *p;
11386 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11387 if (strlen(str) != len)
11388 return 0;
11389 p = _PyUnicode_WSTR(unicode);
11390 assert(p);
11391 for (i = 0; i < len; i++) {
11392 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011393 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011394 return 0;
11395 }
11396 return 1;
11397}
11398
11399int
11400_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11401{
11402 size_t len;
11403 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011404 assert(str);
11405#ifndef NDEBUG
11406 for (const char *p = str; *p; p++) {
11407 assert((unsigned char)*p < 128);
11408 }
11409#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011410 if (PyUnicode_READY(unicode) == -1) {
11411 /* Memory error or bad data */
11412 PyErr_Clear();
11413 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11414 }
11415 if (!PyUnicode_IS_ASCII(unicode))
11416 return 0;
11417 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11418 return strlen(str) == len &&
11419 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11420}
11421
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011422int
11423_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11424{
11425 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011426
11427 assert(_PyUnicode_CHECK(left));
11428 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011429#ifndef NDEBUG
11430 for (const char *p = right->string; *p; p++) {
11431 assert((unsigned char)*p < 128);
11432 }
11433#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011434
11435 if (PyUnicode_READY(left) == -1) {
11436 /* memory error or bad data */
11437 PyErr_Clear();
11438 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11439 }
11440
11441 if (!PyUnicode_IS_ASCII(left))
11442 return 0;
11443
11444 right_uni = _PyUnicode_FromId(right); /* borrowed */
11445 if (right_uni == NULL) {
11446 /* memory error or bad data */
11447 PyErr_Clear();
11448 return _PyUnicode_EqualToASCIIString(left, right->string);
11449 }
11450
11451 if (left == right_uni)
11452 return 1;
11453
11454 if (PyUnicode_CHECK_INTERNED(left))
11455 return 0;
11456
Victor Stinner607b1022020-05-05 18:50:30 +020011457#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011458 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011459 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011460 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11461 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011462#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011463
11464 return unicode_compare_eq(left, right_uni);
11465}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011466
Alexander Belopolsky40018472011-02-26 01:02:56 +000011467PyObject *
11468PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011469{
11470 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011471
Victor Stinnere5567ad2012-10-23 02:48:49 +020011472 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11473 Py_RETURN_NOTIMPLEMENTED;
11474
11475 if (PyUnicode_READY(left) == -1 ||
11476 PyUnicode_READY(right) == -1)
11477 return NULL;
11478
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011479 if (left == right) {
11480 switch (op) {
11481 case Py_EQ:
11482 case Py_LE:
11483 case Py_GE:
11484 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011485 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011486 case Py_NE:
11487 case Py_LT:
11488 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011489 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011490 default:
11491 PyErr_BadArgument();
11492 return NULL;
11493 }
11494 }
11495 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011496 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011497 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011498 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011499 }
11500 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011501 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011502 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011503 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011504}
11505
Alexander Belopolsky40018472011-02-26 01:02:56 +000011506int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011507_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11508{
11509 return unicode_eq(aa, bb);
11510}
11511
11512int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011513PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011514{
Victor Stinner77282cb2013-04-14 19:22:47 +020011515 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011516 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011518 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011519
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011520 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011521 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011522 "'in <string>' requires string as left operand, not %.100s",
11523 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011524 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011525 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011526 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011527 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011528 if (ensure_unicode(str) < 0)
11529 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011532 kind2 = PyUnicode_KIND(substr);
11533 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011534 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011536 len2 = PyUnicode_GET_LENGTH(substr);
11537 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011538 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011539 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011540 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011541 if (len2 == 1) {
11542 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11543 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011544 return result;
11545 }
11546 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011547 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011548 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011549 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551
Victor Stinner77282cb2013-04-14 19:22:47 +020011552 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 case PyUnicode_1BYTE_KIND:
11554 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11555 break;
11556 case PyUnicode_2BYTE_KIND:
11557 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11558 break;
11559 case PyUnicode_4BYTE_KIND:
11560 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11561 break;
11562 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011563 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011565
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011566 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011567 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011568 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569
Guido van Rossum403d68b2000-03-13 15:55:09 +000011570 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011571}
11572
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573/* Concat to string or Unicode object giving a new Unicode object. */
11574
Alexander Belopolsky40018472011-02-26 01:02:56 +000011575PyObject *
11576PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011578 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011579 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011580 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011582 if (ensure_unicode(left) < 0)
11583 return NULL;
11584
11585 if (!PyUnicode_Check(right)) {
11586 PyErr_Format(PyExc_TypeError,
11587 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011588 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011589 return NULL;
11590 }
11591 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011592 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593
11594 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011595 PyObject *empty = unicode_get_empty(); // Borrowed reference
11596 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011597 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011598 }
11599 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011600 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011603 left_len = PyUnicode_GET_LENGTH(left);
11604 right_len = PyUnicode_GET_LENGTH(right);
11605 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011606 PyErr_SetString(PyExc_OverflowError,
11607 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011608 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011609 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011610 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011611
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011612 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11613 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011614 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011617 result = PyUnicode_New(new_len, maxchar);
11618 if (result == NULL)
11619 return NULL;
11620 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11621 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11622 assert(_PyUnicode_CheckConsistency(result, 1));
11623 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624}
11625
Walter Dörwald1ab83302007-05-18 17:15:44 +000011626void
Victor Stinner23e56682011-10-03 03:54:37 +020011627PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011628{
Victor Stinner23e56682011-10-03 03:54:37 +020011629 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011630 Py_UCS4 maxchar, maxchar2;
11631 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011632
11633 if (p_left == NULL) {
11634 if (!PyErr_Occurred())
11635 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011636 return;
11637 }
Victor Stinner23e56682011-10-03 03:54:37 +020011638 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011639 if (right == NULL || left == NULL
11640 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011641 if (!PyErr_Occurred())
11642 PyErr_BadInternalCall();
11643 goto error;
11644 }
11645
Benjamin Petersonbac79492012-01-14 13:34:47 -050011646 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011647 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011648 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011649 goto error;
11650
Victor Stinner488fa492011-12-12 00:01:39 +010011651 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011652 PyObject *empty = unicode_get_empty(); // Borrowed reference
11653 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011654 Py_DECREF(left);
11655 Py_INCREF(right);
11656 *p_left = right;
11657 return;
11658 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011659 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011660 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011661 }
Victor Stinner488fa492011-12-12 00:01:39 +010011662
11663 left_len = PyUnicode_GET_LENGTH(left);
11664 right_len = PyUnicode_GET_LENGTH(right);
11665 if (left_len > PY_SSIZE_T_MAX - right_len) {
11666 PyErr_SetString(PyExc_OverflowError,
11667 "strings are too large to concat");
11668 goto error;
11669 }
11670 new_len = left_len + right_len;
11671
11672 if (unicode_modifiable(left)
11673 && PyUnicode_CheckExact(right)
11674 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011675 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11676 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011677 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011678 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011679 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11680 {
11681 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011682 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011683 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011684
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011685 /* copy 'right' into the newly allocated area of 'left' */
11686 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011687 }
Victor Stinner488fa492011-12-12 00:01:39 +010011688 else {
11689 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11690 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011691 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011692
Victor Stinner488fa492011-12-12 00:01:39 +010011693 /* Concat the two Unicode strings */
11694 res = PyUnicode_New(new_len, maxchar);
11695 if (res == NULL)
11696 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011697 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11698 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011699 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011700 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011701 }
11702 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011703 return;
11704
11705error:
Victor Stinner488fa492011-12-12 00:01:39 +010011706 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011707}
11708
11709void
11710PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11711{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011712 PyUnicode_Append(pleft, right);
11713 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011714}
11715
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011716/*
11717Wraps stringlib_parse_args_finds() and additionally ensures that the
11718first argument is a unicode object.
11719*/
11720
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011721static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011722parse_args_finds_unicode(const char * function_name, PyObject *args,
11723 PyObject **substring,
11724 Py_ssize_t *start, Py_ssize_t *end)
11725{
11726 if(stringlib_parse_args_finds(function_name, args, substring,
11727 start, end)) {
11728 if (ensure_unicode(*substring) < 0)
11729 return 0;
11730 return 1;
11731 }
11732 return 0;
11733}
11734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011735PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011736 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011738Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011739string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011740interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741
11742static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011743unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011745 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011746 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011747 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011749 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011750 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011753 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011754 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 kind1 = PyUnicode_KIND(self);
11757 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011758 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011759 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 len1 = PyUnicode_GET_LENGTH(self);
11762 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011764 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011765 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011766
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011767 buf1 = PyUnicode_DATA(self);
11768 buf2 = PyUnicode_DATA(substring);
11769 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011770 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011771 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011772 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011773 }
11774 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 case PyUnicode_1BYTE_KIND:
11776 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011777 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 buf2, len2, PY_SSIZE_T_MAX
11779 );
11780 break;
11781 case PyUnicode_2BYTE_KIND:
11782 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011783 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 buf2, len2, PY_SSIZE_T_MAX
11785 );
11786 break;
11787 case PyUnicode_4BYTE_KIND:
11788 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011789 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 buf2, len2, PY_SSIZE_T_MAX
11791 );
11792 break;
11793 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011794 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 }
11796
11797 result = PyLong_FromSsize_t(iresult);
11798
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011799 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011800 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011801 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803 return result;
11804}
11805
INADA Naoki3ae20562017-01-16 20:41:20 +090011806/*[clinic input]
11807str.encode as unicode_encode
11808
11809 encoding: str(c_default="NULL") = 'utf-8'
11810 The encoding in which to encode the string.
11811 errors: str(c_default="NULL") = 'strict'
11812 The error handling scheme to use for encoding errors.
11813 The default is 'strict' meaning that encoding errors raise a
11814 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11815 'xmlcharrefreplace' as well as any other name registered with
11816 codecs.register_error that can handle UnicodeEncodeErrors.
11817
11818Encode the string using the codec registered for encoding.
11819[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820
11821static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011822unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011823/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011825 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011826}
11827
INADA Naoki3ae20562017-01-16 20:41:20 +090011828/*[clinic input]
11829str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830
INADA Naoki3ae20562017-01-16 20:41:20 +090011831 tabsize: int = 8
11832
11833Return a copy where all tab characters are expanded using spaces.
11834
11835If tabsize is not given, a tab size of 8 characters is assumed.
11836[clinic start generated code]*/
11837
11838static PyObject *
11839unicode_expandtabs_impl(PyObject *self, int tabsize)
11840/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011842 Py_ssize_t i, j, line_pos, src_len, incr;
11843 Py_UCS4 ch;
11844 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011845 const void *src_data;
11846 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011847 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011848 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849
Antoine Pitrou22425222011-10-04 19:10:51 +020011850 if (PyUnicode_READY(self) == -1)
11851 return NULL;
11852
Thomas Wouters7e474022000-07-16 12:04:32 +000011853 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011854 src_len = PyUnicode_GET_LENGTH(self);
11855 i = j = line_pos = 0;
11856 kind = PyUnicode_KIND(self);
11857 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011858 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011859 for (; i < src_len; i++) {
11860 ch = PyUnicode_READ(kind, src_data, i);
11861 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011862 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011863 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011864 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011866 goto overflow;
11867 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011869 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011872 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011873 goto overflow;
11874 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011876 if (ch == '\n' || ch == '\r')
11877 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011879 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011880 if (!found)
11881 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011882
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011884 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885 if (!u)
11886 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011887 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888
Antoine Pitroue71d5742011-10-04 15:55:09 +020011889 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
Antoine Pitroue71d5742011-10-04 15:55:09 +020011891 for (; i < src_len; i++) {
11892 ch = PyUnicode_READ(kind, src_data, i);
11893 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011894 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011895 incr = tabsize - (line_pos % tabsize);
11896 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011897 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011898 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011899 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011900 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011901 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011902 line_pos++;
11903 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011904 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011905 if (ch == '\n' || ch == '\r')
11906 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011908 }
11909 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011910 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011911
Antoine Pitroue71d5742011-10-04 15:55:09 +020011912 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011913 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915}
11916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011917PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919\n\
11920Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011921such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922arguments start and end are interpreted as in slice notation.\n\
11923\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011924Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925
11926static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011929 /* initialize variables to prevent gcc warning */
11930 PyObject *substring = NULL;
11931 Py_ssize_t start = 0;
11932 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011933 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011935 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011938 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011941 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 if (result == -2)
11944 return NULL;
11945
Christian Heimes217cfd12007-12-02 14:31:20 +000011946 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947}
11948
11949static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011950unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011952 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011953 enum PyUnicode_Kind kind;
11954 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011955
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011956 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011957 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011959 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011960 if (PyUnicode_READY(self) == -1) {
11961 return NULL;
11962 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011963 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11964 PyErr_SetString(PyExc_IndexError, "string index out of range");
11965 return NULL;
11966 }
11967 kind = PyUnicode_KIND(self);
11968 data = PyUnicode_DATA(self);
11969 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011970 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971}
11972
Guido van Rossumc2504932007-09-18 19:42:40 +000011973/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011974 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011975static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011976unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011978 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011979
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011980#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011981 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011982#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 if (_PyUnicode_HASH(self) != -1)
11984 return _PyUnicode_HASH(self);
11985 if (PyUnicode_READY(self) == -1)
11986 return -1;
animalizea1d14252019-01-02 20:16:06 +080011987
Christian Heimes985ecdc2013-11-20 11:46:18 +010011988 x = _Py_HashBytes(PyUnicode_DATA(self),
11989 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011991 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992}
11993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011994PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996\n\
oldkaa0735f2018-02-02 16:52:55 +080011997Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011998such that sub is contained within S[start:end]. Optional\n\
11999arguments start and end are interpreted as in slice notation.\n\
12000\n\
12001Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002
12003static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012006 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012007 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012008 PyObject *substring = NULL;
12009 Py_ssize_t start = 0;
12010 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012012 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012015 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012018 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 if (result == -2)
12021 return NULL;
12022
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023 if (result < 0) {
12024 PyErr_SetString(PyExc_ValueError, "substring not found");
12025 return NULL;
12026 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012027
Christian Heimes217cfd12007-12-02 14:31:20 +000012028 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029}
12030
INADA Naoki3ae20562017-01-16 20:41:20 +090012031/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012032str.isascii as unicode_isascii
12033
12034Return True if all characters in the string are ASCII, False otherwise.
12035
12036ASCII characters have code points in the range U+0000-U+007F.
12037Empty string is ASCII too.
12038[clinic start generated code]*/
12039
12040static PyObject *
12041unicode_isascii_impl(PyObject *self)
12042/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12043{
12044 if (PyUnicode_READY(self) == -1) {
12045 return NULL;
12046 }
12047 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12048}
12049
12050/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012051str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052
INADA Naoki3ae20562017-01-16 20:41:20 +090012053Return True if the string is a lowercase string, False otherwise.
12054
12055A string is lowercase if all cased characters in the string are lowercase and
12056there is at least one cased character in the string.
12057[clinic start generated code]*/
12058
12059static PyObject *
12060unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012061/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 Py_ssize_t i, length;
12064 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012065 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066 int cased;
12067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 if (PyUnicode_READY(self) == -1)
12069 return NULL;
12070 length = PyUnicode_GET_LENGTH(self);
12071 kind = PyUnicode_KIND(self);
12072 data = PyUnicode_DATA(self);
12073
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 if (length == 1)
12076 return PyBool_FromLong(
12077 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012079 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012081 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012082
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012084 for (i = 0; i < length; i++) {
12085 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012086
Benjamin Peterson29060642009-01-31 22:14:21 +000012087 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012088 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012089 else if (!cased && Py_UNICODE_ISLOWER(ch))
12090 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012092 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093}
12094
INADA Naoki3ae20562017-01-16 20:41:20 +090012095/*[clinic input]
12096str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097
INADA Naoki3ae20562017-01-16 20:41:20 +090012098Return True if the string is an uppercase string, False otherwise.
12099
12100A string is uppercase if all cased characters in the string are uppercase and
12101there is at least one cased character in the string.
12102[clinic start generated code]*/
12103
12104static PyObject *
12105unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012106/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 Py_ssize_t i, length;
12109 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012110 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111 int cased;
12112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 if (PyUnicode_READY(self) == -1)
12114 return NULL;
12115 length = PyUnicode_GET_LENGTH(self);
12116 kind = PyUnicode_KIND(self);
12117 data = PyUnicode_DATA(self);
12118
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 if (length == 1)
12121 return PyBool_FromLong(
12122 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012124 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012126 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012127
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 for (i = 0; i < length; i++) {
12130 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012131
Benjamin Peterson29060642009-01-31 22:14:21 +000012132 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012133 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 else if (!cased && Py_UNICODE_ISUPPER(ch))
12135 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012137 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138}
12139
INADA Naoki3ae20562017-01-16 20:41:20 +090012140/*[clinic input]
12141str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012142
INADA Naoki3ae20562017-01-16 20:41:20 +090012143Return True if the string is a title-cased string, False otherwise.
12144
12145In a title-cased string, upper- and title-case characters may only
12146follow uncased characters and lowercase characters only cased ones.
12147[clinic start generated code]*/
12148
12149static PyObject *
12150unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012151/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 Py_ssize_t i, length;
12154 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012155 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156 int cased, previous_is_cased;
12157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 if (PyUnicode_READY(self) == -1)
12159 return NULL;
12160 length = PyUnicode_GET_LENGTH(self);
12161 kind = PyUnicode_KIND(self);
12162 data = PyUnicode_DATA(self);
12163
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 if (length == 1) {
12166 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12167 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12168 (Py_UNICODE_ISUPPER(ch) != 0));
12169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012171 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012173 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012174
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175 cased = 0;
12176 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 for (i = 0; i < length; i++) {
12178 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012179
Benjamin Peterson29060642009-01-31 22:14:21 +000012180 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12181 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012182 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012183 previous_is_cased = 1;
12184 cased = 1;
12185 }
12186 else if (Py_UNICODE_ISLOWER(ch)) {
12187 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012188 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012189 previous_is_cased = 1;
12190 cased = 1;
12191 }
12192 else
12193 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012195 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196}
12197
INADA Naoki3ae20562017-01-16 20:41:20 +090012198/*[clinic input]
12199str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200
INADA Naoki3ae20562017-01-16 20:41:20 +090012201Return True if the string is a whitespace string, False otherwise.
12202
12203A string is whitespace if all characters in the string are whitespace and there
12204is at least one character in the string.
12205[clinic start generated code]*/
12206
12207static PyObject *
12208unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012209/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 Py_ssize_t i, length;
12212 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012213 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214
12215 if (PyUnicode_READY(self) == -1)
12216 return NULL;
12217 length = PyUnicode_GET_LENGTH(self);
12218 kind = PyUnicode_KIND(self);
12219 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 if (length == 1)
12223 return PyBool_FromLong(
12224 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012226 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012228 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 for (i = 0; i < length; i++) {
12231 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012232 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012233 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012235 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236}
12237
INADA Naoki3ae20562017-01-16 20:41:20 +090012238/*[clinic input]
12239str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012240
INADA Naoki3ae20562017-01-16 20:41:20 +090012241Return True if the string is an alphabetic string, False otherwise.
12242
12243A string is alphabetic if all characters in the string are alphabetic and there
12244is at least one character in the string.
12245[clinic start generated code]*/
12246
12247static PyObject *
12248unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012249/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251 Py_ssize_t i, length;
12252 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012253 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254
12255 if (PyUnicode_READY(self) == -1)
12256 return NULL;
12257 length = PyUnicode_GET_LENGTH(self);
12258 kind = PyUnicode_KIND(self);
12259 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012260
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012261 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262 if (length == 1)
12263 return PyBool_FromLong(
12264 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012265
12266 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012268 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 for (i = 0; i < length; i++) {
12271 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012272 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012273 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012274 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012275}
12276
INADA Naoki3ae20562017-01-16 20:41:20 +090012277/*[clinic input]
12278str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012279
INADA Naoki3ae20562017-01-16 20:41:20 +090012280Return True if the string is an alpha-numeric string, False otherwise.
12281
12282A string is alpha-numeric if all characters in the string are alpha-numeric and
12283there is at least one character in the string.
12284[clinic start generated code]*/
12285
12286static PyObject *
12287unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012288/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012289{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012291 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 Py_ssize_t len, i;
12293
12294 if (PyUnicode_READY(self) == -1)
12295 return NULL;
12296
12297 kind = PyUnicode_KIND(self);
12298 data = PyUnicode_DATA(self);
12299 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012300
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012301 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 if (len == 1) {
12303 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12304 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12305 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012306
12307 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012309 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 for (i = 0; i < len; i++) {
12312 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012313 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012314 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012315 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012316 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012317}
12318
INADA Naoki3ae20562017-01-16 20:41:20 +090012319/*[clinic input]
12320str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321
INADA Naoki3ae20562017-01-16 20:41:20 +090012322Return True if the string is a decimal string, False otherwise.
12323
12324A string is a decimal string if all characters in the string are decimal and
12325there is at least one character in the string.
12326[clinic start generated code]*/
12327
12328static PyObject *
12329unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012330/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 Py_ssize_t i, length;
12333 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012334 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335
12336 if (PyUnicode_READY(self) == -1)
12337 return NULL;
12338 length = PyUnicode_GET_LENGTH(self);
12339 kind = PyUnicode_KIND(self);
12340 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 if (length == 1)
12344 return PyBool_FromLong(
12345 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012347 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012349 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 for (i = 0; i < length; i++) {
12352 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012353 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012354 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012355 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356}
12357
INADA Naoki3ae20562017-01-16 20:41:20 +090012358/*[clinic input]
12359str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360
INADA Naoki3ae20562017-01-16 20:41:20 +090012361Return True if the string is a digit string, False otherwise.
12362
12363A string is a digit string if all characters in the string are digits and there
12364is at least one character in the string.
12365[clinic start generated code]*/
12366
12367static PyObject *
12368unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012369/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 Py_ssize_t i, length;
12372 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012373 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374
12375 if (PyUnicode_READY(self) == -1)
12376 return NULL;
12377 length = PyUnicode_GET_LENGTH(self);
12378 kind = PyUnicode_KIND(self);
12379 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 if (length == 1) {
12383 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12384 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12385 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012387 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012389 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 for (i = 0; i < length; i++) {
12392 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012393 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012395 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396}
12397
INADA Naoki3ae20562017-01-16 20:41:20 +090012398/*[clinic input]
12399str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400
INADA Naoki3ae20562017-01-16 20:41:20 +090012401Return True if the string is a numeric string, False otherwise.
12402
12403A string is numeric if all characters in the string are numeric and there is at
12404least one character in the string.
12405[clinic start generated code]*/
12406
12407static PyObject *
12408unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012409/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 Py_ssize_t i, length;
12412 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012413 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414
12415 if (PyUnicode_READY(self) == -1)
12416 return NULL;
12417 length = PyUnicode_GET_LENGTH(self);
12418 kind = PyUnicode_KIND(self);
12419 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 if (length == 1)
12423 return PyBool_FromLong(
12424 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012425
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012426 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012428 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 for (i = 0; i < length; i++) {
12431 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012432 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012433 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012434 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012435}
12436
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012437Py_ssize_t
12438_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012439{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012441 if (PyUnicode_READY(self) == -1)
12442 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012443
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012444 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012445 if (len == 0) {
12446 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012447 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 }
12449
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012450 int kind = PyUnicode_KIND(self);
12451 const void *data = PyUnicode_DATA(self);
12452 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012453 /* PEP 3131 says that the first character must be in
12454 XID_Start and subsequent characters in XID_Continue,
12455 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012456 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012457 letters, digits, underscore). However, given the current
12458 definition of XID_Start and XID_Continue, it is sufficient
12459 to check just for these, except that _ must be allowed
12460 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012461 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012462 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012463 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012464
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012465 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012466 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012467 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012468 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012469 }
12470 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012471 return i;
12472}
12473
12474int
12475PyUnicode_IsIdentifier(PyObject *self)
12476{
12477 if (PyUnicode_IS_READY(self)) {
12478 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12479 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12480 /* an empty string is not a valid identifier */
12481 return len && i == len;
12482 }
12483 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012484_Py_COMP_DIAG_PUSH
12485_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012486 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012487 if (len == 0) {
12488 /* an empty string is not a valid identifier */
12489 return 0;
12490 }
12491
12492 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012493 Py_UCS4 ch = wstr[i++];
12494#if SIZEOF_WCHAR_T == 2
12495 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12496 && i < len
12497 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12498 {
12499 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12500 i++;
12501 }
12502#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012503 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12504 return 0;
12505 }
12506
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012507 while (i < len) {
12508 ch = wstr[i++];
12509#if SIZEOF_WCHAR_T == 2
12510 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12511 && i < len
12512 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12513 {
12514 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12515 i++;
12516 }
12517#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012518 if (!_PyUnicode_IsXidContinue(ch)) {
12519 return 0;
12520 }
12521 }
12522 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012523_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012524 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012525}
12526
INADA Naoki3ae20562017-01-16 20:41:20 +090012527/*[clinic input]
12528str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012529
INADA Naoki3ae20562017-01-16 20:41:20 +090012530Return True if the string is a valid Python identifier, False otherwise.
12531
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012532Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012533such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012534[clinic start generated code]*/
12535
12536static PyObject *
12537unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012538/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012539{
12540 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12541}
12542
INADA Naoki3ae20562017-01-16 20:41:20 +090012543/*[clinic input]
12544str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012545
INADA Naoki3ae20562017-01-16 20:41:20 +090012546Return True if the string is printable, False otherwise.
12547
12548A string is printable if all of its characters are considered printable in
12549repr() or if it is empty.
12550[clinic start generated code]*/
12551
12552static PyObject *
12553unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012554/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012555{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 Py_ssize_t i, length;
12557 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012558 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559
12560 if (PyUnicode_READY(self) == -1)
12561 return NULL;
12562 length = PyUnicode_GET_LENGTH(self);
12563 kind = PyUnicode_KIND(self);
12564 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012565
12566 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 if (length == 1)
12568 return PyBool_FromLong(
12569 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 for (i = 0; i < length; i++) {
12572 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012573 Py_RETURN_FALSE;
12574 }
12575 }
12576 Py_RETURN_TRUE;
12577}
12578
INADA Naoki3ae20562017-01-16 20:41:20 +090012579/*[clinic input]
12580str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581
INADA Naoki3ae20562017-01-16 20:41:20 +090012582 iterable: object
12583 /
12584
12585Concatenate any number of strings.
12586
Martin Panter91a88662017-01-24 00:30:06 +000012587The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012588The result is returned as a new string.
12589
12590Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12591[clinic start generated code]*/
12592
12593static PyObject *
12594unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012595/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596{
INADA Naoki3ae20562017-01-16 20:41:20 +090012597 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598}
12599
Martin v. Löwis18e16552006-02-15 17:27:45 +000012600static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012601unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 if (PyUnicode_READY(self) == -1)
12604 return -1;
12605 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012606}
12607
INADA Naoki3ae20562017-01-16 20:41:20 +090012608/*[clinic input]
12609str.ljust as unicode_ljust
12610
12611 width: Py_ssize_t
12612 fillchar: Py_UCS4 = ' '
12613 /
12614
12615Return a left-justified string of length width.
12616
12617Padding is done using the specified fill character (default is a space).
12618[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619
12620static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012621unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12622/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012624 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626
Victor Stinnerc4b49542011-12-11 22:44:26 +010012627 if (PyUnicode_GET_LENGTH(self) >= width)
12628 return unicode_result_unchanged(self);
12629
12630 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631}
12632
INADA Naoki3ae20562017-01-16 20:41:20 +090012633/*[clinic input]
12634str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635
INADA Naoki3ae20562017-01-16 20:41:20 +090012636Return a copy of the string converted to lowercase.
12637[clinic start generated code]*/
12638
12639static PyObject *
12640unicode_lower_impl(PyObject *self)
12641/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012643 if (PyUnicode_READY(self) == -1)
12644 return NULL;
12645 if (PyUnicode_IS_ASCII(self))
12646 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012647 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648}
12649
/* Strip modes.  Each value is also an index into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip mode, used when building error messages. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

/* Look up the method name for strip mode `i`. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012658
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012659/* externally visible for str.strip(unicode) */
12660PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012661_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012662{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012663 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 int kind;
12665 Py_ssize_t i, j, len;
12666 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012667 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12670 return NULL;
12671
12672 kind = PyUnicode_KIND(self);
12673 data = PyUnicode_DATA(self);
12674 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012675 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12677 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012678 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012679
Benjamin Peterson14339b62009-01-31 16:36:08 +000012680 i = 0;
12681 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012682 while (i < len) {
12683 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12684 if (!BLOOM(sepmask, ch))
12685 break;
12686 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12687 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012688 i++;
12689 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012690 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012691
Benjamin Peterson14339b62009-01-31 16:36:08 +000012692 j = len;
12693 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012694 j--;
12695 while (j >= i) {
12696 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12697 if (!BLOOM(sepmask, ch))
12698 break;
12699 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12700 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012701 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012702 }
12703
Benjamin Peterson29060642009-01-31 22:14:21 +000012704 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012705 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012706
Victor Stinner7931d9a2011-11-04 00:22:48 +010012707 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708}
12709
12710PyObject*
12711PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12712{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012713 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012715 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716
Victor Stinnerde636f32011-10-01 03:55:54 +020012717 if (PyUnicode_READY(self) == -1)
12718 return NULL;
12719
Victor Stinner684d5fd2012-05-03 02:32:34 +020012720 length = PyUnicode_GET_LENGTH(self);
12721 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012722
Victor Stinner684d5fd2012-05-03 02:32:34 +020012723 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012724 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725
Victor Stinnerde636f32011-10-01 03:55:54 +020012726 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012727 PyErr_SetString(PyExc_IndexError, "string index out of range");
12728 return NULL;
12729 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012730 if (start >= length || end < start)
12731 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012732
Victor Stinner684d5fd2012-05-03 02:32:34 +020012733 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012734 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012735 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012736 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012737 }
12738 else {
12739 kind = PyUnicode_KIND(self);
12740 data = PyUnicode_1BYTE_DATA(self);
12741 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012742 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012743 length);
12744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746
12747static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012748do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 Py_ssize_t len, i, j;
12751
12752 if (PyUnicode_READY(self) == -1)
12753 return NULL;
12754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012756
Victor Stinnercc7af722013-04-09 22:39:24 +020012757 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012758 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012759
12760 i = 0;
12761 if (striptype != RIGHTSTRIP) {
12762 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012763 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012764 if (!_Py_ascii_whitespace[ch])
12765 break;
12766 i++;
12767 }
12768 }
12769
12770 j = len;
12771 if (striptype != LEFTSTRIP) {
12772 j--;
12773 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012774 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012775 if (!_Py_ascii_whitespace[ch])
12776 break;
12777 j--;
12778 }
12779 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012780 }
12781 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012782 else {
12783 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012784 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012785
Victor Stinnercc7af722013-04-09 22:39:24 +020012786 i = 0;
12787 if (striptype != RIGHTSTRIP) {
12788 while (i < len) {
12789 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12790 if (!Py_UNICODE_ISSPACE(ch))
12791 break;
12792 i++;
12793 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012794 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012795
12796 j = len;
12797 if (striptype != LEFTSTRIP) {
12798 j--;
12799 while (j >= i) {
12800 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12801 if (!Py_UNICODE_ISSPACE(ch))
12802 break;
12803 j--;
12804 }
12805 j++;
12806 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012807 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012808
Victor Stinner7931d9a2011-11-04 00:22:48 +010012809 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810}
12811
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012812
12813static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012814do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012815{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012816 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012817 if (PyUnicode_Check(sep))
12818 return _PyUnicode_XStrip(self, striptype, sep);
12819 else {
12820 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 "%s arg must be None or str",
12822 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012823 return NULL;
12824 }
12825 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012826
Benjamin Peterson14339b62009-01-31 16:36:08 +000012827 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012828}
12829
12830
INADA Naoki3ae20562017-01-16 20:41:20 +090012831/*[clinic input]
12832str.strip as unicode_strip
12833
12834 chars: object = None
12835 /
12836
Zachary Ware09895c22019-10-09 16:09:00 -050012837Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012838
12839If chars is given and not None, remove characters in chars instead.
12840[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012841
12842static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012843unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012844/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012845{
INADA Naoki3ae20562017-01-16 20:41:20 +090012846 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012847}
12848
12849
INADA Naoki3ae20562017-01-16 20:41:20 +090012850/*[clinic input]
12851str.lstrip as unicode_lstrip
12852
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012853 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012854 /
12855
12856Return a copy of the string with leading whitespace removed.
12857
12858If chars is given and not None, remove characters in chars instead.
12859[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012860
12861static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012862unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012863/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012864{
INADA Naoki3ae20562017-01-16 20:41:20 +090012865 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012866}
12867
12868
INADA Naoki3ae20562017-01-16 20:41:20 +090012869/*[clinic input]
12870str.rstrip as unicode_rstrip
12871
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012872 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012873 /
12874
12875Return a copy of the string with trailing whitespace removed.
12876
12877If chars is given and not None, remove characters in chars instead.
12878[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012879
12880static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012881unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012882/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012883{
INADA Naoki3ae20562017-01-16 20:41:20 +090012884 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012885}
12886
12887
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012889unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012890{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012891 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893
Serhiy Storchaka05997252013-01-26 12:14:02 +020012894 if (len < 1)
12895 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896
Victor Stinnerc4b49542011-12-11 22:44:26 +010012897 /* no repeat, return original string */
12898 if (len == 1)
12899 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012900
Benjamin Petersonbac79492012-01-14 13:34:47 -050012901 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012902 return NULL;
12903
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012904 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012905 PyErr_SetString(PyExc_OverflowError,
12906 "repeated string is too long");
12907 return NULL;
12908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012909 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012910
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012911 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012912 if (!u)
12913 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012914 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012917 int kind = PyUnicode_KIND(str);
12918 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012919 if (kind == PyUnicode_1BYTE_KIND) {
12920 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012921 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012922 }
12923 else if (kind == PyUnicode_2BYTE_KIND) {
12924 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012925 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012926 ucs2[n] = fill_char;
12927 } else {
12928 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12929 assert(kind == PyUnicode_4BYTE_KIND);
12930 for (n = 0; n < len; ++n)
12931 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012932 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 }
12934 else {
12935 /* number of characters copied this far */
12936 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012937 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012938 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012939 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012940 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012941 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012943 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012944 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012945 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012946 }
12947
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012948 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012949 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012950}
12951
Alexander Belopolsky40018472011-02-26 01:02:56 +000012952PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012953PyUnicode_Replace(PyObject *str,
12954 PyObject *substr,
12955 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012956 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012957{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012958 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12959 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012960 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012961 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012962}
12963
INADA Naoki3ae20562017-01-16 20:41:20 +090012964/*[clinic input]
12965str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012966
INADA Naoki3ae20562017-01-16 20:41:20 +090012967 old: unicode
12968 new: unicode
12969 count: Py_ssize_t = -1
12970 Maximum number of occurrences to replace.
12971 -1 (the default value) means replace all occurrences.
12972 /
12973
12974Return a copy with all occurrences of substring old replaced by new.
12975
12976If the optional argument count is given, only the first count occurrences are
12977replaced.
12978[clinic start generated code]*/
12979
12980static PyObject *
12981unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12982 Py_ssize_t count)
12983/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012985 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012986 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012987 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012988}
12989
sweeneydea81849b2020-04-22 17:05:48 -040012990/*[clinic input]
12991str.removeprefix as unicode_removeprefix
12992
12993 prefix: unicode
12994 /
12995
12996Return a str with the given prefix string removed if present.
12997
12998If the string starts with the prefix string, return string[len(prefix):].
12999Otherwise, return a copy of the original string.
13000[clinic start generated code]*/
13001
13002static PyObject *
13003unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13004/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13005{
13006 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13007 if (match == -1) {
13008 return NULL;
13009 }
13010 if (match) {
13011 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13012 PyUnicode_GET_LENGTH(self));
13013 }
13014 return unicode_result_unchanged(self);
13015}
13016
13017/*[clinic input]
13018str.removesuffix as unicode_removesuffix
13019
13020 suffix: unicode
13021 /
13022
13023Return a str with the given suffix string removed if present.
13024
13025If the string ends with the suffix string and that suffix is not empty,
13026return string[:-len(suffix)]. Otherwise, return a copy of the original
13027string.
13028[clinic start generated code]*/
13029
13030static PyObject *
13031unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13032/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13033{
13034 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13035 if (match == -1) {
13036 return NULL;
13037 }
13038 if (match) {
13039 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13040 - PyUnicode_GET_LENGTH(suffix));
13041 }
13042 return unicode_result_unchanged(self);
13043}
13044
Alexander Belopolsky40018472011-02-26 01:02:56 +000013045static PyObject *
13046unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013047{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013048 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013049 Py_ssize_t isize;
13050 Py_ssize_t osize, squote, dquote, i, o;
13051 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013052 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013053 const void *idata;
13054 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013056 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013057 return NULL;
13058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013059 isize = PyUnicode_GET_LENGTH(unicode);
13060 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013062 /* Compute length of output, quote characters, and
13063 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013064 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013065 max = 127;
13066 squote = dquote = 0;
13067 ikind = PyUnicode_KIND(unicode);
13068 for (i = 0; i < isize; i++) {
13069 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013070 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013071 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013072 case '\'': squote++; break;
13073 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013075 incr = 2;
13076 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 default:
13078 /* Fast-path ASCII */
13079 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013080 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013082 ;
13083 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013084 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013086 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013087 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013088 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013089 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013090 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013091 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013092 if (osize > PY_SSIZE_T_MAX - incr) {
13093 PyErr_SetString(PyExc_OverflowError,
13094 "string is too long to generate repr");
13095 return NULL;
13096 }
13097 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013098 }
13099
13100 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013101 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013102 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013103 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013104 if (dquote)
13105 /* Both squote and dquote present. Use squote,
13106 and escape them */
13107 osize += squote;
13108 else
13109 quote = '"';
13110 }
Victor Stinner55c08782013-04-14 18:45:39 +020013111 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112
13113 repr = PyUnicode_New(osize, max);
13114 if (repr == NULL)
13115 return NULL;
13116 okind = PyUnicode_KIND(repr);
13117 odata = PyUnicode_DATA(repr);
13118
13119 PyUnicode_WRITE(okind, odata, 0, quote);
13120 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013121 if (unchanged) {
13122 _PyUnicode_FastCopyCharacters(repr, 1,
13123 unicode, 0,
13124 isize);
13125 }
13126 else {
13127 for (i = 0, o = 1; i < isize; i++) {
13128 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129
Victor Stinner55c08782013-04-14 18:45:39 +020013130 /* Escape quotes and backslashes */
13131 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013132 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013133 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013134 continue;
13135 }
13136
13137 /* Map special whitespace to '\t', \n', '\r' */
13138 if (ch == '\t') {
13139 PyUnicode_WRITE(okind, odata, o++, '\\');
13140 PyUnicode_WRITE(okind, odata, o++, 't');
13141 }
13142 else if (ch == '\n') {
13143 PyUnicode_WRITE(okind, odata, o++, '\\');
13144 PyUnicode_WRITE(okind, odata, o++, 'n');
13145 }
13146 else if (ch == '\r') {
13147 PyUnicode_WRITE(okind, odata, o++, '\\');
13148 PyUnicode_WRITE(okind, odata, o++, 'r');
13149 }
13150
13151 /* Map non-printable US ASCII to '\xhh' */
13152 else if (ch < ' ' || ch == 0x7F) {
13153 PyUnicode_WRITE(okind, odata, o++, '\\');
13154 PyUnicode_WRITE(okind, odata, o++, 'x');
13155 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13156 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13157 }
13158
13159 /* Copy ASCII characters as-is */
13160 else if (ch < 0x7F) {
13161 PyUnicode_WRITE(okind, odata, o++, ch);
13162 }
13163
13164 /* Non-ASCII characters */
13165 else {
13166 /* Map Unicode whitespace and control characters
13167 (categories Z* and C* except ASCII space)
13168 */
13169 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13170 PyUnicode_WRITE(okind, odata, o++, '\\');
13171 /* Map 8-bit characters to '\xhh' */
13172 if (ch <= 0xff) {
13173 PyUnicode_WRITE(okind, odata, o++, 'x');
13174 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13175 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13176 }
13177 /* Map 16-bit characters to '\uxxxx' */
13178 else if (ch <= 0xffff) {
13179 PyUnicode_WRITE(okind, odata, o++, 'u');
13180 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13181 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13182 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13183 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13184 }
13185 /* Map 21-bit characters to '\U00xxxxxx' */
13186 else {
13187 PyUnicode_WRITE(okind, odata, o++, 'U');
13188 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13189 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13190 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13191 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13192 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13193 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13194 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13195 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13196 }
13197 }
13198 /* Copy characters as-is */
13199 else {
13200 PyUnicode_WRITE(okind, odata, o++, ch);
13201 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013202 }
13203 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013204 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013206 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013207 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013208}
13209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013210PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013211 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013212\n\
13213Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013214such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215arguments start and end are interpreted as in slice notation.\n\
13216\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013217Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013218
13219static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013220unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013221{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013222 /* initialize variables to prevent gcc warning */
13223 PyObject *substring = NULL;
13224 Py_ssize_t start = 0;
13225 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013226 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013227
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013228 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013229 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013230
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013231 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013234 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 if (result == -2)
13237 return NULL;
13238
Christian Heimes217cfd12007-12-02 14:31:20 +000013239 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013240}
13241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013242PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013243 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013245Return the highest index in S where substring sub is found,\n\
13246such that sub is contained within S[start:end]. Optional\n\
13247arguments start and end are interpreted as in slice notation.\n\
13248\n\
13249Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250
13251static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013252unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013254 /* initialize variables to prevent gcc warning */
13255 PyObject *substring = NULL;
13256 Py_ssize_t start = 0;
13257 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013258 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013259
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013260 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013263 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013265
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013266 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013268 if (result == -2)
13269 return NULL;
13270
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271 if (result < 0) {
13272 PyErr_SetString(PyExc_ValueError, "substring not found");
13273 return NULL;
13274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013275
Christian Heimes217cfd12007-12-02 14:31:20 +000013276 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277}
13278
INADA Naoki3ae20562017-01-16 20:41:20 +090013279/*[clinic input]
13280str.rjust as unicode_rjust
13281
13282 width: Py_ssize_t
13283 fillchar: Py_UCS4 = ' '
13284 /
13285
13286Return a right-justified string of length width.
13287
13288Padding is done using the specified fill character (default is a space).
13289[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013290
13291static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013292unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13293/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013294{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013295 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296 return NULL;
13297
Victor Stinnerc4b49542011-12-11 22:44:26 +010013298 if (PyUnicode_GET_LENGTH(self) >= width)
13299 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300
Victor Stinnerc4b49542011-12-11 22:44:26 +010013301 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013302}
13303
Alexander Belopolsky40018472011-02-26 01:02:56 +000013304PyObject *
13305PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013306{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013307 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013310 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311}
13312
INADA Naoki3ae20562017-01-16 20:41:20 +090013313/*[clinic input]
13314str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013315
INADA Naoki3ae20562017-01-16 20:41:20 +090013316 sep: object = None
13317 The delimiter according which to split the string.
13318 None (the default value) means split according to any whitespace,
13319 and discard empty strings from the result.
13320 maxsplit: Py_ssize_t = -1
13321 Maximum number of splits to do.
13322 -1 (the default value) means no limit.
13323
13324Return a list of the words in the string, using sep as the delimiter string.
13325[clinic start generated code]*/
13326
13327static PyObject *
13328unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13329/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013330{
INADA Naoki3ae20562017-01-16 20:41:20 +090013331 if (sep == Py_None)
13332 return split(self, NULL, maxsplit);
13333 if (PyUnicode_Check(sep))
13334 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013335
Victor Stinner998b8062018-09-12 00:23:25 +020013336 PyErr_Format(PyExc_TypeError,
13337 "must be str or None, not %.100s",
13338 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013340}
13341
Thomas Wouters477c8d52006-05-27 19:21:47 +000013342PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013343PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013344{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013345 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013346 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013347 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013348 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013349
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013350 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013351 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013352
Victor Stinner14f8f022011-10-05 20:58:25 +020013353 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013354 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355 len1 = PyUnicode_GET_LENGTH(str_obj);
13356 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013357 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013358 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013359 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013360 }
13361 buf1 = PyUnicode_DATA(str_obj);
13362 buf2 = PyUnicode_DATA(sep_obj);
13363 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013364 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013365 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013366 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013368
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013369 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013370 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013371 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13372 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13373 else
13374 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013375 break;
13376 case PyUnicode_2BYTE_KIND:
13377 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13378 break;
13379 case PyUnicode_4BYTE_KIND:
13380 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13381 break;
13382 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013383 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013384 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013385
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013386 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013387 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013388 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013389
13390 return out;
13391}
13392
13393
13394PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013395PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013396{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013397 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013398 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013399 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013400 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013401
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013402 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013403 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013404
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013405 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013406 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013407 len1 = PyUnicode_GET_LENGTH(str_obj);
13408 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013409 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013410 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013411 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013412 }
13413 buf1 = PyUnicode_DATA(str_obj);
13414 buf2 = PyUnicode_DATA(sep_obj);
13415 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013416 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013417 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013418 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013420
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013421 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013422 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013423 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13424 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13425 else
13426 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013427 break;
13428 case PyUnicode_2BYTE_KIND:
13429 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13430 break;
13431 case PyUnicode_4BYTE_KIND:
13432 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13433 break;
13434 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013435 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013436 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013437
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013438 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013439 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013440 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013441
13442 return out;
13443}
13444
INADA Naoki3ae20562017-01-16 20:41:20 +090013445/*[clinic input]
13446str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013447
INADA Naoki3ae20562017-01-16 20:41:20 +090013448 sep: object
13449 /
13450
13451Partition the string into three parts using the given separator.
13452
13453This will search for the separator in the string. If the separator is found,
13454returns a 3-tuple containing the part before the separator, the separator
13455itself, and the part after it.
13456
13457If the separator is not found, returns a 3-tuple containing the original string
13458and two empty strings.
13459[clinic start generated code]*/
13460
13461static PyObject *
13462unicode_partition(PyObject *self, PyObject *sep)
13463/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013464{
INADA Naoki3ae20562017-01-16 20:41:20 +090013465 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013466}
13467
INADA Naoki3ae20562017-01-16 20:41:20 +090013468/*[clinic input]
13469str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013470
INADA Naoki3ae20562017-01-16 20:41:20 +090013471Partition the string into three parts using the given separator.
13472
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013473This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013474the separator is found, returns a 3-tuple containing the part before the
13475separator, the separator itself, and the part after it.
13476
13477If the separator is not found, returns a 3-tuple containing two empty strings
13478and the original string.
13479[clinic start generated code]*/
13480
13481static PyObject *
13482unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013483/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013484{
INADA Naoki3ae20562017-01-16 20:41:20 +090013485 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013486}
13487
Alexander Belopolsky40018472011-02-26 01:02:56 +000013488PyObject *
13489PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013490{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013491 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013492 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013493
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013494 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013495}
13496
INADA Naoki3ae20562017-01-16 20:41:20 +090013497/*[clinic input]
13498str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013499
INADA Naoki3ae20562017-01-16 20:41:20 +090013500Return a list of the words in the string, using sep as the delimiter string.
13501
13502Splits are done starting at the end of the string and working to the front.
13503[clinic start generated code]*/
13504
13505static PyObject *
13506unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13507/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013508{
INADA Naoki3ae20562017-01-16 20:41:20 +090013509 if (sep == Py_None)
13510 return rsplit(self, NULL, maxsplit);
13511 if (PyUnicode_Check(sep))
13512 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013513
Victor Stinner998b8062018-09-12 00:23:25 +020013514 PyErr_Format(PyExc_TypeError,
13515 "must be str or None, not %.100s",
13516 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013517 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013518}
13519
INADA Naoki3ae20562017-01-16 20:41:20 +090013520/*[clinic input]
13521str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013522
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013523 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013524
13525Return a list of the lines in the string, breaking at line boundaries.
13526
13527Line breaks are not included in the resulting list unless keepends is given and
13528true.
13529[clinic start generated code]*/
13530
13531static PyObject *
13532unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013533/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013534{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013535 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013536}
13537
13538static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013539PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013540{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013541 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013542}
13543
INADA Naoki3ae20562017-01-16 20:41:20 +090013544/*[clinic input]
13545str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013546
INADA Naoki3ae20562017-01-16 20:41:20 +090013547Convert uppercase characters to lowercase and lowercase characters to uppercase.
13548[clinic start generated code]*/
13549
13550static PyObject *
13551unicode_swapcase_impl(PyObject *self)
13552/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013553{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013554 if (PyUnicode_READY(self) == -1)
13555 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013556 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013557}
13558
Larry Hastings61272b72014-01-07 12:41:53 -080013559/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013560
Larry Hastings31826802013-10-19 00:09:25 -070013561@staticmethod
13562str.maketrans as unicode_maketrans
13563
13564 x: object
13565
13566 y: unicode=NULL
13567
13568 z: unicode=NULL
13569
13570 /
13571
13572Return a translation table usable for str.translate().
13573
13574If there is only one argument, it must be a dictionary mapping Unicode
13575ordinals (integers) or characters to Unicode ordinals, strings or None.
13576Character keys will be then converted to ordinals.
13577If there are two arguments, they must be strings of equal length, and
13578in the resulting dictionary, each character in x will be mapped to the
13579character at the same position in y. If there is a third argument, it
13580must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013581[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013582
Larry Hastings31826802013-10-19 00:09:25 -070013583static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013584unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013585/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013586{
Georg Brandlceee0772007-11-27 23:48:05 +000013587 PyObject *new = NULL, *key, *value;
13588 Py_ssize_t i = 0;
13589 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013590
Georg Brandlceee0772007-11-27 23:48:05 +000013591 new = PyDict_New();
13592 if (!new)
13593 return NULL;
13594 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013595 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013596 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013597
Georg Brandlceee0772007-11-27 23:48:05 +000013598 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013599 if (!PyUnicode_Check(x)) {
13600 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13601 "be a string if there is a second argument");
13602 goto err;
13603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013604 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013605 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13606 "arguments must have equal length");
13607 goto err;
13608 }
13609 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013610 x_kind = PyUnicode_KIND(x);
13611 y_kind = PyUnicode_KIND(y);
13612 x_data = PyUnicode_DATA(x);
13613 y_data = PyUnicode_DATA(y);
13614 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13615 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013616 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013617 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013618 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013619 if (!value) {
13620 Py_DECREF(key);
13621 goto err;
13622 }
Georg Brandlceee0772007-11-27 23:48:05 +000013623 res = PyDict_SetItem(new, key, value);
13624 Py_DECREF(key);
13625 Py_DECREF(value);
13626 if (res < 0)
13627 goto err;
13628 }
13629 /* create entries for deleting chars in z */
13630 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 z_kind = PyUnicode_KIND(z);
13632 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013633 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013634 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013635 if (!key)
13636 goto err;
13637 res = PyDict_SetItem(new, key, Py_None);
13638 Py_DECREF(key);
13639 if (res < 0)
13640 goto err;
13641 }
13642 }
13643 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013644 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013645 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013646
Georg Brandlceee0772007-11-27 23:48:05 +000013647 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013648 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013649 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13650 "to maketrans it must be a dict");
13651 goto err;
13652 }
13653 /* copy entries into the new dict, converting string keys to int keys */
13654 while (PyDict_Next(x, &i, &key, &value)) {
13655 if (PyUnicode_Check(key)) {
13656 /* convert string keys to integer keys */
13657 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013658 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013659 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13660 "table must be of length 1");
13661 goto err;
13662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013663 kind = PyUnicode_KIND(key);
13664 data = PyUnicode_DATA(key);
13665 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013666 if (!newkey)
13667 goto err;
13668 res = PyDict_SetItem(new, newkey, value);
13669 Py_DECREF(newkey);
13670 if (res < 0)
13671 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013672 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013673 /* just keep integer keys */
13674 if (PyDict_SetItem(new, key, value) < 0)
13675 goto err;
13676 } else {
13677 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13678 "be strings or integers");
13679 goto err;
13680 }
13681 }
13682 }
13683 return new;
13684 err:
13685 Py_DECREF(new);
13686 return NULL;
13687}
13688
INADA Naoki3ae20562017-01-16 20:41:20 +090013689/*[clinic input]
13690str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013691
INADA Naoki3ae20562017-01-16 20:41:20 +090013692 table: object
13693 Translation table, which must be a mapping of Unicode ordinals to
13694 Unicode ordinals, strings, or None.
13695 /
13696
13697Replace each character in the string using the given translation table.
13698
13699The table must implement lookup/indexing via __getitem__, for instance a
13700dictionary or list. If this operation raises LookupError, the character is
13701left untouched. Characters mapped to None are deleted.
13702[clinic start generated code]*/
13703
13704static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013705unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013706/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013707{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013708 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013709}
13710
INADA Naoki3ae20562017-01-16 20:41:20 +090013711/*[clinic input]
13712str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013713
INADA Naoki3ae20562017-01-16 20:41:20 +090013714Return a copy of the string converted to uppercase.
13715[clinic start generated code]*/
13716
13717static PyObject *
13718unicode_upper_impl(PyObject *self)
13719/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013720{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013721 if (PyUnicode_READY(self) == -1)
13722 return NULL;
13723 if (PyUnicode_IS_ASCII(self))
13724 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013725 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013726}
13727
INADA Naoki3ae20562017-01-16 20:41:20 +090013728/*[clinic input]
13729str.zfill as unicode_zfill
13730
13731 width: Py_ssize_t
13732 /
13733
13734Pad a numeric string with zeros on the left, to fill a field of the given width.
13735
13736The string is never truncated.
13737[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013738
13739static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013740unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013741/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013742{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013743 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013744 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013745 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013746 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013747 Py_UCS4 chr;
13748
Benjamin Petersonbac79492012-01-14 13:34:47 -050013749 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013750 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013751
Victor Stinnerc4b49542011-12-11 22:44:26 +010013752 if (PyUnicode_GET_LENGTH(self) >= width)
13753 return unicode_result_unchanged(self);
13754
13755 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013756
13757 u = pad(self, fill, 0, '0');
13758
Walter Dörwald068325e2002-04-15 13:36:47 +000013759 if (u == NULL)
13760 return NULL;
13761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013762 kind = PyUnicode_KIND(u);
13763 data = PyUnicode_DATA(u);
13764 chr = PyUnicode_READ(kind, data, fill);
13765
13766 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013767 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013768 PyUnicode_WRITE(kind, data, 0, chr);
13769 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013770 }
13771
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013772 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013773 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013774}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013775
#if 0
/* Compiled out by the enclosing "#if 0".  Forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013784PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013785 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013786\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013787Return True if S starts with the specified prefix, False otherwise.\n\
13788With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013789With optional end, stop comparing S at that position.\n\
13790prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013791
13792static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013793unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013794 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013795{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013796 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013797 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013798 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013799 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013800 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013801
Jesus Ceaac451502011-04-20 17:09:23 +020013802 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013803 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013804 if (PyTuple_Check(subobj)) {
13805 Py_ssize_t i;
13806 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013807 substring = PyTuple_GET_ITEM(subobj, i);
13808 if (!PyUnicode_Check(substring)) {
13809 PyErr_Format(PyExc_TypeError,
13810 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013811 "not %.100s",
13812 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013813 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013814 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013815 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013816 if (result == -1)
13817 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013818 if (result) {
13819 Py_RETURN_TRUE;
13820 }
13821 }
13822 /* nothing matched */
13823 Py_RETURN_FALSE;
13824 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013825 if (!PyUnicode_Check(subobj)) {
13826 PyErr_Format(PyExc_TypeError,
13827 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013828 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013829 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013830 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013831 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013832 if (result == -1)
13833 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013834 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013835}
13836
13837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013838PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013839 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013840\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013841Return True if S ends with the specified suffix, False otherwise.\n\
13842With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013843With optional end, stop comparing S at that position.\n\
13844suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013845
13846static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013847unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013848 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013849{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013850 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013851 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013852 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013853 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013854 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013855
Jesus Ceaac451502011-04-20 17:09:23 +020013856 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013857 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013858 if (PyTuple_Check(subobj)) {
13859 Py_ssize_t i;
13860 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013861 substring = PyTuple_GET_ITEM(subobj, i);
13862 if (!PyUnicode_Check(substring)) {
13863 PyErr_Format(PyExc_TypeError,
13864 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013865 "not %.100s",
13866 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013867 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013868 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013869 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013870 if (result == -1)
13871 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013872 if (result) {
13873 Py_RETURN_TRUE;
13874 }
13875 }
13876 Py_RETURN_FALSE;
13877 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013878 if (!PyUnicode_Check(subobj)) {
13879 PyErr_Format(PyExc_TypeError,
13880 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013881 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013882 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013883 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013884 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013885 if (result == -1)
13886 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013887 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013888}
13889
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013890static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013891_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013892{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013893 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13894 writer->data = PyUnicode_DATA(writer->buffer);
13895
13896 if (!writer->readonly) {
13897 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013898 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013899 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013900 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013901 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13902 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13903 writer->kind = PyUnicode_WCHAR_KIND;
13904 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13905
Victor Stinner8f674cc2013-04-17 23:02:17 +020013906 /* Copy-on-write mode: set buffer size to 0 so
13907 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13908 * next write. */
13909 writer->size = 0;
13910 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013911}
13912
Victor Stinnerd3f08822012-05-29 12:57:52 +020013913void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013914_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013915{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013916 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013917
13918 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013919 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013920
13921 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13922 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13923 writer->kind = PyUnicode_WCHAR_KIND;
13924 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013925}
13926
Inada Naoki770847a2019-06-24 12:30:24 +090013927// Initialize _PyUnicodeWriter with initial buffer
13928static inline void
13929_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13930{
13931 memset(writer, 0, sizeof(*writer));
13932 writer->buffer = buffer;
13933 _PyUnicodeWriter_Update(writer);
13934 writer->min_length = writer->size;
13935}
13936
Victor Stinnerd3f08822012-05-29 12:57:52 +020013937int
13938_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13939 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013940{
13941 Py_ssize_t newlen;
13942 PyObject *newbuffer;
13943
Victor Stinner2740e462016-09-06 16:58:36 -070013944 assert(maxchar <= MAX_UNICODE);
13945
Victor Stinnerca9381e2015-09-22 00:58:32 +020013946 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013947 assert((maxchar > writer->maxchar && length >= 0)
13948 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013949
Victor Stinner202fdca2012-05-07 12:47:02 +020013950 if (length > PY_SSIZE_T_MAX - writer->pos) {
13951 PyErr_NoMemory();
13952 return -1;
13953 }
13954 newlen = writer->pos + length;
13955
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013956 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013957
Victor Stinnerd3f08822012-05-29 12:57:52 +020013958 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013959 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013960 if (writer->overallocate
13961 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13962 /* overallocate to limit the number of realloc() */
13963 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013964 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013965 if (newlen < writer->min_length)
13966 newlen = writer->min_length;
13967
Victor Stinnerd3f08822012-05-29 12:57:52 +020013968 writer->buffer = PyUnicode_New(newlen, maxchar);
13969 if (writer->buffer == NULL)
13970 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013971 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013972 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013973 if (writer->overallocate
13974 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13975 /* overallocate to limit the number of realloc() */
13976 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013977 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013978 if (newlen < writer->min_length)
13979 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013980
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013981 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013982 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013983 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013984 newbuffer = PyUnicode_New(newlen, maxchar);
13985 if (newbuffer == NULL)
13986 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013987 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13988 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013989 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013990 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013991 }
13992 else {
13993 newbuffer = resize_compact(writer->buffer, newlen);
13994 if (newbuffer == NULL)
13995 return -1;
13996 }
13997 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013998 }
13999 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014000 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014001 newbuffer = PyUnicode_New(writer->size, maxchar);
14002 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014003 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014004 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14005 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014006 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014007 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014008 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014009 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014010
14011#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014012}
14013
Victor Stinnerca9381e2015-09-22 00:58:32 +020014014int
14015_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14016 enum PyUnicode_Kind kind)
14017{
14018 Py_UCS4 maxchar;
14019
14020 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14021 assert(writer->kind < kind);
14022
14023 switch (kind)
14024 {
14025 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14026 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14027 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
14028 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014029 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014030 }
14031
14032 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14033}
14034
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014035static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014036_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014037{
Victor Stinner2740e462016-09-06 16:58:36 -070014038 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014039 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14040 return -1;
14041 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14042 writer->pos++;
14043 return 0;
14044}
14045
14046int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014047_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14048{
14049 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14050}
14051
14052int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014053_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14054{
14055 Py_UCS4 maxchar;
14056 Py_ssize_t len;
14057
14058 if (PyUnicode_READY(str) == -1)
14059 return -1;
14060 len = PyUnicode_GET_LENGTH(str);
14061 if (len == 0)
14062 return 0;
14063 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14064 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014065 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014066 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014067 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014068 Py_INCREF(str);
14069 writer->buffer = str;
14070 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014071 writer->pos += len;
14072 return 0;
14073 }
14074 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14075 return -1;
14076 }
14077 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14078 str, 0, len);
14079 writer->pos += len;
14080 return 0;
14081}
14082
Victor Stinnere215d962012-10-06 23:03:36 +020014083int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014084_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14085 Py_ssize_t start, Py_ssize_t end)
14086{
14087 Py_UCS4 maxchar;
14088 Py_ssize_t len;
14089
14090 if (PyUnicode_READY(str) == -1)
14091 return -1;
14092
14093 assert(0 <= start);
14094 assert(end <= PyUnicode_GET_LENGTH(str));
14095 assert(start <= end);
14096
14097 if (end == 0)
14098 return 0;
14099
14100 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14101 return _PyUnicodeWriter_WriteStr(writer, str);
14102
14103 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14104 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14105 else
14106 maxchar = writer->maxchar;
14107 len = end - start;
14108
14109 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14110 return -1;
14111
14112 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14113 str, start, len);
14114 writer->pos += len;
14115 return 0;
14116}
14117
14118int
Victor Stinner4a587072013-11-19 12:54:53 +010014119_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14120 const char *ascii, Py_ssize_t len)
14121{
14122 if (len == -1)
14123 len = strlen(ascii);
14124
Andy Lestere6be9b52020-02-11 20:28:35 -060014125 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014126
14127 if (writer->buffer == NULL && !writer->overallocate) {
14128 PyObject *str;
14129
14130 str = _PyUnicode_FromASCII(ascii, len);
14131 if (str == NULL)
14132 return -1;
14133
14134 writer->readonly = 1;
14135 writer->buffer = str;
14136 _PyUnicodeWriter_Update(writer);
14137 writer->pos += len;
14138 return 0;
14139 }
14140
14141 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14142 return -1;
14143
14144 switch (writer->kind)
14145 {
14146 case PyUnicode_1BYTE_KIND:
14147 {
14148 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14149 Py_UCS1 *data = writer->data;
14150
Christian Heimesf051e432016-09-13 20:22:02 +020014151 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014152 break;
14153 }
14154 case PyUnicode_2BYTE_KIND:
14155 {
14156 _PyUnicode_CONVERT_BYTES(
14157 Py_UCS1, Py_UCS2,
14158 ascii, ascii + len,
14159 (Py_UCS2 *)writer->data + writer->pos);
14160 break;
14161 }
14162 case PyUnicode_4BYTE_KIND:
14163 {
14164 _PyUnicode_CONVERT_BYTES(
14165 Py_UCS1, Py_UCS4,
14166 ascii, ascii + len,
14167 (Py_UCS4 *)writer->data + writer->pos);
14168 break;
14169 }
14170 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014171 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014172 }
14173
14174 writer->pos += len;
14175 return 0;
14176}
14177
14178int
14179_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14180 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014181{
14182 Py_UCS4 maxchar;
14183
Andy Lestere6be9b52020-02-11 20:28:35 -060014184 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014185 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14186 return -1;
14187 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14188 writer->pos += len;
14189 return 0;
14190}
14191
Victor Stinnerd3f08822012-05-29 12:57:52 +020014192PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014193_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014194{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014195 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014196
Victor Stinnerd3f08822012-05-29 12:57:52 +020014197 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014198 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014199 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014200 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014201
14202 str = writer->buffer;
14203 writer->buffer = NULL;
14204
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014205 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014206 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14207 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014208 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014209
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014210 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14211 PyObject *str2;
14212 str2 = resize_compact(str, writer->pos);
14213 if (str2 == NULL) {
14214 Py_DECREF(str);
14215 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014216 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014217 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014218 }
14219
Victor Stinner15a0bd32013-07-08 22:29:55 +020014220 assert(_PyUnicode_CheckConsistency(str, 1));
14221 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014222}
14223
Victor Stinnerd3f08822012-05-29 12:57:52 +020014224void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014225_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014226{
14227 Py_CLEAR(writer->buffer);
14228}
14229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014230#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014231
14232PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014233 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014234\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014235Return a formatted version of S, using substitutions from args and kwargs.\n\
14236The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014237
Eric Smith27bbca62010-11-04 17:06:58 +000014238PyDoc_STRVAR(format_map__doc__,
14239 "S.format_map(mapping) -> str\n\
14240\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014241Return a formatted version of S, using substitutions from mapping.\n\
14242The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014243
INADA Naoki3ae20562017-01-16 20:41:20 +090014244/*[clinic input]
14245str.__format__ as unicode___format__
14246
14247 format_spec: unicode
14248 /
14249
14250Return a formatted version of the string as described by format_spec.
14251[clinic start generated code]*/
14252
Eric Smith4a7d76d2008-05-30 18:10:19 +000014253static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014254unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014255/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014256{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014257 _PyUnicodeWriter writer;
14258 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014259
Victor Stinnerd3f08822012-05-29 12:57:52 +020014260 if (PyUnicode_READY(self) == -1)
14261 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014262 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014263 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14264 self, format_spec, 0,
14265 PyUnicode_GET_LENGTH(format_spec));
14266 if (ret == -1) {
14267 _PyUnicodeWriter_Dealloc(&writer);
14268 return NULL;
14269 }
14270 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014271}
14272
INADA Naoki3ae20562017-01-16 20:41:20 +090014273/*[clinic input]
14274str.__sizeof__ as unicode_sizeof
14275
14276Return the size of the string in memory, in bytes.
14277[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014278
14279static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014280unicode_sizeof_impl(PyObject *self)
14281/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014282{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014283 Py_ssize_t size;
14284
14285 /* If it's a compact object, account for base structure +
14286 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014287 if (PyUnicode_IS_COMPACT_ASCII(self))
14288 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14289 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014290 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014291 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014292 else {
14293 /* If it is a two-block object, account for base object, and
14294 for character block if present. */
14295 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014296 if (_PyUnicode_DATA_ANY(self))
14297 size += (PyUnicode_GET_LENGTH(self) + 1) *
14298 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014299 }
14300 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014301 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014302 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14303 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14304 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14305 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014306
14307 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014308}
14309
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014310static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014311unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014312{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014313 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014314 if (!copy)
14315 return NULL;
14316 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014317}
14318
Guido van Rossumd57fd912000-03-10 22:53:23 +000014319static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014320 UNICODE_ENCODE_METHODDEF
14321 UNICODE_REPLACE_METHODDEF
14322 UNICODE_SPLIT_METHODDEF
14323 UNICODE_RSPLIT_METHODDEF
14324 UNICODE_JOIN_METHODDEF
14325 UNICODE_CAPITALIZE_METHODDEF
14326 UNICODE_CASEFOLD_METHODDEF
14327 UNICODE_TITLE_METHODDEF
14328 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014329 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014330 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014331 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014332 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014333 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014334 UNICODE_LJUST_METHODDEF
14335 UNICODE_LOWER_METHODDEF
14336 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014337 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14338 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014339 UNICODE_RJUST_METHODDEF
14340 UNICODE_RSTRIP_METHODDEF
14341 UNICODE_RPARTITION_METHODDEF
14342 UNICODE_SPLITLINES_METHODDEF
14343 UNICODE_STRIP_METHODDEF
14344 UNICODE_SWAPCASE_METHODDEF
14345 UNICODE_TRANSLATE_METHODDEF
14346 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014347 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14348 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014349 UNICODE_REMOVEPREFIX_METHODDEF
14350 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014351 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014352 UNICODE_ISLOWER_METHODDEF
14353 UNICODE_ISUPPER_METHODDEF
14354 UNICODE_ISTITLE_METHODDEF
14355 UNICODE_ISSPACE_METHODDEF
14356 UNICODE_ISDECIMAL_METHODDEF
14357 UNICODE_ISDIGIT_METHODDEF
14358 UNICODE_ISNUMERIC_METHODDEF
14359 UNICODE_ISALPHA_METHODDEF
14360 UNICODE_ISALNUM_METHODDEF
14361 UNICODE_ISIDENTIFIER_METHODDEF
14362 UNICODE_ISPRINTABLE_METHODDEF
14363 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014364 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014365 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014366 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014367 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014368 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014369#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014370 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014371 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014372#endif
14373
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014374 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014375 {NULL, NULL}
14376};
14377
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014378static PyObject *
14379unicode_mod(PyObject *v, PyObject *w)
14380{
Brian Curtindfc80e32011-08-10 20:28:54 -050014381 if (!PyUnicode_Check(v))
14382 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014383 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014384}
14385
14386static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014387 0, /*nb_add*/
14388 0, /*nb_subtract*/
14389 0, /*nb_multiply*/
14390 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014391};
14392
Guido van Rossumd57fd912000-03-10 22:53:23 +000014393static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014394 (lenfunc) unicode_length, /* sq_length */
14395 PyUnicode_Concat, /* sq_concat */
14396 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14397 (ssizeargfunc) unicode_getitem, /* sq_item */
14398 0, /* sq_slice */
14399 0, /* sq_ass_item */
14400 0, /* sq_ass_slice */
14401 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014402};
14403
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014404static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014405unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014406{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014407 if (PyUnicode_READY(self) == -1)
14408 return NULL;
14409
Victor Stinnera15e2602020-04-08 02:01:56 +020014410 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014411 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014412 if (i == -1 && PyErr_Occurred())
14413 return NULL;
14414 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014415 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014416 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014417 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014418 Py_ssize_t start, stop, step, slicelength, i;
14419 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014420 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014421 const void *src_data;
14422 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014423 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014424 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014425
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014426 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014427 return NULL;
14428 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014429 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14430 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014431
14432 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014433 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014434 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014435 slicelength == PyUnicode_GET_LENGTH(self)) {
14436 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014437 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014438 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014439 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014440 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014441 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014442 src_kind = PyUnicode_KIND(self);
14443 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014444 if (!PyUnicode_IS_ASCII(self)) {
14445 kind_limit = kind_maxchar_limit(src_kind);
14446 max_char = 0;
14447 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14448 ch = PyUnicode_READ(src_kind, src_data, cur);
14449 if (ch > max_char) {
14450 max_char = ch;
14451 if (max_char >= kind_limit)
14452 break;
14453 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014454 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014455 }
Victor Stinner55c99112011-10-13 01:17:06 +020014456 else
14457 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014458 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014459 if (result == NULL)
14460 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014461 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014462 dest_data = PyUnicode_DATA(result);
14463
14464 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014465 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14466 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014467 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014468 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014469 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014470 } else {
14471 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14472 return NULL;
14473 }
14474}
14475
14476static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014477 (lenfunc)unicode_length, /* mp_length */
14478 (binaryfunc)unicode_subscript, /* mp_subscript */
14479 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014480};
14481
Guido van Rossumd57fd912000-03-10 22:53:23 +000014482
Guido van Rossumd57fd912000-03-10 22:53:23 +000014483/* Helpers for PyUnicode_Format() */
14484
Victor Stinnera47082312012-10-04 02:19:54 +020014485struct unicode_formatter_t {
14486 PyObject *args;
14487 int args_owned;
14488 Py_ssize_t arglen, argidx;
14489 PyObject *dict;
14490
14491 enum PyUnicode_Kind fmtkind;
14492 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014493 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014494 PyObject *fmtstr;
14495
14496 _PyUnicodeWriter writer;
14497};
14498
14499struct unicode_format_arg_t {
14500 Py_UCS4 ch;
14501 int flags;
14502 Py_ssize_t width;
14503 int prec;
14504 int sign;
14505};
14506
Guido van Rossumd57fd912000-03-10 22:53:23 +000014507static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014508unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014509{
Victor Stinnera47082312012-10-04 02:19:54 +020014510 Py_ssize_t argidx = ctx->argidx;
14511
14512 if (argidx < ctx->arglen) {
14513 ctx->argidx++;
14514 if (ctx->arglen < 0)
14515 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014516 else
Victor Stinnera47082312012-10-04 02:19:54 +020014517 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014518 }
14519 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014520 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014521 return NULL;
14522}
14523
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014524/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014525
Victor Stinnera47082312012-10-04 02:19:54 +020014526/* Format a float into the writer if the writer is not NULL, or into *p_output
14527 otherwise.
14528
14529 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014530static int
Victor Stinnera47082312012-10-04 02:19:54 +020014531formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14532 PyObject **p_output,
14533 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014534{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014535 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014536 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014537 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014538 int prec;
14539 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014540
Guido van Rossumd57fd912000-03-10 22:53:23 +000014541 x = PyFloat_AsDouble(v);
14542 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014543 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014544
Victor Stinnera47082312012-10-04 02:19:54 +020014545 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014546 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014547 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014548
Victor Stinnera47082312012-10-04 02:19:54 +020014549 if (arg->flags & F_ALT)
14550 dtoa_flags = Py_DTSF_ALT;
14551 else
14552 dtoa_flags = 0;
14553 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014554 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014555 return -1;
14556 len = strlen(p);
14557 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014558 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014559 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014560 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014561 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014562 }
14563 else
14564 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014565 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014566 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014567}
14568
Victor Stinnerd0880d52012-04-27 23:40:13 +020014569/* formatlong() emulates the format codes d, u, o, x and X, and
14570 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14571 * Python's regular ints.
14572 * Return value: a new PyUnicodeObject*, or NULL if error.
14573 * The output string is of the form
14574 * "-"? ("0x" | "0X")? digit+
14575 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14576 * set in flags. The case of hex digits will be correct,
14577 * There will be at least prec digits, zero-filled on the left if
14578 * necessary to get that many.
14579 * val object to be converted
14580 * flags bitmask of format flags; only F_ALT is looked at
14581 * prec minimum number of digits; 0-fill on left if needed
14582 * type a character in [duoxX]; u acts the same as d
14583 *
14584 * CAUTION: o, x and X conversions on regular ints can never
14585 * produce a '-' sign, but can for Python's unbounded ints.
14586 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014587PyObject *
14588_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014589{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014590 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014591 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014592 Py_ssize_t i;
14593 int sign; /* 1 if '-', else 0 */
14594 int len; /* number of characters */
14595 Py_ssize_t llen;
14596 int numdigits; /* len == numnondigits + numdigits */
14597 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014598
Victor Stinnerd0880d52012-04-27 23:40:13 +020014599 /* Avoid exceeding SSIZE_T_MAX */
14600 if (prec > INT_MAX-3) {
14601 PyErr_SetString(PyExc_OverflowError,
14602 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014603 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014604 }
14605
14606 assert(PyLong_Check(val));
14607
14608 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014609 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014610 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014611 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014612 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014613 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014614 /* int and int subclasses should print numerically when a numeric */
14615 /* format code is used (see issue18780) */
14616 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014617 break;
14618 case 'o':
14619 numnondigits = 2;
14620 result = PyNumber_ToBase(val, 8);
14621 break;
14622 case 'x':
14623 case 'X':
14624 numnondigits = 2;
14625 result = PyNumber_ToBase(val, 16);
14626 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014627 }
14628 if (!result)
14629 return NULL;
14630
14631 assert(unicode_modifiable(result));
14632 assert(PyUnicode_IS_READY(result));
14633 assert(PyUnicode_IS_ASCII(result));
14634
14635 /* To modify the string in-place, there can only be one reference. */
14636 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014637 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014638 PyErr_BadInternalCall();
14639 return NULL;
14640 }
14641 buf = PyUnicode_DATA(result);
14642 llen = PyUnicode_GET_LENGTH(result);
14643 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014644 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014645 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014646 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014647 return NULL;
14648 }
14649 len = (int)llen;
14650 sign = buf[0] == '-';
14651 numnondigits += sign;
14652 numdigits = len - numnondigits;
14653 assert(numdigits > 0);
14654
14655 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014656 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014657 (type == 'o' || type == 'x' || type == 'X'))) {
14658 assert(buf[sign] == '0');
14659 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14660 buf[sign+1] == 'o');
14661 numnondigits -= 2;
14662 buf += 2;
14663 len -= 2;
14664 if (sign)
14665 buf[0] = '-';
14666 assert(len == numnondigits + numdigits);
14667 assert(numdigits > 0);
14668 }
14669
14670 /* Fill with leading zeroes to meet minimum width. */
14671 if (prec > numdigits) {
14672 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14673 numnondigits + prec);
14674 char *b1;
14675 if (!r1) {
14676 Py_DECREF(result);
14677 return NULL;
14678 }
14679 b1 = PyBytes_AS_STRING(r1);
14680 for (i = 0; i < numnondigits; ++i)
14681 *b1++ = *buf++;
14682 for (i = 0; i < prec - numdigits; i++)
14683 *b1++ = '0';
14684 for (i = 0; i < numdigits; i++)
14685 *b1++ = *buf++;
14686 *b1 = '\0';
14687 Py_DECREF(result);
14688 result = r1;
14689 buf = PyBytes_AS_STRING(result);
14690 len = numnondigits + prec;
14691 }
14692
14693 /* Fix up case for hex conversions. */
14694 if (type == 'X') {
14695 /* Need to convert all lower case letters to upper case.
14696 and need to convert 0x to 0X (and -0x to -0X). */
14697 for (i = 0; i < len; i++)
14698 if (buf[i] >= 'a' && buf[i] <= 'x')
14699 buf[i] -= 'a'-'A';
14700 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014701 if (!PyUnicode_Check(result)
14702 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014703 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014704 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014705 Py_DECREF(result);
14706 result = unicode;
14707 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014708 else if (len != PyUnicode_GET_LENGTH(result)) {
14709 if (PyUnicode_Resize(&result, len) < 0)
14710 Py_CLEAR(result);
14711 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014712 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014713}
14714
Ethan Furmandf3ed242014-01-05 06:50:30 -080014715/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014716 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014717 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014718 * -1 and raise an exception on error */
14719static int
Victor Stinnera47082312012-10-04 02:19:54 +020014720mainformatlong(PyObject *v,
14721 struct unicode_format_arg_t *arg,
14722 PyObject **p_output,
14723 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014724{
14725 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014726 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014727
14728 if (!PyNumber_Check(v))
14729 goto wrongtype;
14730
Ethan Furman9ab74802014-03-21 06:38:46 -070014731 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014732 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014733 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014734 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014735 }
14736 else {
14737 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014738 }
14739 if (iobj == NULL ) {
14740 if (PyErr_ExceptionMatches(PyExc_TypeError))
14741 goto wrongtype;
14742 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014743 }
14744 assert(PyLong_Check(iobj));
14745 }
14746 else {
14747 iobj = v;
14748 Py_INCREF(iobj);
14749 }
14750
14751 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014752 && arg->width == -1 && arg->prec == -1
14753 && !(arg->flags & (F_SIGN | F_BLANK))
14754 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014755 {
14756 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014757 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014758 int base;
14759
Victor Stinnera47082312012-10-04 02:19:54 +020014760 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014761 {
14762 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014763 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014764 case 'd':
14765 case 'i':
14766 case 'u':
14767 base = 10;
14768 break;
14769 case 'o':
14770 base = 8;
14771 break;
14772 case 'x':
14773 case 'X':
14774 base = 16;
14775 break;
14776 }
14777
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014778 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14779 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014780 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014781 }
14782 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014783 return 1;
14784 }
14785
Ethan Furmanb95b5612015-01-23 20:05:18 -080014786 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014787 Py_DECREF(iobj);
14788 if (res == NULL)
14789 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014790 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014791 return 0;
14792
14793wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014794 switch(type)
14795 {
14796 case 'o':
14797 case 'x':
14798 case 'X':
14799 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014800 "%%%c format: an integer is required, "
14801 "not %.200s",
14802 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014803 break;
14804 default:
14805 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014806 "%%%c format: a number is required, "
14807 "not %.200s",
14808 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014809 break;
14810 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014811 return -1;
14812}
14813
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014814static Py_UCS4
14815formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014816{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014817 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014818 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014819 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014820 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014821 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014822 goto onError;
14823 }
14824 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014825 int overflow;
14826 long x = PyLong_AsLongAndOverflow(v, &overflow);
14827 if (x == -1 && PyErr_Occurred()) {
14828 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014829 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014830 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014831 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014832 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014833
Victor Stinner8faf8212011-12-08 22:14:11 +010014834 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014835 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014836 PyErr_SetString(PyExc_OverflowError,
14837 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014838 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014839 }
14840
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014841 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014842 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014843
Benjamin Peterson29060642009-01-31 22:14:21 +000014844 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014845 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014846 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014847 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014848}
14849
Victor Stinnera47082312012-10-04 02:19:54 +020014850/* Parse options of an argument: flags, width, precision.
14851 Handle also "%(name)" syntax.
14852
14853 Return 0 if the argument has been formatted into arg->str.
14854 Return 1 if the argument has been written into ctx->writer,
14855 Raise an exception and return -1 on error. */
14856static int
14857unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14858 struct unicode_format_arg_t *arg)
14859{
14860#define FORMAT_READ(ctx) \
14861 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14862
14863 PyObject *v;
14864
Victor Stinnera47082312012-10-04 02:19:54 +020014865 if (arg->ch == '(') {
14866 /* Get argument value from a dictionary. Example: "%(name)s". */
14867 Py_ssize_t keystart;
14868 Py_ssize_t keylen;
14869 PyObject *key;
14870 int pcount = 1;
14871
14872 if (ctx->dict == NULL) {
14873 PyErr_SetString(PyExc_TypeError,
14874 "format requires a mapping");
14875 return -1;
14876 }
14877 ++ctx->fmtpos;
14878 --ctx->fmtcnt;
14879 keystart = ctx->fmtpos;
14880 /* Skip over balanced parentheses */
14881 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14882 arg->ch = FORMAT_READ(ctx);
14883 if (arg->ch == ')')
14884 --pcount;
14885 else if (arg->ch == '(')
14886 ++pcount;
14887 ctx->fmtpos++;
14888 }
14889 keylen = ctx->fmtpos - keystart - 1;
14890 if (ctx->fmtcnt < 0 || pcount > 0) {
14891 PyErr_SetString(PyExc_ValueError,
14892 "incomplete format key");
14893 return -1;
14894 }
14895 key = PyUnicode_Substring(ctx->fmtstr,
14896 keystart, keystart + keylen);
14897 if (key == NULL)
14898 return -1;
14899 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014900 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014901 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014902 }
14903 ctx->args = PyObject_GetItem(ctx->dict, key);
14904 Py_DECREF(key);
14905 if (ctx->args == NULL)
14906 return -1;
14907 ctx->args_owned = 1;
14908 ctx->arglen = -1;
14909 ctx->argidx = -2;
14910 }
14911
14912 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014913 while (--ctx->fmtcnt >= 0) {
14914 arg->ch = FORMAT_READ(ctx);
14915 ctx->fmtpos++;
14916 switch (arg->ch) {
14917 case '-': arg->flags |= F_LJUST; continue;
14918 case '+': arg->flags |= F_SIGN; continue;
14919 case ' ': arg->flags |= F_BLANK; continue;
14920 case '#': arg->flags |= F_ALT; continue;
14921 case '0': arg->flags |= F_ZERO; continue;
14922 }
14923 break;
14924 }
14925
14926 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014927 if (arg->ch == '*') {
14928 v = unicode_format_getnextarg(ctx);
14929 if (v == NULL)
14930 return -1;
14931 if (!PyLong_Check(v)) {
14932 PyErr_SetString(PyExc_TypeError,
14933 "* wants int");
14934 return -1;
14935 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014936 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014937 if (arg->width == -1 && PyErr_Occurred())
14938 return -1;
14939 if (arg->width < 0) {
14940 arg->flags |= F_LJUST;
14941 arg->width = -arg->width;
14942 }
14943 if (--ctx->fmtcnt >= 0) {
14944 arg->ch = FORMAT_READ(ctx);
14945 ctx->fmtpos++;
14946 }
14947 }
14948 else if (arg->ch >= '0' && arg->ch <= '9') {
14949 arg->width = arg->ch - '0';
14950 while (--ctx->fmtcnt >= 0) {
14951 arg->ch = FORMAT_READ(ctx);
14952 ctx->fmtpos++;
14953 if (arg->ch < '0' || arg->ch > '9')
14954 break;
14955 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14956 mixing signed and unsigned comparison. Since arg->ch is between
14957 '0' and '9', casting to int is safe. */
14958 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14959 PyErr_SetString(PyExc_ValueError,
14960 "width too big");
14961 return -1;
14962 }
14963 arg->width = arg->width*10 + (arg->ch - '0');
14964 }
14965 }
14966
14967 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014968 if (arg->ch == '.') {
14969 arg->prec = 0;
14970 if (--ctx->fmtcnt >= 0) {
14971 arg->ch = FORMAT_READ(ctx);
14972 ctx->fmtpos++;
14973 }
14974 if (arg->ch == '*') {
14975 v = unicode_format_getnextarg(ctx);
14976 if (v == NULL)
14977 return -1;
14978 if (!PyLong_Check(v)) {
14979 PyErr_SetString(PyExc_TypeError,
14980 "* wants int");
14981 return -1;
14982 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014983 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014984 if (arg->prec == -1 && PyErr_Occurred())
14985 return -1;
14986 if (arg->prec < 0)
14987 arg->prec = 0;
14988 if (--ctx->fmtcnt >= 0) {
14989 arg->ch = FORMAT_READ(ctx);
14990 ctx->fmtpos++;
14991 }
14992 }
14993 else if (arg->ch >= '0' && arg->ch <= '9') {
14994 arg->prec = arg->ch - '0';
14995 while (--ctx->fmtcnt >= 0) {
14996 arg->ch = FORMAT_READ(ctx);
14997 ctx->fmtpos++;
14998 if (arg->ch < '0' || arg->ch > '9')
14999 break;
15000 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15001 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015002 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015003 return -1;
15004 }
15005 arg->prec = arg->prec*10 + (arg->ch - '0');
15006 }
15007 }
15008 }
15009
15010 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15011 if (ctx->fmtcnt >= 0) {
15012 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15013 if (--ctx->fmtcnt >= 0) {
15014 arg->ch = FORMAT_READ(ctx);
15015 ctx->fmtpos++;
15016 }
15017 }
15018 }
15019 if (ctx->fmtcnt < 0) {
15020 PyErr_SetString(PyExc_ValueError,
15021 "incomplete format");
15022 return -1;
15023 }
15024 return 0;
15025
15026#undef FORMAT_READ
15027}
15028
15029/* Format one argument. Supported conversion specifiers:
15030
15031 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015032 - "i", "d", "u": int or float
15033 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015034 - "e", "E", "f", "F", "g", "G": float
15035 - "c": int or str (1 character)
15036
Victor Stinner8dbd4212012-12-04 09:30:24 +010015037 When possible, the output is written directly into the Unicode writer
15038 (ctx->writer). A string is created when padding is required.
15039
Victor Stinnera47082312012-10-04 02:19:54 +020015040 Return 0 if the argument has been formatted into *p_str,
15041 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015042 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015043static int
15044unicode_format_arg_format(struct unicode_formatter_t *ctx,
15045 struct unicode_format_arg_t *arg,
15046 PyObject **p_str)
15047{
15048 PyObject *v;
15049 _PyUnicodeWriter *writer = &ctx->writer;
15050
15051 if (ctx->fmtcnt == 0)
15052 ctx->writer.overallocate = 0;
15053
Victor Stinnera47082312012-10-04 02:19:54 +020015054 v = unicode_format_getnextarg(ctx);
15055 if (v == NULL)
15056 return -1;
15057
Victor Stinnera47082312012-10-04 02:19:54 +020015058
15059 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015060 case 's':
15061 case 'r':
15062 case 'a':
15063 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15064 /* Fast path */
15065 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15066 return -1;
15067 return 1;
15068 }
15069
15070 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15071 *p_str = v;
15072 Py_INCREF(*p_str);
15073 }
15074 else {
15075 if (arg->ch == 's')
15076 *p_str = PyObject_Str(v);
15077 else if (arg->ch == 'r')
15078 *p_str = PyObject_Repr(v);
15079 else
15080 *p_str = PyObject_ASCII(v);
15081 }
15082 break;
15083
15084 case 'i':
15085 case 'd':
15086 case 'u':
15087 case 'o':
15088 case 'x':
15089 case 'X':
15090 {
15091 int ret = mainformatlong(v, arg, p_str, writer);
15092 if (ret != 0)
15093 return ret;
15094 arg->sign = 1;
15095 break;
15096 }
15097
15098 case 'e':
15099 case 'E':
15100 case 'f':
15101 case 'F':
15102 case 'g':
15103 case 'G':
15104 if (arg->width == -1 && arg->prec == -1
15105 && !(arg->flags & (F_SIGN | F_BLANK)))
15106 {
15107 /* Fast path */
15108 if (formatfloat(v, arg, NULL, writer) == -1)
15109 return -1;
15110 return 1;
15111 }
15112
15113 arg->sign = 1;
15114 if (formatfloat(v, arg, p_str, NULL) == -1)
15115 return -1;
15116 break;
15117
15118 case 'c':
15119 {
15120 Py_UCS4 ch = formatchar(v);
15121 if (ch == (Py_UCS4) -1)
15122 return -1;
15123 if (arg->width == -1 && arg->prec == -1) {
15124 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015125 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015126 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015127 return 1;
15128 }
15129 *p_str = PyUnicode_FromOrdinal(ch);
15130 break;
15131 }
15132
15133 default:
15134 PyErr_Format(PyExc_ValueError,
15135 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015136 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015137 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15138 (int)arg->ch,
15139 ctx->fmtpos - 1);
15140 return -1;
15141 }
15142 if (*p_str == NULL)
15143 return -1;
15144 assert (PyUnicode_Check(*p_str));
15145 return 0;
15146}
15147
15148static int
15149unicode_format_arg_output(struct unicode_formatter_t *ctx,
15150 struct unicode_format_arg_t *arg,
15151 PyObject *str)
15152{
15153 Py_ssize_t len;
15154 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015155 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015156 Py_ssize_t pindex;
15157 Py_UCS4 signchar;
15158 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015159 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015160 Py_ssize_t sublen;
15161 _PyUnicodeWriter *writer = &ctx->writer;
15162 Py_UCS4 fill;
15163
15164 fill = ' ';
15165 if (arg->sign && arg->flags & F_ZERO)
15166 fill = '0';
15167
15168 if (PyUnicode_READY(str) == -1)
15169 return -1;
15170
15171 len = PyUnicode_GET_LENGTH(str);
15172 if ((arg->width == -1 || arg->width <= len)
15173 && (arg->prec == -1 || arg->prec >= len)
15174 && !(arg->flags & (F_SIGN | F_BLANK)))
15175 {
15176 /* Fast path */
15177 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15178 return -1;
15179 return 0;
15180 }
15181
15182 /* Truncate the string for "s", "r" and "a" formats
15183 if the precision is set */
15184 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15185 if (arg->prec >= 0 && len > arg->prec)
15186 len = arg->prec;
15187 }
15188
15189 /* Adjust sign and width */
15190 kind = PyUnicode_KIND(str);
15191 pbuf = PyUnicode_DATA(str);
15192 pindex = 0;
15193 signchar = '\0';
15194 if (arg->sign) {
15195 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15196 if (ch == '-' || ch == '+') {
15197 signchar = ch;
15198 len--;
15199 pindex++;
15200 }
15201 else if (arg->flags & F_SIGN)
15202 signchar = '+';
15203 else if (arg->flags & F_BLANK)
15204 signchar = ' ';
15205 else
15206 arg->sign = 0;
15207 }
15208 if (arg->width < len)
15209 arg->width = len;
15210
15211 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015212 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015213 if (!(arg->flags & F_LJUST)) {
15214 if (arg->sign) {
15215 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015216 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015217 }
15218 else {
15219 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015220 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015221 }
15222 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015223 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15224 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015225 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015226 }
15227
Victor Stinnera47082312012-10-04 02:19:54 +020015228 buflen = arg->width;
15229 if (arg->sign && len == arg->width)
15230 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015231 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015232 return -1;
15233
15234 /* Write the sign if needed */
15235 if (arg->sign) {
15236 if (fill != ' ') {
15237 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15238 writer->pos += 1;
15239 }
15240 if (arg->width > len)
15241 arg->width--;
15242 }
15243
15244 /* Write the numeric prefix for "x", "X" and "o" formats
15245 if the alternate form is used.
15246 For example, write "0x" for the "%#x" format. */
15247 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15248 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15249 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15250 if (fill != ' ') {
15251 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15252 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15253 writer->pos += 2;
15254 pindex += 2;
15255 }
15256 arg->width -= 2;
15257 if (arg->width < 0)
15258 arg->width = 0;
15259 len -= 2;
15260 }
15261
15262 /* Pad left with the fill character if needed */
15263 if (arg->width > len && !(arg->flags & F_LJUST)) {
15264 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015265 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015266 writer->pos += sublen;
15267 arg->width = len;
15268 }
15269
15270 /* If padding with spaces: write sign if needed and/or numeric prefix if
15271 the alternate form is used */
15272 if (fill == ' ') {
15273 if (arg->sign) {
15274 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15275 writer->pos += 1;
15276 }
15277 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15278 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15279 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15280 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15281 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15282 writer->pos += 2;
15283 pindex += 2;
15284 }
15285 }
15286
15287 /* Write characters */
15288 if (len) {
15289 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15290 str, pindex, len);
15291 writer->pos += len;
15292 }
15293
15294 /* Pad right with the fill character if needed */
15295 if (arg->width > len) {
15296 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015297 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015298 writer->pos += sublen;
15299 }
15300 return 0;
15301}
15302
15303/* Helper of PyUnicode_Format(): format one arg.
15304 Return 0 on success, raise an exception and return -1 on error. */
15305static int
15306unicode_format_arg(struct unicode_formatter_t *ctx)
15307{
15308 struct unicode_format_arg_t arg;
15309 PyObject *str;
15310 int ret;
15311
Victor Stinner8dbd4212012-12-04 09:30:24 +010015312 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015313 if (arg.ch == '%') {
15314 ctx->fmtpos++;
15315 ctx->fmtcnt--;
15316 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15317 return -1;
15318 return 0;
15319 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015320 arg.flags = 0;
15321 arg.width = -1;
15322 arg.prec = -1;
15323 arg.sign = 0;
15324 str = NULL;
15325
Victor Stinnera47082312012-10-04 02:19:54 +020015326 ret = unicode_format_arg_parse(ctx, &arg);
15327 if (ret == -1)
15328 return -1;
15329
15330 ret = unicode_format_arg_format(ctx, &arg, &str);
15331 if (ret == -1)
15332 return -1;
15333
15334 if (ret != 1) {
15335 ret = unicode_format_arg_output(ctx, &arg, str);
15336 Py_DECREF(str);
15337 if (ret == -1)
15338 return -1;
15339 }
15340
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015341 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015342 PyErr_SetString(PyExc_TypeError,
15343 "not all arguments converted during string formatting");
15344 return -1;
15345 }
15346 return 0;
15347}
15348
Alexander Belopolsky40018472011-02-26 01:02:56 +000015349PyObject *
15350PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015351{
Victor Stinnera47082312012-10-04 02:19:54 +020015352 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015353
Guido van Rossumd57fd912000-03-10 22:53:23 +000015354 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015355 PyErr_BadInternalCall();
15356 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015357 }
Victor Stinnera47082312012-10-04 02:19:54 +020015358
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015359 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015360 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015361
15362 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015363 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15364 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15365 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15366 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015367
Victor Stinner8f674cc2013-04-17 23:02:17 +020015368 _PyUnicodeWriter_Init(&ctx.writer);
15369 ctx.writer.min_length = ctx.fmtcnt + 100;
15370 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015371
Guido van Rossumd57fd912000-03-10 22:53:23 +000015372 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015373 ctx.arglen = PyTuple_Size(args);
15374 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015375 }
15376 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015377 ctx.arglen = -1;
15378 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015379 }
Victor Stinnera47082312012-10-04 02:19:54 +020015380 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015381 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015382 ctx.dict = args;
15383 else
15384 ctx.dict = NULL;
15385 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015386
Victor Stinnera47082312012-10-04 02:19:54 +020015387 while (--ctx.fmtcnt >= 0) {
15388 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015389 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015390
15391 nonfmtpos = ctx.fmtpos++;
15392 while (ctx.fmtcnt >= 0 &&
15393 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15394 ctx.fmtpos++;
15395 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015396 }
Victor Stinnera47082312012-10-04 02:19:54 +020015397 if (ctx.fmtcnt < 0) {
15398 ctx.fmtpos--;
15399 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015400 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015401
Victor Stinnercfc4c132013-04-03 01:48:39 +020015402 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15403 nonfmtpos, ctx.fmtpos) < 0)
15404 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015405 }
15406 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015407 ctx.fmtpos++;
15408 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015409 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015410 }
15411 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015412
Victor Stinnera47082312012-10-04 02:19:54 +020015413 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015414 PyErr_SetString(PyExc_TypeError,
15415 "not all arguments converted during string formatting");
15416 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015417 }
15418
Victor Stinnera47082312012-10-04 02:19:54 +020015419 if (ctx.args_owned) {
15420 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015421 }
Victor Stinnera47082312012-10-04 02:19:54 +020015422 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015423
Benjamin Peterson29060642009-01-31 22:14:21 +000015424 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015425 _PyUnicodeWriter_Dealloc(&ctx.writer);
15426 if (ctx.args_owned) {
15427 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015428 }
15429 return NULL;
15430}
15431
Jeremy Hylton938ace62002-07-17 16:30:39 +000015432static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015433unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15434
Tim Peters6d6c1a32001-08-02 04:15:00 +000015435static PyObject *
15436unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15437{
Benjamin Peterson29060642009-01-31 22:14:21 +000015438 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015439 static char *kwlist[] = {"object", "encoding", "errors", 0};
15440 char *encoding = NULL;
15441 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015442
Benjamin Peterson14339b62009-01-31 16:36:08 +000015443 if (type != &PyUnicode_Type)
15444 return unicode_subtype_new(type, args, kwds);
15445 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015446 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015447 return NULL;
15448 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015449 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015450 if (encoding == NULL && errors == NULL)
15451 return PyObject_Str(x);
15452 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015453 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015454}
15455
Guido van Rossume023fe02001-08-30 03:12:59 +000015456static PyObject *
15457unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15458{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015459 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015460 Py_ssize_t length, char_size;
15461 int share_wstr, share_utf8;
15462 unsigned int kind;
15463 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015464
Benjamin Peterson14339b62009-01-31 16:36:08 +000015465 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015466
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015467 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015468 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015469 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015470 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015471 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015472 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015473 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015474 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015475
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015476 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015477 if (self == NULL) {
15478 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015479 return NULL;
15480 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015481 kind = PyUnicode_KIND(unicode);
15482 length = PyUnicode_GET_LENGTH(unicode);
15483
15484 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015485#ifdef Py_DEBUG
15486 _PyUnicode_HASH(self) = -1;
15487#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015488 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015489#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015490 _PyUnicode_STATE(self).interned = 0;
15491 _PyUnicode_STATE(self).kind = kind;
15492 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015493 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015494 _PyUnicode_STATE(self).ready = 1;
15495 _PyUnicode_WSTR(self) = NULL;
15496 _PyUnicode_UTF8_LENGTH(self) = 0;
15497 _PyUnicode_UTF8(self) = NULL;
15498 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015499 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015500
15501 share_utf8 = 0;
15502 share_wstr = 0;
15503 if (kind == PyUnicode_1BYTE_KIND) {
15504 char_size = 1;
15505 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15506 share_utf8 = 1;
15507 }
15508 else if (kind == PyUnicode_2BYTE_KIND) {
15509 char_size = 2;
15510 if (sizeof(wchar_t) == 2)
15511 share_wstr = 1;
15512 }
15513 else {
15514 assert(kind == PyUnicode_4BYTE_KIND);
15515 char_size = 4;
15516 if (sizeof(wchar_t) == 4)
15517 share_wstr = 1;
15518 }
15519
15520 /* Ensure we won't overflow the length. */
15521 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15522 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015523 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015524 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015525 data = PyObject_MALLOC((length + 1) * char_size);
15526 if (data == NULL) {
15527 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015528 goto onError;
15529 }
15530
Victor Stinnerc3c74152011-10-02 20:39:55 +020015531 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015532 if (share_utf8) {
15533 _PyUnicode_UTF8_LENGTH(self) = length;
15534 _PyUnicode_UTF8(self) = data;
15535 }
15536 if (share_wstr) {
15537 _PyUnicode_WSTR_LENGTH(self) = length;
15538 _PyUnicode_WSTR(self) = (wchar_t *)data;
15539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015540
Christian Heimesf051e432016-09-13 20:22:02 +020015541 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015542 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015543 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015544#ifdef Py_DEBUG
15545 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15546#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015547 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015548 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015549
15550onError:
15551 Py_DECREF(unicode);
15552 Py_DECREF(self);
15553 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015554}
15555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015556PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015557"str(object='') -> str\n\
15558str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015559\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015560Create a new string object from the given object. If encoding or\n\
15561errors is specified, then the object must expose a data buffer\n\
15562that will be decoded using the given encoding and error handler.\n\
15563Otherwise, returns the result of object.__str__() (if defined)\n\
15564or repr(object).\n\
15565encoding defaults to sys.getdefaultencoding().\n\
15566errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015567
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015568static PyObject *unicode_iter(PyObject *seq);
15569
Guido van Rossumd57fd912000-03-10 22:53:23 +000015570PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015571 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015572 "str", /* tp_name */
15573 sizeof(PyUnicodeObject), /* tp_basicsize */
15574 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015575 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015576 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015577 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015578 0, /* tp_getattr */
15579 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015580 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015581 unicode_repr, /* tp_repr */
15582 &unicode_as_number, /* tp_as_number */
15583 &unicode_as_sequence, /* tp_as_sequence */
15584 &unicode_as_mapping, /* tp_as_mapping */
15585 (hashfunc) unicode_hash, /* tp_hash*/
15586 0, /* tp_call*/
15587 (reprfunc) unicode_str, /* tp_str */
15588 PyObject_GenericGetAttr, /* tp_getattro */
15589 0, /* tp_setattro */
15590 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015591 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015592 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15593 unicode_doc, /* tp_doc */
15594 0, /* tp_traverse */
15595 0, /* tp_clear */
15596 PyUnicode_RichCompare, /* tp_richcompare */
15597 0, /* tp_weaklistoffset */
15598 unicode_iter, /* tp_iter */
15599 0, /* tp_iternext */
15600 unicode_methods, /* tp_methods */
15601 0, /* tp_members */
15602 0, /* tp_getset */
15603 &PyBaseObject_Type, /* tp_base */
15604 0, /* tp_dict */
15605 0, /* tp_descr_get */
15606 0, /* tp_descr_set */
15607 0, /* tp_dictoffset */
15608 0, /* tp_init */
15609 0, /* tp_alloc */
15610 unicode_new, /* tp_new */
15611 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015612};
15613
15614/* Initialize the Unicode implementation */
15615
Victor Stinner331a6a52019-05-27 16:39:22 +020015616PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015617_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015618{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015619 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015620 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015621 0x000A, /* LINE FEED */
15622 0x000D, /* CARRIAGE RETURN */
15623 0x001C, /* FILE SEPARATOR */
15624 0x001D, /* GROUP SEPARATOR */
15625 0x001E, /* RECORD SEPARATOR */
15626 0x0085, /* NEXT LINE */
15627 0x2028, /* LINE SEPARATOR */
15628 0x2029, /* PARAGRAPH SEPARATOR */
15629 };
15630
Victor Stinner91698d82020-06-25 14:07:40 +020015631 struct _Py_unicode_state *state = &tstate->interp->unicode;
15632 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015633 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015634 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015635
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015636 if (_Py_IsMainInterpreter(tstate)) {
15637 /* initialize the linebreak bloom filter */
15638 bloom_linebreak = make_bloom_mask(
15639 PyUnicode_2BYTE_KIND, linebreak,
15640 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015641
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015642 if (PyType_Ready(&PyUnicode_Type) < 0) {
15643 return _PyStatus_ERR("Can't initialize unicode type");
15644 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015645
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015646 if (PyType_Ready(&EncodingMapType) < 0) {
15647 return _PyStatus_ERR("Can't initialize encoding map type");
15648 }
15649 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15650 return _PyStatus_ERR("Can't initialize field name iterator type");
15651 }
15652 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15653 return _PyStatus_ERR("Can't initialize formatter iter type");
15654 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015655 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015656 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015657}
15658
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015659
Walter Dörwald16807132007-05-25 13:52:07 +000015660void
15661PyUnicode_InternInPlace(PyObject **p)
15662{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015663 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015664#ifdef Py_DEBUG
15665 assert(s != NULL);
15666 assert(_PyUnicode_CHECK(s));
15667#else
Victor Stinner607b1022020-05-05 18:50:30 +020015668 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015669 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015670 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015671#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015672
Benjamin Peterson14339b62009-01-31 16:36:08 +000015673 /* If it's a subclass, we don't really know what putting
15674 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015675 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015676 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015677 }
15678
15679 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015680 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015681 }
15682
15683#ifdef INTERNED_STRINGS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015684 if (interned == NULL) {
15685 interned = PyDict_New();
15686 if (interned == NULL) {
15687 PyErr_Clear(); /* Don't leave an exception */
15688 return;
15689 }
15690 }
Victor Stinner607b1022020-05-05 18:50:30 +020015691
15692 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015693 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015694 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015695 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015696
Berker Peksagced8d4c2016-07-25 04:40:39 +030015697 if (t == NULL) {
15698 PyErr_Clear();
15699 return;
15700 }
Victor Stinner607b1022020-05-05 18:50:30 +020015701
Berker Peksagced8d4c2016-07-25 04:40:39 +030015702 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015703 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015704 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015705 return;
15706 }
Victor Stinner607b1022020-05-05 18:50:30 +020015707
Benjamin Peterson14339b62009-01-31 16:36:08 +000015708 /* The two references in interned are not counted by refcnt.
15709 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015710 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015711 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015712#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015713}
15714
15715void
15716PyUnicode_InternImmortal(PyObject **p)
15717{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015718 PyUnicode_InternInPlace(p);
15719 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015720 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015721 Py_INCREF(*p);
15722 }
Walter Dörwald16807132007-05-25 13:52:07 +000015723}
15724
15725PyObject *
15726PyUnicode_InternFromString(const char *cp)
15727{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015728 PyObject *s = PyUnicode_FromString(cp);
15729 if (s == NULL)
15730 return NULL;
15731 PyUnicode_InternInPlace(&s);
15732 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015733}
15734
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015735
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo interning for every key of the `interned` dict, then clear and
   release the dict itself.

   Does nothing when the dict does not exist (or is not a dict), or when
   its keys cannot be listed (the exception is cleared).  With
   INTERNED_STATS defined, the number of strings and their cumulated
   lengths are written to stderr. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }

    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    const Py_ssize_t count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %zd interned strings\n", count);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t pos = 0; pos < count; pos++) {
        PyObject *str = PyList_GET_ITEM(keys, pos);
        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }

        /* Number of references to add back: one for an immortal string,
           two for a mortal one. */
        Py_ssize_t restored_refs = 0;
        const int intern_state = PyUnicode_CHECK_INTERNED(str);
        if (intern_state == SSTATE_INTERNED_IMMORTAL) {
            restored_refs = 1;
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else if (intern_state == SSTATE_INTERNED_MORTAL) {
            restored_refs = 2;
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else {
            /* SSTATE_NOT_INTERNED or an unknown state: a key of the
               interned dict must be interned. */
            Py_UNREACHABLE();
        }

        Py_SET_REFCNT(str, Py_REFCNT(str) + restored_refs);
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr,
            "total size of all interned strings: %zd/%zd mortal/immortal\n",
            mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015795
15796
15797/********************* Unicode Iterator **************************/
15798
15799typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015800 PyObject_HEAD
15801 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015802 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015803} unicodeiterobject;
15804
15805static void
15806unicodeiter_dealloc(unicodeiterobject *it)
15807{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015808 _PyObject_GC_UNTRACK(it);
15809 Py_XDECREF(it->it_seq);
15810 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015811}
15812
15813static int
15814unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15815{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015816 Py_VISIT(it->it_seq);
15817 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015818}
15819
15820static PyObject *
15821unicodeiter_next(unicodeiterobject *it)
15822{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015823 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015824
Benjamin Peterson14339b62009-01-31 16:36:08 +000015825 assert(it != NULL);
15826 seq = it->it_seq;
15827 if (seq == NULL)
15828 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015829 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015831 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15832 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015833 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015834 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15835 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015836 if (item != NULL)
15837 ++it->it_index;
15838 return item;
15839 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015840
Benjamin Peterson14339b62009-01-31 16:36:08 +000015841 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015842 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015843 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015844}
15845
15846static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015847unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015848{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015849 Py_ssize_t len = 0;
15850 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015851 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015852 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015853}
15854
15855PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15856
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015857static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015858unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015859{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015860 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015861 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015862 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015863 it->it_seq, it->it_index);
15864 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015865 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015866 if (u == NULL)
15867 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015868 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015869 }
15870}
15871
15872PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15873
15874static PyObject *
15875unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15876{
15877 Py_ssize_t index = PyLong_AsSsize_t(state);
15878 if (index == -1 && PyErr_Occurred())
15879 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015880 if (it->it_seq != NULL) {
15881 if (index < 0)
15882 index = 0;
15883 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15884 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15885 it->it_index = index;
15886 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015887 Py_RETURN_NONE;
15888}
15889
15890PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15891
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015892static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015893 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015894 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015895 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15896 reduce_doc},
15897 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15898 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015899 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015900};
15901
15902PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015903 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15904 "str_iterator", /* tp_name */
15905 sizeof(unicodeiterobject), /* tp_basicsize */
15906 0, /* tp_itemsize */
15907 /* methods */
15908 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015909 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015910 0, /* tp_getattr */
15911 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015912 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015913 0, /* tp_repr */
15914 0, /* tp_as_number */
15915 0, /* tp_as_sequence */
15916 0, /* tp_as_mapping */
15917 0, /* tp_hash */
15918 0, /* tp_call */
15919 0, /* tp_str */
15920 PyObject_GenericGetAttr, /* tp_getattro */
15921 0, /* tp_setattro */
15922 0, /* tp_as_buffer */
15923 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15924 0, /* tp_doc */
15925 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15926 0, /* tp_clear */
15927 0, /* tp_richcompare */
15928 0, /* tp_weaklistoffset */
15929 PyObject_SelfIter, /* tp_iter */
15930 (iternextfunc)unicodeiter_next, /* tp_iternext */
15931 unicodeiter_methods, /* tp_methods */
15932 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015933};
15934
15935static PyObject *
15936unicode_iter(PyObject *seq)
15937{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015938 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015939
Benjamin Peterson14339b62009-01-31 16:36:08 +000015940 if (!PyUnicode_Check(seq)) {
15941 PyErr_BadInternalCall();
15942 return NULL;
15943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015944 if (PyUnicode_READY(seq) == -1)
15945 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015946 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15947 if (it == NULL)
15948 return NULL;
15949 it->it_index = 0;
15950 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015951 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015952 _PyObject_GC_TRACK(it);
15953 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015954}
15955
Victor Stinner709d23d2019-05-02 14:56:30 -040015956static int
15957encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015958{
Victor Stinner709d23d2019-05-02 14:56:30 -040015959 int res;
15960 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15961 if (res == -2) {
15962 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15963 return -1;
15964 }
15965 if (res < 0) {
15966 PyErr_NoMemory();
15967 return -1;
15968 }
15969 return 0;
15970}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015971
Victor Stinner709d23d2019-05-02 14:56:30 -040015972
15973static int
15974config_get_codec_name(wchar_t **config_encoding)
15975{
15976 char *encoding;
15977 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15978 return -1;
15979 }
15980
15981 PyObject *name_obj = NULL;
15982 PyObject *codec = _PyCodec_Lookup(encoding);
15983 PyMem_RawFree(encoding);
15984
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015985 if (!codec)
15986 goto error;
15987
15988 name_obj = PyObject_GetAttrString(codec, "name");
15989 Py_CLEAR(codec);
15990 if (!name_obj) {
15991 goto error;
15992 }
15993
Victor Stinner709d23d2019-05-02 14:56:30 -040015994 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15995 Py_DECREF(name_obj);
15996 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015997 goto error;
15998 }
15999
Victor Stinner709d23d2019-05-02 14:56:30 -040016000 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16001 if (raw_wname == NULL) {
16002 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016003 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016004 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016005 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016006
16007 PyMem_RawFree(*config_encoding);
16008 *config_encoding = raw_wname;
16009
16010 PyMem_Free(wname);
16011 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016012
16013error:
16014 Py_XDECREF(codec);
16015 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016016 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016017}
16018
16019
Victor Stinner331a6a52019-05-27 16:39:22 +020016020static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016021init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016022{
Victor Stinner709d23d2019-05-02 14:56:30 -040016023 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016024 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016025 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016026 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016027 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016028 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016029 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016030}
16031
16032
Victor Stinner709d23d2019-05-02 14:56:30 -040016033static int
16034init_fs_codec(PyInterpreterState *interp)
16035{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016036 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016037
16038 _Py_error_handler error_handler;
16039 error_handler = get_error_handler_wide(config->filesystem_errors);
16040 if (error_handler == _Py_ERROR_UNKNOWN) {
16041 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16042 return -1;
16043 }
16044
16045 char *encoding, *errors;
16046 if (encode_wstr_utf8(config->filesystem_encoding,
16047 &encoding,
16048 "filesystem_encoding") < 0) {
16049 return -1;
16050 }
16051
16052 if (encode_wstr_utf8(config->filesystem_errors,
16053 &errors,
16054 "filesystem_errors") < 0) {
16055 PyMem_RawFree(encoding);
16056 return -1;
16057 }
16058
Victor Stinner3d17c042020-05-14 01:48:38 +020016059 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16060 PyMem_RawFree(fs_codec->encoding);
16061 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016062 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016063 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16064 PyMem_RawFree(fs_codec->errors);
16065 fs_codec->errors = errors;
16066 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016067
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016068#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016069 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016070#endif
16071
Victor Stinner709d23d2019-05-02 14:56:30 -040016072 /* At this point, PyUnicode_EncodeFSDefault() and
16073 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16074 the C implementation of the filesystem encoding. */
16075
16076 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16077 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016078 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16079 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016080 PyErr_NoMemory();
16081 return -1;
16082 }
16083 return 0;
16084}
16085
16086
Victor Stinner331a6a52019-05-27 16:39:22 +020016087static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016088init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016089{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016090 PyInterpreterState *interp = tstate->interp;
16091
Victor Stinner709d23d2019-05-02 14:56:30 -040016092 /* Update the filesystem encoding to the normalized Python codec name.
16093 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16094 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016095 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016096 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016097 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016098 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016099 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016100 }
16101
Victor Stinner709d23d2019-05-02 14:56:30 -040016102 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016103 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016104 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016105 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016106}
16107
16108
Victor Stinner331a6a52019-05-27 16:39:22 +020016109PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016110_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016111{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016112 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016113 if (_PyStatus_EXCEPTION(status)) {
16114 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016115 }
16116
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016117 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016118}
16119
16120
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016121static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016122_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016123{
Victor Stinner3d17c042020-05-14 01:48:38 +020016124 PyMem_RawFree(fs_codec->encoding);
16125 fs_codec->encoding = NULL;
16126 fs_codec->utf8 = 0;
16127 PyMem_RawFree(fs_codec->errors);
16128 fs_codec->errors = NULL;
16129 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016130}
16131
16132
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set when
   the new config strings cannot be allocated (the config is then left
   unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both strings before touching the config, so that a failure
       cannot leave it half updated. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;

    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16158
16159
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016160void
Victor Stinner3d483342019-11-22 12:27:50 +010016161_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016162{
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016163 struct _Py_unicode_state *state = &tstate->interp->unicode;
16164
16165 int is_main_interp = _Py_IsMainInterpreter(tstate);
16166 if (is_main_interp) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016167#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016168 /* Insure++ is a memory analysis tool that aids in discovering
16169 * memory leaks and other memory problems. On Python exit, the
16170 * interned string dictionaries are flagged as being in use at exit
16171 * (which it is). Under normal circumstances, this is fine because
16172 * the memory will be automatically reclaimed by the system. Under
16173 * memory debugging, it's a huge source of useless noise, so we
16174 * trade off slower shutdown for less distraction in the memory
16175 * reports. -baw
16176 */
16177 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016178#endif /* __INSURE__ */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016179 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016180
Victor Stinner91698d82020-06-25 14:07:40 +020016181 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016182
Victor Stinner2f9ada92020-06-24 02:22:21 +020016183 for (Py_ssize_t i = 0; i < 256; i++) {
16184 Py_CLEAR(state->latin1[i]);
16185 }
16186
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016187 if (is_main_interp) {
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016188 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016189 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016190
Victor Stinner3d17c042020-05-14 01:48:38 +020016191 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016192}
16193
16194
Georg Brandl66c221e2010-10-14 07:04:07 +000016195/* A _string module, to export formatter_parser and formatter_field_name_split
16196 to the string.Formatter class implemented in Python. */
16197
16198static PyMethodDef _string_methods[] = {
16199 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16200 METH_O, PyDoc_STR("split the argument as a field name")},
16201 {"formatter_parser", (PyCFunction) formatter_parser,
16202 METH_O, PyDoc_STR("parse the argument as a format string")},
16203 {NULL, NULL}
16204};
16205
16206static struct PyModuleDef _string_module = {
16207 PyModuleDef_HEAD_INIT,
16208 "_string",
16209 PyDoc_STR("string helper module"),
16210 0,
16211 _string_methods,
16212 NULL,
16213 NULL,
16214 NULL,
16215 NULL
16216};
16217
16218PyMODINIT_FUNC
16219PyInit__string(void)
16220{
16221 return PyModule_Create(&_string_module);
16222}
16223
16224
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016225#ifdef __cplusplus
16226}
16227#endif