blob: 55c886727ba2edd9886bbb32b5f3475686f4cff4 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner91698d82020-06-25 14:07:40 +020044#include "pycore_bytes_methods.h" // _Py_bytes_lower()
45#include "pycore_initconfig.h" // _PyStatus_OK()
Victor Stinnere5014be2020-04-14 17:52:15 +020046#include "pycore_interp.h" // PyInterpreterState.fs_codec
Victor Stinner91698d82020-06-25 14:07:40 +020047#include "pycore_object.h" // _PyObject_GC_TRACK()
48#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
49#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
Victor Stinnere5014be2020-04-14 17:52:15 +020050#include "pycore_pystate.h" // _PyInterpreterState_GET()
Victor Stinner91698d82020-06-25 14:07:40 +020051#include "ucnhash.h" // _PyUnicode_Name_CAPI
52#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000054#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000055#include <windows.h>
56#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000057
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
60/* #define INTERNED_STATS 1 */
61
62
Larry Hastings61272b72014-01-07 12:41:53 -080063/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090064class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080065[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090066/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
67
68/*[python input]
69class Py_UCS4_converter(CConverter):
70 type = 'Py_UCS4'
71 converter = 'convert_uc'
72
73 def converter_init(self):
74 if self.default is not unspecified:
75 self.c_default = ascii(self.default)
76 if len(self.c_default) > 4 or self.c_default[0] != "'":
77 self.c_default = hex(ord(self.default))
78
79[python start generated code]*/
80/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080081
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000082/* --- Globals ------------------------------------------------------------
83
Serhiy Storchaka05997252013-01-26 12:14:02 +020084NOTE: In the interpreter's initialization phase, some globals are currently
85 initialized dynamically as needed. In the process Unicode objects may
86 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* Highest valid Unicode code point: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10ffff

/* Object check used by the accessor macros of this file: debug builds run
   _PyUnicode_CheckConsistency() with check_content=0, all other builds only
   call PyUnicode_Check(). */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200103
/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string. For a compact ASCII object this is the
   memory located directly after the PyASCIIObject header, otherwise it is
   the utf8 field. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((char*)((PyASCIIObject*)(op) + 1))              \
         : _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the length field for a compact ASCII
   object, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((PyASCIIObject*)(op))->length                   \
         : _PyUnicode_UTF8_LENGTH(op))

/* Unchecked access to the wstr field of a PyASCIIObject. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900122
/* Don't use the deprecated macro of unicodeobject.h: replace it with a local
   definition.

   Length of the wstr representation: the length field for a compact ASCII
   object, otherwise the wstr_length field.  The argument is parenthesized
   before each cast so that an expression argument (for example "arr + 1")
   is cast as a whole. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op)                           \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((PyASCIIObject*)(op))->length :                   \
         ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Unchecked field accessors: each one casts op to the structure declaring
   the field and yields the field itself. */
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)      (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)       (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)        (((PyASCIIObject *)(op))->hash)

/* Accessors for state.kind and length that first assert
   _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)               \
    (assert(_PyUnicode_CHECK(op)),              \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200145
/* Local replacement for PyUnicode_READY(): asserts _PyUnicode_CHECK(op),
   then evaluates to 0 when the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     (!PyUnicode_IS_READY(op)                   \
          ? _PyUnicode_Ready(op)                \
          : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200152
/* True if the utf8 pointer of op is the same pointer as its character data.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the same pointer as its character data.
   The macro only refers to its own parameter "op"; it does not require a
   variable named "unicode" to exist at the expansion site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
160
/* True if the Unicode object owns a separately allocated UTF-8 memory block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op) &&                \
      _PyUnicode_UTF8(op) &&                            \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr memory block:
   its wstr pointer is set, and the object is either not ready or its wstr
   pointer is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
174
/* Generic helper macro copying characters between buffers of different
   character types, casting each element to to_type.

   from_type and to_type must be valid type names.  begin and end delimit the
   source characters ("from_type *" pointers); to points to the destination
   buffer ("to_type *").  The bulk of the input is copied four elements per
   iteration, the remaining (at most three) elements one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)   \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t _count = (_end) - (_iter);                           \
        const from_type *_bulk_end =                                    \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                     \
        for (; _iter < (_bulk_end); _iter += 4, _to += 4) {             \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < (_end); _iter++, _to++) {                        \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200198
#if defined(MS_WINDOWS)
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
206
Victor Stinner607b1022020-05-05 18:50:30 +0200207/* bpo-40521: Interned strings are shared by all interpreters. */
208#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
209# define INTERNED_STRINGS
210#endif
211
Walter Dörwald16807132007-05-25 13:52:07 +0000212/* This dictionary holds all interned unicode strings. Note that references
213 to strings in this dictionary are *not* counted in the string's ob_refcnt.
214 When the interned string reaches a refcnt of 0 the string deallocation
215 function will delete the reference from this dictionary.
216
217 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000218 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000219*/
Victor Stinner607b1022020-05-05 18:50:30 +0200220#ifdef INTERNED_STRINGS
Serhiy Storchaka05997252013-01-26 12:14:02 +0200221static PyObject *interned = NULL;
Victor Stinner607b1022020-05-05 18:50:30 +0200222#endif
Walter Dörwald16807132007-05-25 13:52:07 +0000223
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200224static struct _Py_unicode_state*
225get_unicode_state(void)
226{
227 PyInterpreterState *interp = _PyInterpreterState_GET();
228 return &interp->unicode;
229}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200230
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200232// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200233static inline PyObject* unicode_get_empty(void)
234{
235 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200236 // unicode_get_empty() must not be called before _PyUnicode_Init()
237 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200238 assert(state->empty_string != NULL);
239 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200240}
241
Victor Stinner91698d82020-06-25 14:07:40 +0200242
243// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200244static inline PyObject* unicode_new_empty(void)
245{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200246 PyObject *empty = unicode_get_empty();
247 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200248 return empty;
249}
250
/* Return a strong reference to the empty string singleton from the
   function in which the macro is expanded. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        return unicode_new_empty();                     \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000255
Victor Stinner59423e32018-11-26 13:40:01 +0100256static inline void
257unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
258 Py_ssize_t start, Py_ssize_t length)
259{
260 assert(0 <= start);
261 assert(kind != PyUnicode_WCHAR_KIND);
262 switch (kind) {
263 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100264 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100265 Py_UCS1 ch = (unsigned char)value;
266 Py_UCS1 *to = (Py_UCS1 *)data + start;
267 memset(to, ch, length);
268 break;
269 }
270 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100271 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100272 Py_UCS2 ch = (Py_UCS2)value;
273 Py_UCS2 *to = (Py_UCS2 *)data + start;
274 const Py_UCS2 *end = to + length;
275 for (; to < end; ++to) *to = ch;
276 break;
277 }
278 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100279 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100280 Py_UCS4 ch = value;
281 Py_UCS4 * to = (Py_UCS4 *)data + start;
282 const Py_UCS4 *end = to + length;
283 for (; to < end; ++to) *to = ch;
284 break;
285 }
286 default: Py_UNREACHABLE();
287 }
288}
289
290
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200291/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700292static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200293_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900294static inline void
295_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400296static PyObject *
297unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
298 const char *errors);
299static PyObject *
300unicode_decode_utf8(const char *s, Py_ssize_t size,
301 _Py_error_handler error_handler, const char *errors,
302 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200303
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200304/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200305static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200306
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by code point, 1 for whitespace and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
337
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200338/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200339static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200340static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100341static int unicode_modifiable(PyObject *unicode);
342
Victor Stinnerfe226c02011-10-03 03:52:20 +0200343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100345_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200346static PyObject *
347_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
348static PyObject *
349_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
350
351static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000353 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100354 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000355 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
356
Alexander Belopolsky40018472011-02-26 01:02:56 +0000357static void
358raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300359 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100360 PyObject *unicode,
361 Py_ssize_t startpos, Py_ssize_t endpos,
362 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000363
/* Fast detection of line break characters: a 128-entry table indexed by
   code point, 1 for a line break and 0 for everything else. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
391
INADA Naoki3ae20562017-01-16 20:41:20 +0900392static int convert_uc(PyObject *obj, void *addr);
393
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300394#include "clinic/unicodeobject.c.h"
395
Victor Stinner3d4226a2018-08-29 22:21:32 +0200396_Py_error_handler
397_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200398{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200400 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200401 }
402 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200403 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200404 }
405 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200406 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200407 }
408 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200409 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200410 }
411 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200412 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200413 }
414 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200415 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200416 }
417 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200418 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200419 }
Victor Stinner50149202015-09-22 00:26:54 +0200420 return _Py_ERROR_OTHER;
421}
422
Victor Stinner709d23d2019-05-02 14:56:30 -0400423
424static _Py_error_handler
425get_error_handler_wide(const wchar_t *errors)
426{
427 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428 return _Py_ERROR_STRICT;
429 }
430 if (wcscmp(errors, L"surrogateescape") == 0) {
431 return _Py_ERROR_SURROGATEESCAPE;
432 }
433 if (wcscmp(errors, L"replace") == 0) {
434 return _Py_ERROR_REPLACE;
435 }
436 if (wcscmp(errors, L"ignore") == 0) {
437 return _Py_ERROR_IGNORE;
438 }
439 if (wcscmp(errors, L"backslashreplace") == 0) {
440 return _Py_ERROR_BACKSLASHREPLACE;
441 }
442 if (wcscmp(errors, L"surrogatepass") == 0) {
443 return _Py_ERROR_SURROGATEPASS;
444 }
445 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446 return _Py_ERROR_XMLCHARREFREPLACE;
447 }
448 return _Py_ERROR_OTHER;
449}
450
451
Victor Stinner22eb6892019-06-26 00:51:05 +0200452static inline int
453unicode_check_encoding_errors(const char *encoding, const char *errors)
454{
455 if (encoding == NULL && errors == NULL) {
456 return 0;
457 }
458
Victor Stinner81a7be32020-04-14 15:14:01 +0200459 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200460#ifndef Py_DEBUG
461 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200462 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200463 return 0;
464 }
465#else
466 /* Always check in debug mode */
467#endif
468
469 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200471 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200472 return 0;
473 }
474
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200475 /* Disable checks during Python finalization. For example, it allows to
476 call _PyObject_Dump() during finalization for debugging purpose. */
477 if (interp->finalizing) {
478 return 0;
479 }
480
Victor Stinner22eb6892019-06-26 00:51:05 +0200481 if (encoding != NULL) {
482 PyObject *handler = _PyCodec_Lookup(encoding);
483 if (handler == NULL) {
484 return -1;
485 }
486 Py_DECREF(handler);
487 }
488
489 if (errors != NULL) {
490 PyObject *handler = PyCodec_LookupError(errors);
491 if (handler == NULL) {
492 return -1;
493 }
494 Py_DECREF(handler);
495 }
496 return 0;
497}
498
499
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300500/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
501 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000502Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000503PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000504{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000505#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000506 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000507#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000508 /* This is actually an illegal character, so it should
509 not be passed to unichr. */
510 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000511#endif
512}
513
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200514int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100515_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200516{
Victor Stinner68762572019-10-07 18:42:01 +0200517#define CHECK(expr) \
518 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
519
Victor Stinner910337b2011-10-03 03:20:16 +0200520 PyASCIIObject *ascii;
521 unsigned int kind;
522
Victor Stinner68762572019-10-07 18:42:01 +0200523 assert(op != NULL);
524 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200525
526 ascii = (PyASCIIObject *)op;
527 kind = ascii->state.kind;
528
Victor Stinnera3b334d2011-10-03 13:53:37 +0200529 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200530 CHECK(kind == PyUnicode_1BYTE_KIND);
531 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200532 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200534 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200535 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200536
Victor Stinnera41463c2011-10-04 01:05:08 +0200537 if (ascii->state.compact == 1) {
538 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200539 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200540 || kind == PyUnicode_2BYTE_KIND
541 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200542 CHECK(ascii->state.ascii == 0);
543 CHECK(ascii->state.ready == 1);
544 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100545 }
546 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200547 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
548
549 data = unicode->data.any;
550 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200551 CHECK(ascii->length == 0);
552 CHECK(ascii->hash == -1);
553 CHECK(ascii->state.compact == 0);
554 CHECK(ascii->state.ascii == 0);
555 CHECK(ascii->state.ready == 0);
556 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
557 CHECK(ascii->wstr != NULL);
558 CHECK(data == NULL);
559 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 }
561 else {
Victor Stinner68762572019-10-07 18:42:01 +0200562 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200563 || kind == PyUnicode_2BYTE_KIND
564 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200565 CHECK(ascii->state.compact == 0);
566 CHECK(ascii->state.ready == 1);
567 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200568 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200569 CHECK(compact->utf8 == data);
570 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200571 }
572 else
Victor Stinner68762572019-10-07 18:42:01 +0200573 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200574 }
575 }
576 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200577 if (
578#if SIZEOF_WCHAR_T == 2
579 kind == PyUnicode_2BYTE_KIND
580#else
581 kind == PyUnicode_4BYTE_KIND
582#endif
583 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200584 {
Victor Stinner68762572019-10-07 18:42:01 +0200585 CHECK(ascii->wstr == data);
586 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200587 } else
Victor Stinner68762572019-10-07 18:42:01 +0200588 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200589 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200590
591 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200592 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200593 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200594 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200595 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200596
597 /* check that the best kind is used: O(n) operation */
598 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200599 Py_ssize_t i;
600 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300601 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200602 Py_UCS4 ch;
603
604 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200605 for (i=0; i < ascii->length; i++)
606 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200607 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200608 if (ch > maxchar)
609 maxchar = ch;
610 }
611 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100612 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200613 CHECK(maxchar >= 128);
614 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100615 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200616 else
Victor Stinner68762572019-10-07 18:42:01 +0200617 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200618 }
Victor Stinner77faf692011-11-20 18:56:05 +0100619 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200620 CHECK(maxchar >= 0x100);
621 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100622 }
623 else {
Victor Stinner68762572019-10-07 18:42:01 +0200624 CHECK(maxchar >= 0x10000);
625 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100626 }
Victor Stinner68762572019-10-07 18:42:01 +0200627 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200628 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400629 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200630
631#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400632}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200633
Victor Stinner910337b2011-10-03 03:20:16 +0200634
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100635static PyObject*
636unicode_result_wchar(PyObject *unicode)
637{
638#ifndef Py_DEBUG
639 Py_ssize_t len;
640
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100641 len = _PyUnicode_WSTR_LENGTH(unicode);
642 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100643 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200644 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100645 }
646
647 if (len == 1) {
648 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100649 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100650 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200651 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100652 }
653 }
654
655 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200656 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100657 return NULL;
658 }
659#else
Victor Stinneraa771272012-10-04 02:32:58 +0200660 assert(Py_REFCNT(unicode) == 1);
661
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100662 /* don't make the result ready in debug mode to ensure that the caller
663 makes the string ready before using it */
664 assert(_PyUnicode_CheckConsistency(unicode, 1));
665#endif
666 return unicode;
667}
668
669static PyObject*
670unicode_result_ready(PyObject *unicode)
671{
672 Py_ssize_t length;
673
674 length = PyUnicode_GET_LENGTH(unicode);
675 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200676 PyObject *empty = unicode_get_empty();
677 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100678 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200679 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100680 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200681 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100682 }
683
684 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200685 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200686 if (kind == PyUnicode_1BYTE_KIND) {
687 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
688 Py_UCS1 ch = data[0];
689 struct _Py_unicode_state *state = get_unicode_state();
690 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100691 if (latin1_char != NULL) {
692 if (unicode != latin1_char) {
693 Py_INCREF(latin1_char);
694 Py_DECREF(unicode);
695 }
696 return latin1_char;
697 }
698 else {
699 assert(_PyUnicode_CheckConsistency(unicode, 1));
700 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200701 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100702 return unicode;
703 }
704 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200705 else {
706 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
707 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100708 }
709
710 assert(_PyUnicode_CheckConsistency(unicode, 1));
711 return unicode;
712}
713
714static PyObject*
715unicode_result(PyObject *unicode)
716{
717 assert(_PyUnicode_CHECK(unicode));
718 if (PyUnicode_IS_READY(unicode))
719 return unicode_result_ready(unicode);
720 else
721 return unicode_result_wchar(unicode);
722}
723
Victor Stinnerc4b49542011-12-11 22:44:26 +0100724static PyObject*
725unicode_result_unchanged(PyObject *unicode)
726{
727 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500728 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100729 return NULL;
730 Py_INCREF(unicode);
731 return unicode;
732 }
733 else
734 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100735 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100736}
737
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200738/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
739 ASCII, Latin1, UTF-8, etc. */
740static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200741backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200742 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
743{
Victor Stinnerad771582015-10-09 12:38:53 +0200744 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200745 Py_UCS4 ch;
746 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300747 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200748
749 assert(PyUnicode_IS_READY(unicode));
750 kind = PyUnicode_KIND(unicode);
751 data = PyUnicode_DATA(unicode);
752
753 size = 0;
754 /* determine replacement size */
755 for (i = collstart; i < collend; ++i) {
756 Py_ssize_t incr;
757
758 ch = PyUnicode_READ(kind, data, i);
759 if (ch < 0x100)
760 incr = 2+2;
761 else if (ch < 0x10000)
762 incr = 2+4;
763 else {
764 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200765 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200766 }
767 if (size > PY_SSIZE_T_MAX - incr) {
768 PyErr_SetString(PyExc_OverflowError,
769 "encoded result is too long for a Python string");
770 return NULL;
771 }
772 size += incr;
773 }
774
Victor Stinnerad771582015-10-09 12:38:53 +0200775 str = _PyBytesWriter_Prepare(writer, str, size);
776 if (str == NULL)
777 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200778
779 /* generate replacement */
780 for (i = collstart; i < collend; ++i) {
781 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200782 *str++ = '\\';
783 if (ch >= 0x00010000) {
784 *str++ = 'U';
785 *str++ = Py_hexdigits[(ch>>28)&0xf];
786 *str++ = Py_hexdigits[(ch>>24)&0xf];
787 *str++ = Py_hexdigits[(ch>>20)&0xf];
788 *str++ = Py_hexdigits[(ch>>16)&0xf];
789 *str++ = Py_hexdigits[(ch>>12)&0xf];
790 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200791 }
Victor Stinner797485e2015-10-09 03:17:30 +0200792 else if (ch >= 0x100) {
793 *str++ = 'u';
794 *str++ = Py_hexdigits[(ch>>12)&0xf];
795 *str++ = Py_hexdigits[(ch>>8)&0xf];
796 }
797 else
798 *str++ = 'x';
799 *str++ = Py_hexdigits[(ch>>4)&0xf];
800 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200801 }
802 return str;
803}
804
805/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
806 ASCII, Latin1, UTF-8, etc. */
807static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200808xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200809 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
810{
Victor Stinnerad771582015-10-09 12:38:53 +0200811 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200812 Py_UCS4 ch;
813 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300814 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200815
816 assert(PyUnicode_IS_READY(unicode));
817 kind = PyUnicode_KIND(unicode);
818 data = PyUnicode_DATA(unicode);
819
820 size = 0;
821 /* determine replacement size */
822 for (i = collstart; i < collend; ++i) {
823 Py_ssize_t incr;
824
825 ch = PyUnicode_READ(kind, data, i);
826 if (ch < 10)
827 incr = 2+1+1;
828 else if (ch < 100)
829 incr = 2+2+1;
830 else if (ch < 1000)
831 incr = 2+3+1;
832 else if (ch < 10000)
833 incr = 2+4+1;
834 else if (ch < 100000)
835 incr = 2+5+1;
836 else if (ch < 1000000)
837 incr = 2+6+1;
838 else {
839 assert(ch <= MAX_UNICODE);
840 incr = 2+7+1;
841 }
842 if (size > PY_SSIZE_T_MAX - incr) {
843 PyErr_SetString(PyExc_OverflowError,
844 "encoded result is too long for a Python string");
845 return NULL;
846 }
847 size += incr;
848 }
849
Victor Stinnerad771582015-10-09 12:38:53 +0200850 str = _PyBytesWriter_Prepare(writer, str, size);
851 if (str == NULL)
852 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200853
854 /* generate replacement */
855 for (i = collstart; i < collend; ++i) {
856 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
857 }
858 return str;
859}
860
Thomas Wouters477c8d52006-05-27 19:21:47 +0000861/* --- Bloom Filters ----------------------------------------------------- */
862
863/* stuff to implement simple "bloom filters" for Unicode characters.
864 to keep things simple, we use a single bitmask, using the least 5
865 bits from each unicode characters as the bit index. */
866
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200867/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000868
Antoine Pitrouf068f942010-01-13 14:19:12 +0000869#if LONG_BIT >= 128
870#define BLOOM_WIDTH 128
871#elif LONG_BIT >= 64
872#define BLOOM_WIDTH 64
873#elif LONG_BIT >= 32
874#define BLOOM_WIDTH 32
875#else
876#error "LONG_BIT is smaller than 32"
877#endif
878
Thomas Wouters477c8d52006-05-27 19:21:47 +0000879#define BLOOM_MASK unsigned long
880
Serhiy Storchaka05997252013-01-26 12:14:02 +0200881static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000882
Antoine Pitrouf068f942010-01-13 14:19:12 +0000883#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884
Benjamin Peterson29060642009-01-31 22:14:21 +0000885#define BLOOM_LINEBREAK(ch) \
886 ((ch) < 128U ? ascii_linebreak[(ch)] : \
887 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000888
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700889static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300890make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000891{
Victor Stinnera85af502013-04-09 21:53:54 +0200892#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
893 do { \
894 TYPE *data = (TYPE *)PTR; \
895 TYPE *end = data + LEN; \
896 Py_UCS4 ch; \
897 for (; data != end; data++) { \
898 ch = *data; \
899 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
900 } \
901 break; \
902 } while (0)
903
Thomas Wouters477c8d52006-05-27 19:21:47 +0000904 /* calculate simple bloom-style bitmask for a given unicode string */
905
Antoine Pitrouf068f942010-01-13 14:19:12 +0000906 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000907
908 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200909 switch (kind) {
910 case PyUnicode_1BYTE_KIND:
911 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
912 break;
913 case PyUnicode_2BYTE_KIND:
914 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
915 break;
916 case PyUnicode_4BYTE_KIND:
917 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
918 break;
919 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700920 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200921 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000922 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200923
924#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000925}
926
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300927static int
928ensure_unicode(PyObject *obj)
929{
930 if (!PyUnicode_Check(obj)) {
931 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200932 "must be str, not %.100s",
933 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300934 return -1;
935 }
936 return PyUnicode_READY(obj);
937}
938
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200939/* Compilation of templated routines */
940
Victor Stinner90ed8a62020-06-24 00:34:07 +0200941#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200942
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200943#include "stringlib/asciilib.h"
944#include "stringlib/fastsearch.h"
945#include "stringlib/partition.h"
946#include "stringlib/split.h"
947#include "stringlib/count.h"
948#include "stringlib/find.h"
949#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200950#include "stringlib/undef.h"
951
952#include "stringlib/ucs1lib.h"
953#include "stringlib/fastsearch.h"
954#include "stringlib/partition.h"
955#include "stringlib/split.h"
956#include "stringlib/count.h"
957#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300958#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200959#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200960#include "stringlib/undef.h"
961
962#include "stringlib/ucs2lib.h"
963#include "stringlib/fastsearch.h"
964#include "stringlib/partition.h"
965#include "stringlib/split.h"
966#include "stringlib/count.h"
967#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300968#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200969#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200970#include "stringlib/undef.h"
971
972#include "stringlib/ucs4lib.h"
973#include "stringlib/fastsearch.h"
974#include "stringlib/partition.h"
975#include "stringlib/split.h"
976#include "stringlib/count.h"
977#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300978#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200979#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200980#include "stringlib/undef.h"
981
Inada Naoki2c4928d2020-06-17 20:09:44 +0900982_Py_COMP_DIAG_PUSH
983_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200984#include "stringlib/unicodedefs.h"
985#include "stringlib/fastsearch.h"
986#include "stringlib/count.h"
987#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100988#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900989_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200990
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200991#undef STRINGLIB_GET_EMPTY
992
Guido van Rossumd57fd912000-03-10 22:53:23 +0000993/* --- Unicode Object ----------------------------------------------------- */
994
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700995static inline Py_ssize_t
996findchar(const void *s, int kind,
997 Py_ssize_t size, Py_UCS4 ch,
998 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200999{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001000 switch (kind) {
1001 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001002 if ((Py_UCS1) ch != ch)
1003 return -1;
1004 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001005 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001006 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001007 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001008 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001009 if ((Py_UCS2) ch != ch)
1010 return -1;
1011 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001012 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001013 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001014 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001015 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001016 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -06001017 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +02001018 else
Andy Lestere6be9b52020-02-11 20:28:35 -06001019 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02001020 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07001021 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001023}
1024
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to (not including) the
   current length are overwritten, each byte being set to 0xff; nothing is
   done when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* The kind value is used as the number of bytes per character. */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1043
Victor Stinnerfe226c02011-10-03 03:52:20 +02001044static PyObject*
1045resize_compact(PyObject *unicode, Py_ssize_t length)
1046{
1047 Py_ssize_t char_size;
1048 Py_ssize_t struct_size;
1049 Py_ssize_t new_size;
1050 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001051 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001052#ifdef Py_DEBUG
1053 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1054#endif
1055
Victor Stinner79891572012-05-03 13:43:07 +02001056 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001057 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001058 assert(PyUnicode_IS_COMPACT(unicode));
1059
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001060 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001061 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 struct_size = sizeof(PyASCIIObject);
1063 else
1064 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001065 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001066
Victor Stinnerfe226c02011-10-03 03:52:20 +02001067 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1068 PyErr_NoMemory();
1069 return NULL;
1070 }
1071 new_size = (struct_size + (length + 1) * char_size);
1072
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001073 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1074 PyObject_DEL(_PyUnicode_UTF8(unicode));
1075 _PyUnicode_UTF8(unicode) = NULL;
1076 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1077 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001078#ifdef Py_REF_DEBUG
1079 _Py_RefTotal--;
1080#endif
1081#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001082 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001083#endif
Victor Stinner84def372011-12-11 20:04:56 +01001084
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001085 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001086 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001087 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001088 PyErr_NoMemory();
1089 return NULL;
1090 }
Victor Stinner84def372011-12-11 20:04:56 +01001091 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001092 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001093
Victor Stinnerfe226c02011-10-03 03:52:20 +02001094 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001095 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001096 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001097 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001098 _PyUnicode_WSTR_LENGTH(unicode) = length;
1099 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001100 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1101 PyObject_DEL(_PyUnicode_WSTR(unicode));
1102 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001103 if (!PyUnicode_IS_ASCII(unicode))
1104 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001105 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001106#ifdef Py_DEBUG
1107 unicode_fill_invalid(unicode, old_length);
1108#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1110 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001111 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 return unicode;
1113}
1114
Alexander Belopolsky40018472011-02-26 01:02:56 +00001115static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001116resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117{
Victor Stinner95663112011-10-04 01:03:50 +02001118 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001119 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001122
Victor Stinnerfe226c02011-10-03 03:52:20 +02001123 if (PyUnicode_IS_READY(unicode)) {
1124 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001125 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001126 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001127#ifdef Py_DEBUG
1128 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1129#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001130
1131 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001132 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001133 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1134 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135
1136 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1137 PyErr_NoMemory();
1138 return -1;
1139 }
1140 new_size = (length + 1) * char_size;
1141
Victor Stinner7a9105a2011-12-12 00:13:42 +01001142 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1143 {
1144 PyObject_DEL(_PyUnicode_UTF8(unicode));
1145 _PyUnicode_UTF8(unicode) = NULL;
1146 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1147 }
1148
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 data = (PyObject *)PyObject_REALLOC(data, new_size);
1150 if (data == NULL) {
1151 PyErr_NoMemory();
1152 return -1;
1153 }
1154 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001155 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001157 _PyUnicode_WSTR_LENGTH(unicode) = length;
1158 }
1159 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001160 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001161 _PyUnicode_UTF8_LENGTH(unicode) = length;
1162 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001163 _PyUnicode_LENGTH(unicode) = length;
1164 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001165#ifdef Py_DEBUG
1166 unicode_fill_invalid(unicode, old_length);
1167#endif
Victor Stinner95663112011-10-04 01:03:50 +02001168 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001169 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001170 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001171 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001172 }
Victor Stinner95663112011-10-04 01:03:50 +02001173 assert(_PyUnicode_WSTR(unicode) != NULL);
1174
1175 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001176 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001177 PyErr_NoMemory();
1178 return -1;
1179 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001180 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001181 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001182 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001183 if (!wstr) {
1184 PyErr_NoMemory();
1185 return -1;
1186 }
1187 _PyUnicode_WSTR(unicode) = wstr;
1188 _PyUnicode_WSTR(unicode)[length] = 0;
1189 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001190 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 return 0;
1192}
1193
Victor Stinnerfe226c02011-10-03 03:52:20 +02001194static PyObject*
1195resize_copy(PyObject *unicode, Py_ssize_t length)
1196{
1197 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001198 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001199 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001200
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001201 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001202
1203 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1204 if (copy == NULL)
1205 return NULL;
1206
1207 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001208 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001209 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001210 }
1211 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001212 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001213
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001214 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001215 if (w == NULL)
1216 return NULL;
1217 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1218 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001219 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001220 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001221 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001222 }
1223}
1224
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001226 Ux0000 terminated; some code (e.g. new_identifier)
1227 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228
1229 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001230 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231
1232*/
1233
Alexander Belopolsky40018472011-02-26 01:02:56 +00001234static PyUnicodeObject *
1235_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001237 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239
Thomas Wouters477c8d52006-05-27 19:21:47 +00001240 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001241 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001242 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 }
1244
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001245 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001246 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001247 return (PyUnicodeObject *)PyErr_NoMemory();
1248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249 if (length < 0) {
1250 PyErr_SetString(PyExc_SystemError,
1251 "Negative size passed to _PyUnicode_New");
1252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 }
1254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001255 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1256 if (unicode == NULL)
1257 return NULL;
1258 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001259
1260 _PyUnicode_WSTR_LENGTH(unicode) = length;
1261 _PyUnicode_HASH(unicode) = -1;
1262 _PyUnicode_STATE(unicode).interned = 0;
1263 _PyUnicode_STATE(unicode).kind = 0;
1264 _PyUnicode_STATE(unicode).compact = 0;
1265 _PyUnicode_STATE(unicode).ready = 0;
1266 _PyUnicode_STATE(unicode).ascii = 0;
1267 _PyUnicode_DATA_ANY(unicode) = NULL;
1268 _PyUnicode_LENGTH(unicode) = 0;
1269 _PyUnicode_UTF8(unicode) = NULL;
1270 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1273 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001274 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001275 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001276 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278
Jeremy Hyltond8082792003-09-16 19:41:39 +00001279 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001280 * the caller fails before initializing str -- unicode_resize()
1281 * reads str[0], and the Keep-Alive optimization can keep memory
1282 * allocated for str alive across a call to unicode_dealloc(unicode).
1283 * We don't want unicode_resize to read uninitialized memory in
1284 * that case.
1285 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001286 _PyUnicode_WSTR(unicode)[0] = 0;
1287 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001288
Victor Stinner7931d9a2011-11-04 00:22:48 +01001289 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001290 return unicode;
1291}
1292
Victor Stinnerf42dc442011-10-02 23:33:16 +02001293static const char*
1294unicode_kind_name(PyObject *unicode)
1295{
Victor Stinner42dfd712011-10-03 14:41:45 +02001296 /* don't check consistency: unicode_kind_name() is called from
1297 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001298 if (!PyUnicode_IS_COMPACT(unicode))
1299 {
1300 if (!PyUnicode_IS_READY(unicode))
1301 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001302 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001303 {
1304 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001305 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001306 return "legacy ascii";
1307 else
1308 return "legacy latin1";
1309 case PyUnicode_2BYTE_KIND:
1310 return "legacy UCS2";
1311 case PyUnicode_4BYTE_KIND:
1312 return "legacy UCS4";
1313 default:
1314 return "<legacy invalid kind>";
1315 }
1316 }
1317 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001318 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001319 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001320 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001321 return "ascii";
1322 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001323 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001324 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001325 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001326 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001327 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001328 default:
1329 return "<invalid compact kind>";
1330 }
1331}
1332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001335const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001336 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001337 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338}
1339
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001340const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001341 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 return _PyUnicode_COMPACT_DATA(unicode);
1343}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001344const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001345 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001346 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1348 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1349 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1350 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1351 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1352 return PyUnicode_DATA(unicode);
1353}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001354
1355void
1356_PyUnicode_Dump(PyObject *op)
1357{
1358 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001359 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1360 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001361 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001362
Victor Stinnera849a4b2011-10-03 12:12:11 +02001363 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001364 {
1365 if (ascii->state.ascii)
1366 data = (ascii + 1);
1367 else
1368 data = (compact + 1);
1369 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001370 else
1371 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001372 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001373
Victor Stinnera849a4b2011-10-03 12:12:11 +02001374 if (ascii->wstr == data)
1375 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001376 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001377
Victor Stinnera3b334d2011-10-03 13:53:37 +02001378 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001379 printf(" (%zu), ", compact->wstr_length);
1380 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001381 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001382 }
1383 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001384 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001385 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001386}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387#endif
1388
Victor Stinner91698d82020-06-25 14:07:40 +02001389static int
1390unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1391{
1392 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1393 // optimized to always use state->empty_string without having to check if
1394 // it is NULL or not.
1395 PyObject *empty = PyUnicode_New(1, 0);
1396 if (empty == NULL) {
1397 return -1;
1398 }
1399 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1400 _PyUnicode_LENGTH(empty) = 0;
1401 assert(_PyUnicode_CheckConsistency(empty, 1));
1402
1403 assert(state->empty_string == NULL);
1404 state->empty_string = empty;
1405 return 0;
1406}
1407
1408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409PyObject *
1410PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1411{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001412 /* Optimization for empty strings */
1413 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001414 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001415 }
1416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 PyObject *obj;
1418 PyCompactUnicodeObject *unicode;
1419 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001420 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001421 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 Py_ssize_t char_size;
1423 Py_ssize_t struct_size;
1424
Victor Stinner9e9d6892011-10-04 01:02:02 +02001425 is_ascii = 0;
1426 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427 struct_size = sizeof(PyCompactUnicodeObject);
1428 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001429 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 char_size = 1;
1431 is_ascii = 1;
1432 struct_size = sizeof(PyASCIIObject);
1433 }
1434 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001435 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436 char_size = 1;
1437 }
1438 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001439 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 char_size = 2;
1441 if (sizeof(wchar_t) == 2)
1442 is_sharing = 1;
1443 }
1444 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001445 if (maxchar > MAX_UNICODE) {
1446 PyErr_SetString(PyExc_SystemError,
1447 "invalid maximum character passed to PyUnicode_New");
1448 return NULL;
1449 }
Victor Stinner8f825062012-04-27 13:55:39 +02001450 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451 char_size = 4;
1452 if (sizeof(wchar_t) == 4)
1453 is_sharing = 1;
1454 }
1455
1456 /* Ensure we won't overflow the size. */
1457 if (size < 0) {
1458 PyErr_SetString(PyExc_SystemError,
1459 "Negative size passed to PyUnicode_New");
1460 return NULL;
1461 }
1462 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1463 return PyErr_NoMemory();
1464
1465 /* Duplicated allocation code from _PyObject_New() instead of a call to
1466 * PyObject_New() so we are able to allocate space for the object and
1467 * it's data buffer.
1468 */
1469 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001470 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001472 }
1473 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474
1475 unicode = (PyCompactUnicodeObject *)obj;
1476 if (is_ascii)
1477 data = ((PyASCIIObject*)obj) + 1;
1478 else
1479 data = unicode + 1;
1480 _PyUnicode_LENGTH(unicode) = size;
1481 _PyUnicode_HASH(unicode) = -1;
1482 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001483 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 _PyUnicode_STATE(unicode).compact = 1;
1485 _PyUnicode_STATE(unicode).ready = 1;
1486 _PyUnicode_STATE(unicode).ascii = is_ascii;
1487 if (is_ascii) {
1488 ((char*)data)[size] = 0;
1489 _PyUnicode_WSTR(unicode) = NULL;
1490 }
Victor Stinner8f825062012-04-27 13:55:39 +02001491 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 ((char*)data)[size] = 0;
1493 _PyUnicode_WSTR(unicode) = NULL;
1494 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001496 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001498 else {
1499 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001500 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001501 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001502 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001503 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504 ((Py_UCS4*)data)[size] = 0;
1505 if (is_sharing) {
1506 _PyUnicode_WSTR_LENGTH(unicode) = size;
1507 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1508 }
1509 else {
1510 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1511 _PyUnicode_WSTR(unicode) = NULL;
1512 }
1513 }
Victor Stinner8f825062012-04-27 13:55:39 +02001514#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001515 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001516#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001517 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518 return obj;
1519}
1520
1521#if SIZEOF_WCHAR_T == 2
1522/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1523 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001524 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525
1526 This function assumes that unicode can hold one more code point than wstr
1527 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001528static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001530 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531{
1532 const wchar_t *iter;
1533 Py_UCS4 *ucs4_out;
1534
Victor Stinner910337b2011-10-03 03:20:16 +02001535 assert(unicode != NULL);
1536 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1538 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1539
1540 for (iter = begin; iter < end; ) {
1541 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1542 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001543 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1544 && (iter+1) < end
1545 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001546 {
Victor Stinner551ac952011-11-29 22:58:13 +01001547 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548 iter += 2;
1549 }
1550 else {
1551 *ucs4_out++ = *iter;
1552 iter++;
1553 }
1554 }
1555 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1556 _PyUnicode_GET_LENGTH(unicode)));
1557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558}
1559#endif
1560
Victor Stinnercd9950f2011-10-02 00:34:53 +02001561static int
Victor Stinner488fa492011-12-12 00:01:39 +01001562unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001563{
Victor Stinner488fa492011-12-12 00:01:39 +01001564 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001565 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001566 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001567 return -1;
1568 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001569 return 0;
1570}
1571
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001572static int
1573_copy_characters(PyObject *to, Py_ssize_t to_start,
1574 PyObject *from, Py_ssize_t from_start,
1575 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001576{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001577 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001578 const void *from_data;
1579 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580
Victor Stinneree4544c2012-05-09 22:24:08 +02001581 assert(0 <= how_many);
1582 assert(0 <= from_start);
1583 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001584 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001585 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001586 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001587
Victor Stinnerd3f08822012-05-29 12:57:52 +02001588 assert(PyUnicode_Check(to));
1589 assert(PyUnicode_IS_READY(to));
1590 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1591
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001592 if (how_many == 0)
1593 return 0;
1594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001596 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001598 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599
Victor Stinnerf1852262012-06-16 16:38:26 +02001600#ifdef Py_DEBUG
1601 if (!check_maxchar
1602 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1603 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001604 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001605 Py_UCS4 ch;
1606 Py_ssize_t i;
1607 for (i=0; i < how_many; i++) {
1608 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1609 assert(ch <= to_maxchar);
1610 }
1611 }
1612#endif
1613
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001614 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001615 if (check_maxchar
1616 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1617 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001618 /* Writing Latin-1 characters into an ASCII string requires to
1619 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001620 Py_UCS4 max_char;
1621 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001622 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001623 if (max_char >= 128)
1624 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001625 }
Christian Heimesf051e432016-09-13 20:22:02 +02001626 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001627 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001628 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001630 else if (from_kind == PyUnicode_1BYTE_KIND
1631 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001632 {
1633 _PyUnicode_CONVERT_BYTES(
1634 Py_UCS1, Py_UCS2,
1635 PyUnicode_1BYTE_DATA(from) + from_start,
1636 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1637 PyUnicode_2BYTE_DATA(to) + to_start
1638 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001639 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001640 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001641 && to_kind == PyUnicode_4BYTE_KIND)
1642 {
1643 _PyUnicode_CONVERT_BYTES(
1644 Py_UCS1, Py_UCS4,
1645 PyUnicode_1BYTE_DATA(from) + from_start,
1646 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1647 PyUnicode_4BYTE_DATA(to) + to_start
1648 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001649 }
1650 else if (from_kind == PyUnicode_2BYTE_KIND
1651 && to_kind == PyUnicode_4BYTE_KIND)
1652 {
1653 _PyUnicode_CONVERT_BYTES(
1654 Py_UCS2, Py_UCS4,
1655 PyUnicode_2BYTE_DATA(from) + from_start,
1656 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1657 PyUnicode_4BYTE_DATA(to) + to_start
1658 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001659 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001660 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001661 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1662
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001663 if (!check_maxchar) {
1664 if (from_kind == PyUnicode_2BYTE_KIND
1665 && to_kind == PyUnicode_1BYTE_KIND)
1666 {
1667 _PyUnicode_CONVERT_BYTES(
1668 Py_UCS2, Py_UCS1,
1669 PyUnicode_2BYTE_DATA(from) + from_start,
1670 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1671 PyUnicode_1BYTE_DATA(to) + to_start
1672 );
1673 }
1674 else if (from_kind == PyUnicode_4BYTE_KIND
1675 && to_kind == PyUnicode_1BYTE_KIND)
1676 {
1677 _PyUnicode_CONVERT_BYTES(
1678 Py_UCS4, Py_UCS1,
1679 PyUnicode_4BYTE_DATA(from) + from_start,
1680 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1681 PyUnicode_1BYTE_DATA(to) + to_start
1682 );
1683 }
1684 else if (from_kind == PyUnicode_4BYTE_KIND
1685 && to_kind == PyUnicode_2BYTE_KIND)
1686 {
1687 _PyUnicode_CONVERT_BYTES(
1688 Py_UCS4, Py_UCS2,
1689 PyUnicode_4BYTE_DATA(from) + from_start,
1690 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1691 PyUnicode_2BYTE_DATA(to) + to_start
1692 );
1693 }
1694 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001695 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001696 }
1697 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001698 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001699 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001700 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001701 Py_ssize_t i;
1702
Victor Stinnera0702ab2011-09-29 14:14:38 +02001703 for (i=0; i < how_many; i++) {
1704 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001705 if (ch > to_maxchar)
1706 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001707 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1708 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001709 }
1710 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001711 return 0;
1712}
1713
Victor Stinnerd3f08822012-05-29 12:57:52 +02001714void
1715_PyUnicode_FastCopyCharacters(
1716 PyObject *to, Py_ssize_t to_start,
1717 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001718{
1719 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1720}
1721
1722Py_ssize_t
1723PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1724 PyObject *from, Py_ssize_t from_start,
1725 Py_ssize_t how_many)
1726{
1727 int err;
1728
1729 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1730 PyErr_BadInternalCall();
1731 return -1;
1732 }
1733
Benjamin Petersonbac79492012-01-14 13:34:47 -05001734 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001735 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001736 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001737 return -1;
1738
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001739 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001740 PyErr_SetString(PyExc_IndexError, "string index out of range");
1741 return -1;
1742 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001743 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001744 PyErr_SetString(PyExc_IndexError, "string index out of range");
1745 return -1;
1746 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001747 if (how_many < 0) {
1748 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1749 return -1;
1750 }
1751 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001752 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1753 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001754 "Cannot write %zi characters at %zi "
1755 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001756 how_many, to_start, PyUnicode_GET_LENGTH(to));
1757 return -1;
1758 }
1759
1760 if (how_many == 0)
1761 return 0;
1762
Victor Stinner488fa492011-12-12 00:01:39 +01001763 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001764 return -1;
1765
1766 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1767 if (err) {
1768 PyErr_Format(PyExc_SystemError,
1769 "Cannot copy %s characters "
1770 "into a string of %s characters",
1771 unicode_kind_name(from),
1772 unicode_kind_name(to));
1773 return -1;
1774 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001775 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776}
1777
Victor Stinner17222162011-09-28 22:15:37 +02001778/* Find the maximum code point and count the number of surrogate pairs so a
1779 correct string length can be computed before converting a string to UCS4.
1780 This function counts single surrogates as a character and not as a pair.
1781
1782 Return 0 on success, or -1 on error. */
1783static int
1784find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1785 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786{
1787 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001788 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789
Victor Stinnerc53be962011-10-02 21:33:54 +02001790 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791 *num_surrogates = 0;
1792 *maxchar = 0;
1793
1794 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001796 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1797 && (iter+1) < end
1798 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1799 {
1800 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1801 ++(*num_surrogates);
1802 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803 }
1804 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001806 {
1807 ch = *iter;
1808 iter++;
1809 }
1810 if (ch > *maxchar) {
1811 *maxchar = ch;
1812 if (*maxchar > MAX_UNICODE) {
1813 PyErr_Format(PyExc_ValueError,
1814 "character U+%x is not in range [U+0000; U+10ffff]",
1815 ch);
1816 return -1;
1817 }
1818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 }
1820 return 0;
1821}
1822
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001823int
1824_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825{
1826 wchar_t *end;
1827 Py_UCS4 maxchar = 0;
1828 Py_ssize_t num_surrogates;
1829#if SIZEOF_WCHAR_T == 2
1830 Py_ssize_t length_wo_surrogates;
1831#endif
1832
Georg Brandl7597add2011-10-05 16:36:47 +02001833 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001834 strings were created using _PyObject_New() and where no canonical
1835 representation (the str field) has been set yet aka strings
1836 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001837 assert(_PyUnicode_CHECK(unicode));
1838 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001840 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001841 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001842 /* Actually, it should neither be interned nor be anything else: */
1843 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001846 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001847 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849
1850 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001851 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1852 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001853 PyErr_NoMemory();
1854 return -1;
1855 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001856 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 _PyUnicode_WSTR(unicode), end,
1858 PyUnicode_1BYTE_DATA(unicode));
1859 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1860 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1861 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1862 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001863 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001864 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001865 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 }
1867 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001868 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001869 _PyUnicode_UTF8(unicode) = NULL;
1870 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871 }
1872 PyObject_FREE(_PyUnicode_WSTR(unicode));
1873 _PyUnicode_WSTR(unicode) = NULL;
1874 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1875 }
1876 /* In this case we might have to convert down from 4-byte native
1877 wchar_t to 2-byte unicode. */
1878 else if (maxchar < 65536) {
1879 assert(num_surrogates == 0 &&
1880 "FindMaxCharAndNumSurrogatePairs() messed up");
1881
Victor Stinner506f5922011-09-28 22:34:18 +02001882#if SIZEOF_WCHAR_T == 2
1883 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001884 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001885 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1886 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1887 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001888 _PyUnicode_UTF8(unicode) = NULL;
1889 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001890#else
1891 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001892 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001893 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001894 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001895 PyErr_NoMemory();
1896 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001897 }
Victor Stinner506f5922011-09-28 22:34:18 +02001898 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1899 _PyUnicode_WSTR(unicode), end,
1900 PyUnicode_2BYTE_DATA(unicode));
1901 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1902 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1903 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001904 _PyUnicode_UTF8(unicode) = NULL;
1905 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001906 PyObject_FREE(_PyUnicode_WSTR(unicode));
1907 _PyUnicode_WSTR(unicode) = NULL;
1908 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1909#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001910 }
1911 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1912 else {
1913#if SIZEOF_WCHAR_T == 2
1914 /* in case the native representation is 2-bytes, we need to allocate a
1915 new normalized 4-byte version. */
1916 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001917 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1918 PyErr_NoMemory();
1919 return -1;
1920 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001921 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1922 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 PyErr_NoMemory();
1924 return -1;
1925 }
1926 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1927 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001928 _PyUnicode_UTF8(unicode) = NULL;
1929 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001930 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1931 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001932 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001933 PyObject_FREE(_PyUnicode_WSTR(unicode));
1934 _PyUnicode_WSTR(unicode) = NULL;
1935 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1936#else
1937 assert(num_surrogates == 0);
1938
Victor Stinnerc3c74152011-10-02 20:39:55 +02001939 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001941 _PyUnicode_UTF8(unicode) = NULL;
1942 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1944#endif
1945 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1946 }
1947 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001948 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949 return 0;
1950}
1951
Alexander Belopolsky40018472011-02-26 01:02:56 +00001952static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001953unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954{
Walter Dörwald16807132007-05-25 13:52:07 +00001955 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001956 case SSTATE_NOT_INTERNED:
1957 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001958
Benjamin Peterson29060642009-01-31 22:14:21 +00001959 case SSTATE_INTERNED_MORTAL:
1960 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001961 Py_SET_REFCNT(unicode, 3);
Victor Stinner607b1022020-05-05 18:50:30 +02001962#ifdef INTERNED_STRINGS
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001963 if (PyDict_DelItem(interned, unicode) != 0) {
1964 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1965 NULL);
1966 }
Victor Stinner607b1022020-05-05 18:50:30 +02001967#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00001968 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001969
Benjamin Peterson29060642009-01-31 22:14:21 +00001970 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001971 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1972 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001973
Benjamin Peterson29060642009-01-31 22:14:21 +00001974 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001975 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001976 }
1977
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001978 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001980 }
1981 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001982 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001983 }
1984 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001985 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001988 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989}
1990
#ifdef Py_DEBUG
/* Debug-only helper.

   Return 1 if `unicode` is one of the shared objects kept in the unicode
   state: either the empty string, or the cached one-character string stored
   in the latin1 table.  Return 0 otherwise.  Objects whose kind is still
   PyUnicode_WCHAR_KIND are never looked up in the latin1 table. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == state->empty_string) {
        return 1;
    }

    /* Only one-character strings with a concrete kind can be in the cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && state->latin1[ch] == unicode) ? 1 : 0;
}
#endif
2010
Alexander Belopolsky40018472011-02-26 01:02:56 +00002011static int
Victor Stinner488fa492011-12-12 00:01:39 +01002012unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002013{
Victor Stinner488fa492011-12-12 00:01:39 +01002014 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002015 if (Py_REFCNT(unicode) != 1)
2016 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002017 if (_PyUnicode_HASH(unicode) != -1)
2018 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002019 if (PyUnicode_CHECK_INTERNED(unicode))
2020 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002021 if (!PyUnicode_CheckExact(unicode))
2022 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002023#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002024 /* singleton refcount is greater than 1 */
2025 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002026#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002027 return 1;
2028}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002029
Victor Stinnerfe226c02011-10-03 03:52:20 +02002030static int
2031unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2032{
2033 PyObject *unicode;
2034 Py_ssize_t old_length;
2035
2036 assert(p_unicode != NULL);
2037 unicode = *p_unicode;
2038
2039 assert(unicode != NULL);
2040 assert(PyUnicode_Check(unicode));
2041 assert(0 <= length);
2042
Victor Stinner910337b2011-10-03 03:20:16 +02002043 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002044 old_length = PyUnicode_WSTR_LENGTH(unicode);
2045 else
2046 old_length = PyUnicode_GET_LENGTH(unicode);
2047 if (old_length == length)
2048 return 0;
2049
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002050 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002051 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002052 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002053 return 0;
2054 }
2055
Victor Stinner488fa492011-12-12 00:01:39 +01002056 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002057 PyObject *copy = resize_copy(unicode, length);
2058 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002060 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002061 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002062 }
2063
Victor Stinnerfe226c02011-10-03 03:52:20 +02002064 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002065 PyObject *new_unicode = resize_compact(unicode, length);
2066 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002067 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002068 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002069 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002070 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002071 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002072}
2073
Alexander Belopolsky40018472011-02-26 01:02:56 +00002074int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002075PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002076{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002077 PyObject *unicode;
2078 if (p_unicode == NULL) {
2079 PyErr_BadInternalCall();
2080 return -1;
2081 }
2082 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002083 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002084 {
2085 PyErr_BadInternalCall();
2086 return -1;
2087 }
2088 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002089}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002090
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002091/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002092
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002093 WARNING: The function doesn't copy the terminating null character and
2094 doesn't check the maximum character (may write a latin1 character in an
2095 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002096static void
2097unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2098 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002099{
2100 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002101 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002102 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002103
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002104 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002105 switch (kind) {
2106 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002107#ifdef Py_DEBUG
2108 if (PyUnicode_IS_ASCII(unicode)) {
2109 Py_UCS4 maxchar = ucs1lib_find_max_char(
2110 (const Py_UCS1*)str,
2111 (const Py_UCS1*)str + len);
2112 assert(maxchar < 128);
2113 }
2114#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002115 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002116 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002117 }
2118 case PyUnicode_2BYTE_KIND: {
2119 Py_UCS2 *start = (Py_UCS2 *)data + index;
2120 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002121
Victor Stinner184252a2012-06-16 02:57:41 +02002122 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002123 *ucs2 = (Py_UCS2)*str;
2124
2125 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002126 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002127 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002128 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002129 Py_UCS4 *start = (Py_UCS4 *)data + index;
2130 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002131
Victor Stinner184252a2012-06-16 02:57:41 +02002132 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002133 *ucs4 = (Py_UCS4)*str;
2134
2135 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002136 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002137 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002138 default:
2139 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002140 }
2141}
2142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002143static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002144get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002146 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002147
Victor Stinner2f9ada92020-06-24 02:22:21 +02002148 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002149 if (unicode) {
2150 Py_INCREF(unicode);
2151 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002152 }
Victor Stinner607b1022020-05-05 18:50:30 +02002153
2154 unicode = PyUnicode_New(1, ch);
2155 if (!unicode) {
2156 return NULL;
2157 }
2158
2159 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2160 assert(_PyUnicode_CheckConsistency(unicode, 1));
2161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002163 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002164 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165}
2166
Victor Stinner985a82a2014-01-03 12:53:47 +01002167static PyObject*
2168unicode_char(Py_UCS4 ch)
2169{
2170 PyObject *unicode;
2171
2172 assert(ch <= MAX_UNICODE);
2173
Victor Stinner2f9ada92020-06-24 02:22:21 +02002174 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002175 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002176 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002177
Victor Stinner985a82a2014-01-03 12:53:47 +01002178 unicode = PyUnicode_New(1, ch);
2179 if (unicode == NULL)
2180 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002181
2182 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2183 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002184 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002185 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002186 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2187 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2188 }
2189 assert(_PyUnicode_CheckConsistency(unicode, 1));
2190 return unicode;
2191}
2192
Alexander Belopolsky40018472011-02-26 01:02:56 +00002193PyObject *
2194PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002196 if (u == NULL)
2197 return (PyObject*)_PyUnicode_New(size);
2198
2199 if (size < 0) {
2200 PyErr_BadInternalCall();
2201 return NULL;
2202 }
2203
2204 return PyUnicode_FromWideChar(u, size);
2205}
2206
2207PyObject *
2208PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2209{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002210 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 Py_UCS4 maxchar = 0;
2212 Py_ssize_t num_surrogates;
2213
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002214 if (u == NULL && size != 0) {
2215 PyErr_BadInternalCall();
2216 return NULL;
2217 }
2218
2219 if (size == -1) {
2220 size = wcslen(u);
2221 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002223 /* If the Unicode data is known at construction time, we can apply
2224 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002227 if (size == 0)
2228 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 /* Single character Unicode objects in the Latin-1 range are
2231 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002232 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 return get_latin1_char((unsigned char)*u);
2234
2235 /* If not empty and not single character, copy the Unicode data
2236 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002237 if (find_maxchar_surrogates(u, u + size,
2238 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 return NULL;
2240
Victor Stinner8faf8212011-12-08 22:14:11 +01002241 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002242 if (!unicode)
2243 return NULL;
2244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 switch (PyUnicode_KIND(unicode)) {
2246 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002247 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2249 break;
2250 case PyUnicode_2BYTE_KIND:
2251#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002252 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002254 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002255 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2256#endif
2257 break;
2258 case PyUnicode_4BYTE_KIND:
2259#if SIZEOF_WCHAR_T == 2
2260 /* This is the only case which has to process surrogates, thus
2261 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002262 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263#else
2264 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002265 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266#endif
2267 break;
2268 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002269 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002272 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273}
2274
Alexander Belopolsky40018472011-02-26 01:02:56 +00002275PyObject *
2276PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002277{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002278 if (size < 0) {
2279 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002280 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002281 return NULL;
2282 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002283 if (u != NULL)
2284 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2285 else
2286 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002287}
2288
Alexander Belopolsky40018472011-02-26 01:02:56 +00002289PyObject *
2290PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002291{
2292 size_t size = strlen(u);
2293 if (size > PY_SSIZE_T_MAX) {
2294 PyErr_SetString(PyExc_OverflowError, "input too long");
2295 return NULL;
2296 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002297 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002298}
2299
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002300PyObject *
2301_PyUnicode_FromId(_Py_Identifier *id)
2302{
Victor Stinner297257f2020-06-02 14:39:45 +02002303 if (id->object) {
2304 return id->object;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002305 }
Victor Stinner297257f2020-06-02 14:39:45 +02002306
2307 PyObject *obj;
2308 obj = PyUnicode_DecodeUTF8Stateful(id->string,
2309 strlen(id->string),
2310 NULL, NULL);
2311 if (!obj) {
2312 return NULL;
2313 }
2314 PyUnicode_InternInPlace(&obj);
2315
2316 assert(!id->next);
2317 id->object = obj;
2318 id->next = static_strings;
2319 static_strings = id;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002320 return id->object;
2321}
2322
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002323static void
2324unicode_clear_static_strings(void)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002325{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002326 _Py_Identifier *tmp, *s = static_strings;
2327 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002328 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002329 tmp = s->next;
2330 s->next = NULL;
2331 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002332 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002333 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002334}
2335
Benjamin Peterson0df54292012-03-26 14:50:32 -04002336/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002337
Victor Stinnerd3f08822012-05-29 12:57:52 +02002338PyObject*
2339_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002340{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002341 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002342 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002343 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002344#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002345 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002346#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002347 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002348 }
Victor Stinner785938e2011-12-11 20:09:03 +01002349 unicode = PyUnicode_New(size, 127);
2350 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002351 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002352 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2353 assert(_PyUnicode_CheckConsistency(unicode, 1));
2354 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002355}
2356
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002357static Py_UCS4
2358kind_maxchar_limit(unsigned int kind)
2359{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002360 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002361 case PyUnicode_1BYTE_KIND:
2362 return 0x80;
2363 case PyUnicode_2BYTE_KIND:
2364 return 0x100;
2365 case PyUnicode_4BYTE_KIND:
2366 return 0x10000;
2367 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002368 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002369 }
2370}
2371
Victor Stinner702c7342011-10-05 13:50:52 +02002372static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002373_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002374{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002376 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002377
Victor Stinner2f9ada92020-06-24 02:22:21 +02002378 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002379 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002380 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002381 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002382 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002383 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002384 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002385
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002386 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002387 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002388 if (!res)
2389 return NULL;
2390 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002391 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002393}
2394
Victor Stinnere57b1c02011-09-28 22:20:48 +02002395static PyObject*
2396_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002397{
2398 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002399 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002400
Serhiy Storchaka678db842013-01-26 12:16:36 +02002401 if (size == 0)
2402 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002403 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002404 if (size == 1)
2405 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002406
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002407 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002408 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 if (!res)
2410 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002411 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002413 else {
2414 _PyUnicode_CONVERT_BYTES(
2415 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2416 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002417 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 return res;
2419}
2420
Victor Stinnere57b1c02011-09-28 22:20:48 +02002421static PyObject*
2422_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423{
2424 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002425 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002426
Serhiy Storchaka678db842013-01-26 12:16:36 +02002427 if (size == 0)
2428 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002429 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002430 if (size == 1)
2431 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002432
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002433 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002434 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 if (!res)
2436 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002437 if (max_char < 256)
2438 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2439 PyUnicode_1BYTE_DATA(res));
2440 else if (max_char < 0x10000)
2441 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2442 PyUnicode_2BYTE_DATA(res));
2443 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002445 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446 return res;
2447}
2448
2449PyObject*
2450PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2451{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002452 if (size < 0) {
2453 PyErr_SetString(PyExc_ValueError, "size must be positive");
2454 return NULL;
2455 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002456 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002458 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002460 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002462 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002463 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002464 PyErr_SetString(PyExc_SystemError, "invalid kind");
2465 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467}
2468
Victor Stinnerece58de2012-04-23 23:36:38 +02002469Py_UCS4
2470_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2471{
2472 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002473 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002474
2475 assert(PyUnicode_IS_READY(unicode));
2476 assert(0 <= start);
2477 assert(end <= PyUnicode_GET_LENGTH(unicode));
2478 assert(start <= end);
2479
2480 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2481 return PyUnicode_MAX_CHAR_VALUE(unicode);
2482
2483 if (start == end)
2484 return 127;
2485
Victor Stinner94d558b2012-04-27 22:26:58 +02002486 if (PyUnicode_IS_ASCII(unicode))
2487 return 127;
2488
Victor Stinnerece58de2012-04-23 23:36:38 +02002489 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002490 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002491 endptr = (char *)startptr + end * kind;
2492 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002493 switch(kind) {
2494 case PyUnicode_1BYTE_KIND:
2495 return ucs1lib_find_max_char(startptr, endptr);
2496 case PyUnicode_2BYTE_KIND:
2497 return ucs2lib_find_max_char(startptr, endptr);
2498 case PyUnicode_4BYTE_KIND:
2499 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002500 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002501 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002502 }
2503}
2504
Victor Stinner25a4b292011-10-06 12:31:55 +02002505/* Ensure that a string uses the most efficient storage, if it is not the
2506 case: create a new string with of the right kind. Write NULL into *p_unicode
2507 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002508static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002509unicode_adjust_maxchar(PyObject **p_unicode)
2510{
2511 PyObject *unicode, *copy;
2512 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002513 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002514 unsigned int kind;
2515
2516 assert(p_unicode != NULL);
2517 unicode = *p_unicode;
2518 assert(PyUnicode_IS_READY(unicode));
2519 if (PyUnicode_IS_ASCII(unicode))
2520 return;
2521
2522 len = PyUnicode_GET_LENGTH(unicode);
2523 kind = PyUnicode_KIND(unicode);
2524 if (kind == PyUnicode_1BYTE_KIND) {
2525 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002526 max_char = ucs1lib_find_max_char(u, u + len);
2527 if (max_char >= 128)
2528 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002529 }
2530 else if (kind == PyUnicode_2BYTE_KIND) {
2531 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002532 max_char = ucs2lib_find_max_char(u, u + len);
2533 if (max_char >= 256)
2534 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002535 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002536 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002537 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002538 max_char = ucs4lib_find_max_char(u, u + len);
2539 if (max_char >= 0x10000)
2540 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002541 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002542 else
2543 Py_UNREACHABLE();
2544
Victor Stinner25a4b292011-10-06 12:31:55 +02002545 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002546 if (copy != NULL)
2547 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002548 Py_DECREF(unicode);
2549 *p_unicode = copy;
2550}
2551
Victor Stinner034f6cf2011-09-30 02:26:44 +02002552PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002553_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002554{
Victor Stinner87af4f22011-11-21 23:03:47 +01002555 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002556 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002557
Victor Stinner034f6cf2011-09-30 02:26:44 +02002558 if (!PyUnicode_Check(unicode)) {
2559 PyErr_BadInternalCall();
2560 return NULL;
2561 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002562 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002563 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002564
Victor Stinner87af4f22011-11-21 23:03:47 +01002565 length = PyUnicode_GET_LENGTH(unicode);
2566 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002567 if (!copy)
2568 return NULL;
2569 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2570
Christian Heimesf051e432016-09-13 20:22:02 +02002571 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002572 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002573 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002574 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002575}
2576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577
Victor Stinnerbc603d12011-10-02 01:00:40 +02002578/* Widen Unicode objects to larger buffers. Don't write terminating null
2579 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002581static void*
2582unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002583{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002584 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002585
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002586 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002587 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002588 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002589 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002590 if (!result)
2591 return PyErr_NoMemory();
2592 assert(skind == PyUnicode_1BYTE_KIND);
2593 _PyUnicode_CONVERT_BYTES(
2594 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002595 (const Py_UCS1 *)data,
2596 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002597 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002599 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002600 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002601 if (!result)
2602 return PyErr_NoMemory();
2603 if (skind == PyUnicode_2BYTE_KIND) {
2604 _PyUnicode_CONVERT_BYTES(
2605 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002606 (const Py_UCS2 *)data,
2607 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002608 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002610 else {
2611 assert(skind == PyUnicode_1BYTE_KIND);
2612 _PyUnicode_CONVERT_BYTES(
2613 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002614 (const Py_UCS1 *)data,
2615 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002616 result);
2617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002619 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002620 Py_UNREACHABLE();
2621 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623}
2624
2625static Py_UCS4*
2626as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2627 int copy_null)
2628{
2629 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002630 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 Py_ssize_t len, targetlen;
2632 if (PyUnicode_READY(string) == -1)
2633 return NULL;
2634 kind = PyUnicode_KIND(string);
2635 data = PyUnicode_DATA(string);
2636 len = PyUnicode_GET_LENGTH(string);
2637 targetlen = len;
2638 if (copy_null)
2639 targetlen++;
2640 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002641 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642 if (!target) {
2643 PyErr_NoMemory();
2644 return NULL;
2645 }
2646 }
2647 else {
2648 if (targetsize < targetlen) {
2649 PyErr_Format(PyExc_SystemError,
2650 "string is longer than the buffer");
2651 if (copy_null && 0 < targetsize)
2652 target[0] = 0;
2653 return NULL;
2654 }
2655 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002656 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002657 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002658 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002659 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002660 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002661 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002662 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2663 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002664 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002665 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002666 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002667 else {
2668 Py_UNREACHABLE();
2669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002670 if (copy_null)
2671 target[len] = 0;
2672 return target;
2673}
2674
2675Py_UCS4*
2676PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2677 int copy_null)
2678{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002679 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 PyErr_BadInternalCall();
2681 return NULL;
2682 }
2683 return as_ucs4(string, target, targetsize, copy_null);
2684}
2685
2686Py_UCS4*
2687PyUnicode_AsUCS4Copy(PyObject *string)
2688{
2689 return as_ucs4(string, NULL, 0, 1);
2690}
2691
/* Size of the scratch buffers used to format %lld or %p output.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG * 53 - 1) / 22))
Victor Stinner96865452011-03-01 23:44:09 +00002696
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002697static int
2698unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2699 Py_ssize_t width, Py_ssize_t precision)
2700{
2701 Py_ssize_t length, fill, arglen;
2702 Py_UCS4 maxchar;
2703
2704 if (PyUnicode_READY(str) == -1)
2705 return -1;
2706
2707 length = PyUnicode_GET_LENGTH(str);
2708 if ((precision == -1 || precision >= length)
2709 && width <= length)
2710 return _PyUnicodeWriter_WriteStr(writer, str);
2711
2712 if (precision != -1)
2713 length = Py_MIN(precision, length);
2714
2715 arglen = Py_MAX(length, width);
2716 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2717 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2718 else
2719 maxchar = writer->maxchar;
2720
2721 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2722 return -1;
2723
2724 if (width > length) {
2725 fill = width - length;
2726 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2727 return -1;
2728 writer->pos += fill;
2729 }
2730
2731 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2732 str, 0, length);
2733 writer->pos += length;
2734 return 0;
2735}
2736
2737static int
Victor Stinner998b8062018-09-12 00:23:25 +02002738unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002739 Py_ssize_t width, Py_ssize_t precision)
2740{
2741 /* UTF-8 */
2742 Py_ssize_t length;
2743 PyObject *unicode;
2744 int res;
2745
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002746 if (precision == -1) {
2747 length = strlen(str);
2748 }
2749 else {
2750 length = 0;
2751 while (length < precision && str[length]) {
2752 length++;
2753 }
2754 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002755 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2756 if (unicode == NULL)
2757 return -1;
2758
2759 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2760 Py_DECREF(unicode);
2761 return res;
2762}
2763
Victor Stinner96865452011-03-01 23:44:09 +00002764static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002765unicode_fromformat_arg(_PyUnicodeWriter *writer,
2766 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002767{
Victor Stinnere215d962012-10-06 23:03:36 +02002768 const char *p;
2769 Py_ssize_t len;
2770 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002771 Py_ssize_t width;
2772 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002773 int longflag;
2774 int longlongflag;
2775 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002776 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002777
2778 p = f;
2779 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002780 zeropad = 0;
2781 if (*f == '0') {
2782 zeropad = 1;
2783 f++;
2784 }
Victor Stinner96865452011-03-01 23:44:09 +00002785
2786 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002787 width = -1;
2788 if (Py_ISDIGIT((unsigned)*f)) {
2789 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002790 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002791 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002792 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002793 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002794 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002795 return NULL;
2796 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002797 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002798 f++;
2799 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002800 }
2801 precision = -1;
2802 if (*f == '.') {
2803 f++;
2804 if (Py_ISDIGIT((unsigned)*f)) {
2805 precision = (*f - '0');
2806 f++;
2807 while (Py_ISDIGIT((unsigned)*f)) {
2808 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2809 PyErr_SetString(PyExc_ValueError,
2810 "precision too big");
2811 return NULL;
2812 }
2813 precision = (precision * 10) + (*f - '0');
2814 f++;
2815 }
2816 }
Victor Stinner96865452011-03-01 23:44:09 +00002817 if (*f == '%') {
2818 /* "%.3%s" => f points to "3" */
2819 f--;
2820 }
2821 }
2822 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002823 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002824 f--;
2825 }
Victor Stinner96865452011-03-01 23:44:09 +00002826
2827 /* Handle %ld, %lu, %lld and %llu. */
2828 longflag = 0;
2829 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002830 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002831 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002832 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002833 longflag = 1;
2834 ++f;
2835 }
Victor Stinner96865452011-03-01 23:44:09 +00002836 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002837 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002838 longlongflag = 1;
2839 f += 2;
2840 }
Victor Stinner96865452011-03-01 23:44:09 +00002841 }
2842 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002843 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002844 size_tflag = 1;
2845 ++f;
2846 }
Victor Stinnere215d962012-10-06 23:03:36 +02002847
2848 if (f[1] == '\0')
2849 writer->overallocate = 0;
2850
2851 switch (*f) {
2852 case 'c':
2853 {
2854 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002855 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002856 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002857 "character argument not in range(0x110000)");
2858 return NULL;
2859 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002860 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002861 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002862 break;
2863 }
2864
2865 case 'i':
2866 case 'd':
2867 case 'u':
2868 case 'x':
2869 {
2870 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002871 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002872 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002873
2874 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002875 if (longflag) {
2876 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2877 }
2878 else if (longlongflag) {
2879 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2880 }
2881 else if (size_tflag) {
2882 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2883 }
2884 else {
2885 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2886 }
Victor Stinnere215d962012-10-06 23:03:36 +02002887 }
2888 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002889 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002890 }
2891 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002892 if (longflag) {
2893 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2894 }
2895 else if (longlongflag) {
2896 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2897 }
2898 else if (size_tflag) {
2899 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2900 }
2901 else {
2902 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2903 }
Victor Stinnere215d962012-10-06 23:03:36 +02002904 }
2905 assert(len >= 0);
2906
Victor Stinnere215d962012-10-06 23:03:36 +02002907 if (precision < len)
2908 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002909
2910 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002911 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2912 return NULL;
2913
Victor Stinnere215d962012-10-06 23:03:36 +02002914 if (width > precision) {
2915 Py_UCS4 fillchar;
2916 fill = width - precision;
2917 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002918 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2919 return NULL;
2920 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002921 }
Victor Stinner15a11362012-10-06 23:48:20 +02002922 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002923 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002924 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2925 return NULL;
2926 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002927 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002928
Victor Stinner4a587072013-11-19 12:54:53 +01002929 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2930 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002931 break;
2932 }
2933
2934 case 'p':
2935 {
2936 char number[MAX_LONG_LONG_CHARS];
2937
2938 len = sprintf(number, "%p", va_arg(*vargs, void*));
2939 assert(len >= 0);
2940
2941 /* %p is ill-defined: ensure leading 0x. */
2942 if (number[1] == 'X')
2943 number[1] = 'x';
2944 else if (number[1] != 'x') {
2945 memmove(number + 2, number,
2946 strlen(number) + 1);
2947 number[0] = '0';
2948 number[1] = 'x';
2949 len += 2;
2950 }
2951
Victor Stinner4a587072013-11-19 12:54:53 +01002952 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002953 return NULL;
2954 break;
2955 }
2956
2957 case 's':
2958 {
2959 /* UTF-8 */
2960 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002961 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002962 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002963 break;
2964 }
2965
2966 case 'U':
2967 {
2968 PyObject *obj = va_arg(*vargs, PyObject *);
2969 assert(obj && _PyUnicode_CHECK(obj));
2970
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002971 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002972 return NULL;
2973 break;
2974 }
2975
2976 case 'V':
2977 {
2978 PyObject *obj = va_arg(*vargs, PyObject *);
2979 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002980 if (obj) {
2981 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002982 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002983 return NULL;
2984 }
2985 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002986 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002987 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002988 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002989 }
2990 break;
2991 }
2992
2993 case 'S':
2994 {
2995 PyObject *obj = va_arg(*vargs, PyObject *);
2996 PyObject *str;
2997 assert(obj);
2998 str = PyObject_Str(obj);
2999 if (!str)
3000 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003001 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003002 Py_DECREF(str);
3003 return NULL;
3004 }
3005 Py_DECREF(str);
3006 break;
3007 }
3008
3009 case 'R':
3010 {
3011 PyObject *obj = va_arg(*vargs, PyObject *);
3012 PyObject *repr;
3013 assert(obj);
3014 repr = PyObject_Repr(obj);
3015 if (!repr)
3016 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003017 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003018 Py_DECREF(repr);
3019 return NULL;
3020 }
3021 Py_DECREF(repr);
3022 break;
3023 }
3024
3025 case 'A':
3026 {
3027 PyObject *obj = va_arg(*vargs, PyObject *);
3028 PyObject *ascii;
3029 assert(obj);
3030 ascii = PyObject_ASCII(obj);
3031 if (!ascii)
3032 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003033 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003034 Py_DECREF(ascii);
3035 return NULL;
3036 }
3037 Py_DECREF(ascii);
3038 break;
3039 }
3040
3041 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003042 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003043 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003044 break;
3045
3046 default:
3047 /* if we stumble upon an unknown formatting code, copy the rest
3048 of the format string to the output string. (we cannot just
3049 skip the code, since there's no way to know what's in the
3050 argument list) */
3051 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003052 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003053 return NULL;
3054 f = p+len;
3055 return f;
3056 }
3057
3058 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003059 return f;
3060}
3061
Walter Dörwaldd2034312007-05-18 16:29:38 +00003062PyObject *
3063PyUnicode_FromFormatV(const char *format, va_list vargs)
3064{
Victor Stinnere215d962012-10-06 23:03:36 +02003065 va_list vargs2;
3066 const char *f;
3067 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003068
Victor Stinner8f674cc2013-04-17 23:02:17 +02003069 _PyUnicodeWriter_Init(&writer);
3070 writer.min_length = strlen(format) + 100;
3071 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003072
Benjamin Peterson0c212142016-09-20 20:39:33 -07003073 // Copy varags to be able to pass a reference to a subfunction.
3074 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003075
3076 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003077 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003078 f = unicode_fromformat_arg(&writer, f, &vargs2);
3079 if (f == NULL)
3080 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003082 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003083 const char *p;
3084 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003085
Victor Stinnere215d962012-10-06 23:03:36 +02003086 p = f;
3087 do
3088 {
3089 if ((unsigned char)*p > 127) {
3090 PyErr_Format(PyExc_ValueError,
3091 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3092 "string, got a non-ASCII byte: 0x%02x",
3093 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003094 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003095 }
3096 p++;
3097 }
3098 while (*p != '\0' && *p != '%');
3099 len = p - f;
3100
3101 if (*p == '\0')
3102 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003103
3104 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003105 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003106
3107 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003108 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003109 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003110 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003111 return _PyUnicodeWriter_Finish(&writer);
3112
3113 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003114 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003115 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003116 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003117}
3118
Walter Dörwaldd2034312007-05-18 16:29:38 +00003119PyObject *
3120PyUnicode_FromFormat(const char *format, ...)
3121{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003122 PyObject* ret;
3123 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003124
3125#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003126 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003127#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003128 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003129#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003130 ret = PyUnicode_FromFormatV(format, vargs);
3131 va_end(vargs);
3132 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003133}
3134
Serhiy Storchakac46db922018-10-23 22:58:24 +03003135static Py_ssize_t
3136unicode_get_widechar_size(PyObject *unicode)
3137{
3138 Py_ssize_t res;
3139
3140 assert(unicode != NULL);
3141 assert(_PyUnicode_CHECK(unicode));
3142
3143 if (_PyUnicode_WSTR(unicode) != NULL) {
3144 return PyUnicode_WSTR_LENGTH(unicode);
3145 }
3146 assert(PyUnicode_IS_READY(unicode));
3147
3148 res = _PyUnicode_LENGTH(unicode);
3149#if SIZEOF_WCHAR_T == 2
3150 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3151 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3152 const Py_UCS4 *end = s + res;
3153 for (; s < end; ++s) {
3154 if (*s > 0xFFFF) {
3155 ++res;
3156 }
3157 }
3158 }
3159#endif
3160 return res;
3161}
3162
3163static void
3164unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3165{
3166 const wchar_t *wstr;
3167
3168 assert(unicode != NULL);
3169 assert(_PyUnicode_CHECK(unicode));
3170
3171 wstr = _PyUnicode_WSTR(unicode);
3172 if (wstr != NULL) {
3173 memcpy(w, wstr, size * sizeof(wchar_t));
3174 return;
3175 }
3176 assert(PyUnicode_IS_READY(unicode));
3177
3178 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3179 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3180 for (; size--; ++s, ++w) {
3181 *w = *s;
3182 }
3183 }
3184 else {
3185#if SIZEOF_WCHAR_T == 4
3186 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3187 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3188 for (; size--; ++s, ++w) {
3189 *w = *s;
3190 }
3191#else
3192 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3193 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3194 for (; size--; ++s, ++w) {
3195 Py_UCS4 ch = *s;
3196 if (ch > 0xFFFF) {
3197 assert(ch <= MAX_UNICODE);
3198 /* encode surrogate pair in this case */
3199 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3200 if (!size--)
3201 break;
3202 *w = Py_UNICODE_LOW_SURROGATE(ch);
3203 }
3204 else {
3205 *w = ch;
3206 }
3207 }
3208#endif
3209 }
3210}
3211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003212#ifdef HAVE_WCHAR_H
3213
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003214/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003215
Victor Stinnerd88d9832011-09-06 02:00:05 +02003216 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003217 character) required to convert the unicode object. Ignore size argument.
3218
Victor Stinnerd88d9832011-09-06 02:00:05 +02003219 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003220 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003221 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003222Py_ssize_t
3223PyUnicode_AsWideChar(PyObject *unicode,
3224 wchar_t *w,
3225 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003226{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003227 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003228
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003229 if (unicode == NULL) {
3230 PyErr_BadInternalCall();
3231 return -1;
3232 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003233 if (!PyUnicode_Check(unicode)) {
3234 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003235 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003236 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003237
3238 res = unicode_get_widechar_size(unicode);
3239 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003240 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003241 }
3242
3243 if (size > res) {
3244 size = res + 1;
3245 }
3246 else {
3247 res = size;
3248 }
3249 unicode_copy_as_widechar(unicode, w, size);
3250 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003251}
3252
Victor Stinner137c34c2010-09-29 10:25:54 +00003253wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003254PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003255 Py_ssize_t *size)
3256{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003257 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003258 Py_ssize_t buflen;
3259
3260 if (unicode == NULL) {
3261 PyErr_BadInternalCall();
3262 return NULL;
3263 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003264 if (!PyUnicode_Check(unicode)) {
3265 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003266 return NULL;
3267 }
3268
Serhiy Storchakac46db922018-10-23 22:58:24 +03003269 buflen = unicode_get_widechar_size(unicode);
3270 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003271 if (buffer == NULL) {
3272 PyErr_NoMemory();
3273 return NULL;
3274 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003275 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3276 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003277 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003278 }
3279 else if (wcslen(buffer) != (size_t)buflen) {
3280 PyMem_FREE(buffer);
3281 PyErr_SetString(PyExc_ValueError,
3282 "embedded null character");
3283 return NULL;
3284 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003285 return buffer;
3286}
3287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003288#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289
Alexander Belopolsky40018472011-02-26 01:02:56 +00003290PyObject *
3291PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003292{
Victor Stinner8faf8212011-12-08 22:14:11 +01003293 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003294 PyErr_SetString(PyExc_ValueError,
3295 "chr() arg not in range(0x110000)");
3296 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003297 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003298
Victor Stinner985a82a2014-01-03 12:53:47 +01003299 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003300}
3301
Alexander Belopolsky40018472011-02-26 01:02:56 +00003302PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003303PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003305 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003306 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003307 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003308 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003309 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 Py_INCREF(obj);
3311 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003312 }
3313 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003314 /* For a Unicode subtype that's not a Unicode object,
3315 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003316 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003317 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003318 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003319 "Can't convert '%.100s' object to str implicitly",
3320 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003321 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003322}
3323
Alexander Belopolsky40018472011-02-26 01:02:56 +00003324PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003325PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003326 const char *encoding,
3327 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003328{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003329 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003330 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003331
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003333 PyErr_BadInternalCall();
3334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003336
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003337 /* Decoding bytes objects is the most common case and should be fast */
3338 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003339 if (PyBytes_GET_SIZE(obj) == 0) {
3340 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3341 return NULL;
3342 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003343 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003344 }
3345 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003346 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3347 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003348 }
3349
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003350 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003351 PyErr_SetString(PyExc_TypeError,
3352 "decoding str is not supported");
3353 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003354 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003355
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003356 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3357 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3358 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003359 "decoding to str: need a bytes-like object, %.80s found",
3360 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003361 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003362 }
Tim Petersced69f82003-09-16 20:30:58 +00003363
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003364 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003365 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003366 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3367 return NULL;
3368 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003369 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003370 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003371
Serhiy Storchaka05997252013-01-26 12:14:02 +02003372 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003373 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003374 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375}
3376
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase into `lower`;
   every run of other characters between two copied characters becomes a
   single '_', and such runs at the start or at the end of the name are
   dropped.  The result is always null-terminated on success.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   which includes the case lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    if (lower_len == 0) {
        /* Not even room for the terminating null character; computing
           &lower[lower_len - 1] below would point before the buffer. */
        return 0;
    }

    char *out = lower;
    char *const out_end = &lower[lower_len - 1];   /* slot kept for '\0' */
    int punct = 0;   /* set while skipping a run of separator characters */

    for (const char *in = encoding; *in != '\0'; in++) {
        const char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            punct = 1;
            continue;
        }

        /* Emit the pending separator, except at the very beginning. */
        if (punct && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        punct = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }
    *out = '\0';
    return 1;
}
3425
Alexander Belopolsky40018472011-02-26 01:02:56 +00003426PyObject *
3427PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003428 Py_ssize_t size,
3429 const char *encoding,
3430 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003431{
3432 PyObject *buffer = NULL, *unicode;
3433 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003434 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3435
Victor Stinner22eb6892019-06-26 00:51:05 +02003436 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3437 return NULL;
3438 }
3439
Victor Stinnered076ed2019-06-26 01:49:32 +02003440 if (size == 0) {
3441 _Py_RETURN_UNICODE_EMPTY();
3442 }
3443
Victor Stinner942889a2016-09-05 15:40:10 -07003444 if (encoding == NULL) {
3445 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3446 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003447
Fred Drakee4315f52000-05-09 19:53:39 +00003448 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003449 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3450 char *lower = buflower;
3451
3452 /* Fast paths */
3453 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3454 lower += 3;
3455 if (*lower == '_') {
3456 /* Match "utf8" and "utf_8" */
3457 lower++;
3458 }
3459
3460 if (lower[0] == '8' && lower[1] == 0) {
3461 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3462 }
3463 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3464 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3465 }
3466 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3467 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3468 }
3469 }
3470 else {
3471 if (strcmp(lower, "ascii") == 0
3472 || strcmp(lower, "us_ascii") == 0) {
3473 return PyUnicode_DecodeASCII(s, size, errors);
3474 }
Steve Dowercc16be82016-09-08 10:35:16 -07003475 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003476 else if (strcmp(lower, "mbcs") == 0) {
3477 return PyUnicode_DecodeMBCS(s, size, errors);
3478 }
3479 #endif
3480 else if (strcmp(lower, "latin1") == 0
3481 || strcmp(lower, "latin_1") == 0
3482 || strcmp(lower, "iso_8859_1") == 0
3483 || strcmp(lower, "iso8859_1") == 0) {
3484 return PyUnicode_DecodeLatin1(s, size, errors);
3485 }
3486 }
Victor Stinner37296e82010-06-10 13:36:23 +00003487 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488
3489 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003490 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003491 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003492 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003493 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 if (buffer == NULL)
3495 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003496 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497 if (unicode == NULL)
3498 goto onError;
3499 if (!PyUnicode_Check(unicode)) {
3500 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003501 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003502 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003503 encoding,
3504 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505 Py_DECREF(unicode);
3506 goto onError;
3507 }
3508 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003509 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003510
Benjamin Peterson29060642009-01-31 22:14:21 +00003511 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003512 Py_XDECREF(buffer);
3513 return NULL;
3514}
3515
Alexander Belopolsky40018472011-02-26 01:02:56 +00003516PyObject *
3517PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003518 const char *encoding,
3519 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003520{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003521 if (!PyUnicode_Check(unicode)) {
3522 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003523 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003524 }
3525
Serhiy Storchaka00939072016-10-27 21:05:49 +03003526 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3527 "PyUnicode_AsDecodedObject() is deprecated; "
3528 "use PyCodec_Decode() to decode from str", 1) < 0)
3529 return NULL;
3530
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003531 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003532 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003533
3534 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003535 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003536}
3537
Alexander Belopolsky40018472011-02-26 01:02:56 +00003538PyObject *
3539PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003540 const char *encoding,
3541 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003542{
3543 PyObject *v;
3544
3545 if (!PyUnicode_Check(unicode)) {
3546 PyErr_BadArgument();
3547 goto onError;
3548 }
3549
Serhiy Storchaka00939072016-10-27 21:05:49 +03003550 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3551 "PyUnicode_AsDecodedUnicode() is deprecated; "
3552 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3553 return NULL;
3554
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003555 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003556 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003557
3558 /* Decode via the codec registry */
3559 v = PyCodec_Decode(unicode, encoding, errors);
3560 if (v == NULL)
3561 goto onError;
3562 if (!PyUnicode_Check(v)) {
3563 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003564 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003565 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003566 encoding,
3567 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003568 Py_DECREF(v);
3569 goto onError;
3570 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003571 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003572
Benjamin Peterson29060642009-01-31 22:14:21 +00003573 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003574 return NULL;
3575}
3576
Alexander Belopolsky40018472011-02-26 01:02:56 +00003577PyObject *
3578PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003579 Py_ssize_t size,
3580 const char *encoding,
3581 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003582{
3583 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003584
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003585 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3589 Py_DECREF(unicode);
3590 return v;
3591}
3592
Alexander Belopolsky40018472011-02-26 01:02:56 +00003593PyObject *
3594PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003595 const char *encoding,
3596 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003597{
3598 PyObject *v;
3599
3600 if (!PyUnicode_Check(unicode)) {
3601 PyErr_BadArgument();
3602 goto onError;
3603 }
3604
Serhiy Storchaka00939072016-10-27 21:05:49 +03003605 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3606 "PyUnicode_AsEncodedObject() is deprecated; "
3607 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3608 "or PyCodec_Encode() for generic encoding", 1) < 0)
3609 return NULL;
3610
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003611 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003612 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003613
3614 /* Encode via the codec registry */
3615 v = PyCodec_Encode(unicode, encoding, errors);
3616 if (v == NULL)
3617 goto onError;
3618 return v;
3619
Benjamin Peterson29060642009-01-31 22:14:21 +00003620 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003621 return NULL;
3622}
3623
Victor Stinner1b579672011-12-17 05:47:23 +01003624
Victor Stinner2cba6b82018-01-10 22:46:15 +01003625static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003626unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003627 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003628{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003629 Py_ssize_t wlen;
3630 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3631 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003632 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003633 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003634
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003635 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003636 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003637 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003638 return NULL;
3639 }
3640
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003641 char *str;
3642 size_t error_pos;
3643 const char *reason;
3644 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003645 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003646 PyMem_Free(wstr);
3647
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003648 if (res != 0) {
3649 if (res == -2) {
3650 PyObject *exc;
3651 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3652 "locale", unicode,
3653 (Py_ssize_t)error_pos,
3654 (Py_ssize_t)(error_pos+1),
3655 reason);
3656 if (exc != NULL) {
3657 PyCodec_StrictErrors(exc);
3658 Py_DECREF(exc);
3659 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003660 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003661 else if (res == -3) {
3662 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3663 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003664 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003665 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003666 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003667 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003668 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003669
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003670 PyObject *bytes = PyBytes_FromString(str);
3671 PyMem_RawFree(str);
3672 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003673}
3674
Victor Stinnerad158722010-10-27 00:25:46 +00003675PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003676PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3677{
Victor Stinner709d23d2019-05-02 14:56:30 -04003678 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3679 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003680}
3681
3682PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003683PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003684{
Victor Stinner81a7be32020-04-14 15:14:01 +02003685 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003686 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3687 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003688 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003689 fs_codec->error_handler,
3690 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003691 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003692#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003693 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003694 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003695 fs_codec->encoding,
3696 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003697 }
Victor Stinnerad158722010-10-27 00:25:46 +00003698#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003699 else {
3700 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3701 machinery is not ready and so cannot be used:
3702 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003703 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3704 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003705 assert(filesystem_errors != NULL);
3706 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3707 assert(errors != _Py_ERROR_UNKNOWN);
3708#ifdef _Py_FORCE_UTF8_FS_ENCODING
3709 return unicode_encode_utf8(unicode, errors, NULL);
3710#else
3711 return unicode_encode_locale(unicode, errors, 0);
3712#endif
3713 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003714}
3715
Alexander Belopolsky40018472011-02-26 01:02:56 +00003716PyObject *
3717PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003718 const char *encoding,
3719 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720{
3721 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003722 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003723
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724 if (!PyUnicode_Check(unicode)) {
3725 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003726 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727 }
Fred Drakee4315f52000-05-09 19:53:39 +00003728
Victor Stinner22eb6892019-06-26 00:51:05 +02003729 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3730 return NULL;
3731 }
3732
Victor Stinner942889a2016-09-05 15:40:10 -07003733 if (encoding == NULL) {
3734 return _PyUnicode_AsUTF8String(unicode, errors);
3735 }
3736
Fred Drakee4315f52000-05-09 19:53:39 +00003737 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003738 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3739 char *lower = buflower;
3740
3741 /* Fast paths */
3742 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3743 lower += 3;
3744 if (*lower == '_') {
3745 /* Match "utf8" and "utf_8" */
3746 lower++;
3747 }
3748
3749 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003750 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003751 }
3752 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3753 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3754 }
3755 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3756 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3757 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003758 }
Victor Stinner942889a2016-09-05 15:40:10 -07003759 else {
3760 if (strcmp(lower, "ascii") == 0
3761 || strcmp(lower, "us_ascii") == 0) {
3762 return _PyUnicode_AsASCIIString(unicode, errors);
3763 }
Steve Dowercc16be82016-09-08 10:35:16 -07003764#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003765 else if (strcmp(lower, "mbcs") == 0) {
3766 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3767 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003768#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003769 else if (strcmp(lower, "latin1") == 0 ||
3770 strcmp(lower, "latin_1") == 0 ||
3771 strcmp(lower, "iso_8859_1") == 0 ||
3772 strcmp(lower, "iso8859_1") == 0) {
3773 return _PyUnicode_AsLatin1String(unicode, errors);
3774 }
3775 }
Victor Stinner37296e82010-06-10 13:36:23 +00003776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777
3778 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003779 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003781 return NULL;
3782
3783 /* The normal path */
3784 if (PyBytes_Check(v))
3785 return v;
3786
3787 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003788 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003789 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003790 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003791
3792 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003793 "encoder %s returned bytearray instead of bytes; "
3794 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003795 encoding);
3796 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003797 Py_DECREF(v);
3798 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003799 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003800
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003801 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3802 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003803 Py_DECREF(v);
3804 return b;
3805 }
3806
3807 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003808 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003809 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003810 encoding,
3811 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003812 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003813 return NULL;
3814}
3815
Alexander Belopolsky40018472011-02-26 01:02:56 +00003816PyObject *
3817PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003818 const char *encoding,
3819 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003820{
3821 PyObject *v;
3822
3823 if (!PyUnicode_Check(unicode)) {
3824 PyErr_BadArgument();
3825 goto onError;
3826 }
3827
Serhiy Storchaka00939072016-10-27 21:05:49 +03003828 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3829 "PyUnicode_AsEncodedUnicode() is deprecated; "
3830 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3831 return NULL;
3832
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003833 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003834 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003835
3836 /* Encode via the codec registry */
3837 v = PyCodec_Encode(unicode, encoding, errors);
3838 if (v == NULL)
3839 goto onError;
3840 if (!PyUnicode_Check(v)) {
3841 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003842 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003843 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003844 encoding,
3845 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003846 Py_DECREF(v);
3847 goto onError;
3848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003850
Benjamin Peterson29060642009-01-31 22:14:21 +00003851 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 return NULL;
3853}
3854
Victor Stinner2cba6b82018-01-10 22:46:15 +01003855static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003856unicode_decode_locale(const char *str, Py_ssize_t len,
3857 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003858{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003859 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3860 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003861 return NULL;
3862 }
3863
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003864 wchar_t *wstr;
3865 size_t wlen;
3866 const char *reason;
3867 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003868 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003869 if (res != 0) {
3870 if (res == -2) {
3871 PyObject *exc;
3872 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3873 "locale", str, len,
3874 (Py_ssize_t)wlen,
3875 (Py_ssize_t)(wlen + 1),
3876 reason);
3877 if (exc != NULL) {
3878 PyCodec_StrictErrors(exc);
3879 Py_DECREF(exc);
3880 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003881 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003882 else if (res == -3) {
3883 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3884 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003885 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003886 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003887 }
Victor Stinner2f197072011-12-17 07:08:30 +01003888 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003889 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003890
3891 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3892 PyMem_RawFree(wstr);
3893 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003894}
3895
3896PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003897PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3898 const char *errors)
3899{
Victor Stinner709d23d2019-05-02 14:56:30 -04003900 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3901 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003902}
3903
3904PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003905PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003906{
3907 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003908 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3909 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003910}
3911
3912
3913PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003914PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003915 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003916 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3917}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003918
Christian Heimes5894ba72007-11-04 11:43:14 +00003919PyObject*
3920PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3921{
Victor Stinner81a7be32020-04-14 15:14:01 +02003922 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003923 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3924 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003925 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003926 fs_codec->error_handler,
3927 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04003928 NULL);
3929 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003930#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003931 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003932 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02003933 fs_codec->encoding,
3934 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003935 }
Victor Stinnerad158722010-10-27 00:25:46 +00003936#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003937 else {
3938 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3939 machinery is not ready and so cannot be used:
3940 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003941 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3942 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003943 assert(filesystem_errors != NULL);
3944 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3945 assert(errors != _Py_ERROR_UNKNOWN);
3946#ifdef _Py_FORCE_UTF8_FS_ENCODING
3947 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3948#else
3949 return unicode_decode_locale(s, size, errors, 0);
3950#endif
3951 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003952}
3953
Martin v. Löwis011e8422009-05-05 04:43:17 +00003954
3955int
3956PyUnicode_FSConverter(PyObject* arg, void* addr)
3957{
Brett Cannonec6ce872016-09-06 15:50:29 -07003958 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003959 PyObject *output = NULL;
3960 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03003961 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003962 if (arg == NULL) {
3963 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003964 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003965 return 1;
3966 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003967 path = PyOS_FSPath(arg);
3968 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003969 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003970 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003971 if (PyBytes_Check(path)) {
3972 output = path;
3973 }
3974 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3975 output = PyUnicode_EncodeFSDefault(path);
3976 Py_DECREF(path);
3977 if (!output) {
3978 return 0;
3979 }
3980 assert(PyBytes_Check(output));
3981 }
3982
Victor Stinner0ea2a462010-04-30 00:22:08 +00003983 size = PyBytes_GET_SIZE(output);
3984 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003985 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003986 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003987 Py_DECREF(output);
3988 return 0;
3989 }
3990 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003991 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003992}
3993
3994
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003995int
3996PyUnicode_FSDecoder(PyObject* arg, void* addr)
3997{
Brett Cannona5711202016-09-06 19:36:01 -07003998 int is_buffer = 0;
3999 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004000 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004001 if (arg == NULL) {
4002 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004003 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004004 return 1;
4005 }
Brett Cannona5711202016-09-06 19:36:01 -07004006
4007 is_buffer = PyObject_CheckBuffer(arg);
4008 if (!is_buffer) {
4009 path = PyOS_FSPath(arg);
4010 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004011 return 0;
4012 }
Brett Cannona5711202016-09-06 19:36:01 -07004013 }
4014 else {
4015 path = arg;
4016 Py_INCREF(arg);
4017 }
4018
4019 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004020 output = path;
4021 }
4022 else if (PyBytes_Check(path) || is_buffer) {
4023 PyObject *path_bytes = NULL;
4024
4025 if (!PyBytes_Check(path) &&
4026 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004027 "path should be string, bytes, or os.PathLike, not %.200s",
4028 Py_TYPE(arg)->tp_name)) {
4029 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004030 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004031 }
4032 path_bytes = PyBytes_FromObject(path);
4033 Py_DECREF(path);
4034 if (!path_bytes) {
4035 return 0;
4036 }
4037 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4038 PyBytes_GET_SIZE(path_bytes));
4039 Py_DECREF(path_bytes);
4040 if (!output) {
4041 return 0;
4042 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004043 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004044 else {
4045 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004046 "path should be string, bytes, or os.PathLike, not %.200s",
4047 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004048 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004049 return 0;
4050 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004051 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004052 Py_DECREF(output);
4053 return 0;
4054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004056 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004057 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004058 Py_DECREF(output);
4059 return 0;
4060 }
4061 *(PyObject**)addr = output;
4062 return Py_CLEANUP_SUPPORTED;
4063}
4064
4065
Inada Naoki02a4d572020-02-27 13:48:59 +09004066static int unicode_fill_utf8(PyObject *unicode);
4067
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004068const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004070{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004071 if (!PyUnicode_Check(unicode)) {
4072 PyErr_BadArgument();
4073 return NULL;
4074 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004075 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004076 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004077
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004078 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004079 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080 return NULL;
4081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 }
4083
4084 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004085 *psize = PyUnicode_UTF8_LENGTH(unicode);
4086 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004087}
4088
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004089const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004090PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004091{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4093}
4094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004095Py_UNICODE *
4096PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4097{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004098 if (!PyUnicode_Check(unicode)) {
4099 PyErr_BadArgument();
4100 return NULL;
4101 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004102 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4103 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004104 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004105 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004106 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107
Serhiy Storchakac46db922018-10-23 22:58:24 +03004108 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4109 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4110 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004111 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004112 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004113 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4114 if (w == NULL) {
4115 PyErr_NoMemory();
4116 return NULL;
4117 }
4118 unicode_copy_as_widechar(unicode, w, wlen + 1);
4119 _PyUnicode_WSTR(unicode) = w;
4120 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4121 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004122 }
4123 }
4124 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004125 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004126 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004127}
4128
Inada Naoki2c4928d2020-06-17 20:09:44 +09004129/* Deprecated APIs */
4130
4131_Py_COMP_DIAG_PUSH
4132_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4133
Alexander Belopolsky40018472011-02-26 01:02:56 +00004134Py_UNICODE *
4135PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004137 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138}
4139
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004140const Py_UNICODE *
4141_PyUnicode_AsUnicode(PyObject *unicode)
4142{
4143 Py_ssize_t size;
4144 const Py_UNICODE *wstr;
4145
4146 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4147 if (wstr && wcslen(wstr) != (size_t)size) {
4148 PyErr_SetString(PyExc_ValueError, "embedded null character");
4149 return NULL;
4150 }
4151 return wstr;
4152}
4153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004154
Alexander Belopolsky40018472011-02-26 01:02:56 +00004155Py_ssize_t
4156PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157{
4158 if (!PyUnicode_Check(unicode)) {
4159 PyErr_BadArgument();
4160 goto onError;
4161 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004162 if (_PyUnicode_WSTR(unicode) == NULL) {
4163 if (PyUnicode_AsUnicode(unicode) == NULL)
4164 goto onError;
4165 }
4166 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167
Benjamin Peterson29060642009-01-31 22:14:21 +00004168 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 return -1;
4170}
4171
Inada Naoki2c4928d2020-06-17 20:09:44 +09004172_Py_COMP_DIAG_POP
4173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004174Py_ssize_t
4175PyUnicode_GetLength(PyObject *unicode)
4176{
Victor Stinner07621332012-06-16 04:53:46 +02004177 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004178 PyErr_BadArgument();
4179 return -1;
4180 }
Victor Stinner07621332012-06-16 04:53:46 +02004181 if (PyUnicode_READY(unicode) == -1)
4182 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004183 return PyUnicode_GET_LENGTH(unicode);
4184}
4185
4186Py_UCS4
4187PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4188{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004189 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004190 int kind;
4191
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004192 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004193 PyErr_BadArgument();
4194 return (Py_UCS4)-1;
4195 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004196 if (PyUnicode_READY(unicode) == -1) {
4197 return (Py_UCS4)-1;
4198 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004199 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004200 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004201 return (Py_UCS4)-1;
4202 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004203 data = PyUnicode_DATA(unicode);
4204 kind = PyUnicode_KIND(unicode);
4205 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004206}
4207
4208int
4209PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4210{
4211 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004212 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004213 return -1;
4214 }
Victor Stinner488fa492011-12-12 00:01:39 +01004215 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004216 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004217 PyErr_SetString(PyExc_IndexError, "string index out of range");
4218 return -1;
4219 }
Victor Stinner488fa492011-12-12 00:01:39 +01004220 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004221 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004222 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4223 PyErr_SetString(PyExc_ValueError, "character out of range");
4224 return -1;
4225 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004226 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4227 index, ch);
4228 return 0;
4229}
4230
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4236
Victor Stinner554f3f02010-06-16 23:33:54 +00004237/* create or adjust a UnicodeDecodeError */
4238static void
4239make_decode_exception(PyObject **exceptionObject,
4240 const char *encoding,
4241 const char *input, Py_ssize_t length,
4242 Py_ssize_t startpos, Py_ssize_t endpos,
4243 const char *reason)
4244{
4245 if (*exceptionObject == NULL) {
4246 *exceptionObject = PyUnicodeDecodeError_Create(
4247 encoding, input, length, startpos, endpos, reason);
4248 }
4249 else {
4250 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4251 goto onError;
4252 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4253 goto onError;
4254 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4255 goto onError;
4256 }
4257 return;
4258
4259onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004260 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004261}
4262
Steve Dowercc16be82016-09-08 10:35:16 -07004263#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004264static int
4265widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4266{
4267 if (newsize > *size) {
4268 wchar_t *newbuf = *buf;
4269 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4270 PyErr_NoMemory();
4271 return -1;
4272 }
4273 *buf = newbuf;
4274 }
4275 *size = newsize;
4276 return 0;
4277}
4278
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279/* error handling callback helper:
4280 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004281 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 and adjust various state variables.
4283 return 0 on success, -1 on error
4284*/
4285
Alexander Belopolsky40018472011-02-26 01:02:56 +00004286static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004287unicode_decode_call_errorhandler_wchar(
4288 const char *errors, PyObject **errorHandler,
4289 const char *encoding, const char *reason,
4290 const char **input, const char **inend, Py_ssize_t *startinpos,
4291 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004292 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004294 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295
4296 PyObject *restuple = NULL;
4297 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004298 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004299 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004300 Py_ssize_t requiredsize;
4301 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004302 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004303 wchar_t *repwstr;
4304 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004305
4306 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004307 *errorHandler = PyCodec_LookupError(errors);
4308 if (*errorHandler == NULL)
4309 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 }
4311
Victor Stinner554f3f02010-06-16 23:33:54 +00004312 make_decode_exception(exceptionObject,
4313 encoding,
4314 *input, *inend - *input,
4315 *startinpos, *endinpos,
4316 reason);
4317 if (*exceptionObject == NULL)
4318 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319
Petr Viktorinffd97532020-02-11 17:46:57 +01004320 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004322 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004323 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004324 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004325 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004326 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004327 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004328 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004329
4330 /* Copy back the bytes variables, which might have been modified by the
4331 callback */
4332 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4333 if (!inputobj)
4334 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004335 *input = PyBytes_AS_STRING(inputobj);
4336 insize = PyBytes_GET_SIZE(inputobj);
4337 *inend = *input + insize;
4338 /* we can DECREF safely, as the exception has another reference,
4339 so the object won't go away. */
4340 Py_DECREF(inputobj);
4341
4342 if (newpos<0)
4343 newpos = insize+newpos;
4344 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004345 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346 goto onError;
4347 }
4348
4349 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4350 if (repwstr == NULL)
4351 goto onError;
4352 /* need more space? (at least enough for what we
4353 have+the replacement+the rest of the string (starting
4354 at the new input position), so we won't have to check space
4355 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004356 requiredsize = *outpos;
4357 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4358 goto overflow;
4359 requiredsize += repwlen;
4360 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4361 goto overflow;
4362 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004363 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004364 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004365 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004366 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004367 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004368 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004369 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004370 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004371 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004372 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004373 *endinpos = newpos;
4374 *inptr = *input + newpos;
4375
4376 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004377 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004378 return 0;
4379
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004380 overflow:
4381 PyErr_SetString(PyExc_OverflowError,
4382 "decoded result is too long for a Python string");
4383
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004384 onError:
4385 Py_XDECREF(restuple);
4386 return -1;
4387}
Steve Dowercc16be82016-09-08 10:35:16 -07004388#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004389
4390static int
4391unicode_decode_call_errorhandler_writer(
4392 const char *errors, PyObject **errorHandler,
4393 const char *encoding, const char *reason,
4394 const char **input, const char **inend, Py_ssize_t *startinpos,
4395 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4396 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4397{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004398 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004399
4400 PyObject *restuple = NULL;
4401 PyObject *repunicode = NULL;
4402 Py_ssize_t insize;
4403 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004404 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004405 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004406 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004407 int need_to_grow = 0;
4408 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004409
4410 if (*errorHandler == NULL) {
4411 *errorHandler = PyCodec_LookupError(errors);
4412 if (*errorHandler == NULL)
4413 goto onError;
4414 }
4415
4416 make_decode_exception(exceptionObject,
4417 encoding,
4418 *input, *inend - *input,
4419 *startinpos, *endinpos,
4420 reason);
4421 if (*exceptionObject == NULL)
4422 goto onError;
4423
Petr Viktorinffd97532020-02-11 17:46:57 +01004424 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004425 if (restuple == NULL)
4426 goto onError;
4427 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004428 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004429 goto onError;
4430 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004431 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004432 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004433
4434 /* Copy back the bytes variables, which might have been modified by the
4435 callback */
4436 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4437 if (!inputobj)
4438 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004439 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004440 *input = PyBytes_AS_STRING(inputobj);
4441 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004442 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004443 /* we can DECREF safely, as the exception has another reference,
4444 so the object won't go away. */
4445 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004446
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004449 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004450 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004451 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004452 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453
Victor Stinner170ca6f2013-04-18 00:25:28 +02004454 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004455 if (replen > 1) {
4456 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004457 need_to_grow = 1;
4458 }
4459 new_inptr = *input + newpos;
4460 if (*inend - new_inptr > remain) {
4461 /* We don't know the decoding algorithm here so we make the worst
4462 assumption that one byte decodes to one unicode character.
4463 If unfortunately one byte could decode to more unicode characters,
4464 the decoder may write out-of-bound then. Is it possible for the
4465 algorithms using this function? */
4466 writer->min_length += *inend - new_inptr - remain;
4467 need_to_grow = 1;
4468 }
4469 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004470 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004471 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004472 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4473 goto onError;
4474 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004475 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004476 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004477
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004479 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004480
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004482 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004483 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004484
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004487 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004488}
4489
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490/* --- UTF-7 Codec -------------------------------------------------------- */
4491
Antoine Pitrou244651a2009-05-04 18:56:13 +00004492/* See RFC2152 for details. We encode conservatively and decode liberally. */
4493
/* Three small helper macros for the base-64 alphabet used by UTF-7.
   Each of them may evaluate its argument more than once. */

/* True if c is one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' ||                          \
     (c) == '/')

/* Map a base-64 character to its 6-bit value.  The argument is expected
   to satisfy IS_BASE64(); anything else maps to 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])
4516
/* DECODE_DIRECT: true if a byte found in UTF-7 input outside a shift
 * sequence stands for itself.  Decoding is permissive: every value up to
 * 127 qualifies except '+', which opens a base64 section.  The argument
 * may be evaluated more than once. */

#define DECODE_DIRECT(c) \
    ((c) < 128 && (c) != '+')
4524
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code (0-127),
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4558
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        : character code; only 1..127 can ever be encoded directly
 *   directO  : non-zero to emit "Set O" characters (category 1) as-is
 *   directWS : non-zero to emit whitespace (category 2) as-is
 *
 * "Set D" characters (category 0) are always encoded directly.
 * All three arguments are parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) can be passed safely; c is evaluated more
 * than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004570
Alexander Belopolsky40018472011-02-26 01:02:56 +00004571PyObject *
4572PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004573 Py_ssize_t size,
4574 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004575{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004576 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4577}
4578
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579/* The decoder. The only state we preserve is our read position,
4580 * i.e. how many characters we have consumed. So if we end in the
4581 * middle of a shift sequence we have to back off the read position
4582 * and the output to the beginning of the sequence, otherwise we lose
4583 * all the shift state (seen bits, number of bits seen, high
4584 * surrogate). */
4585
Alexander Belopolsky40018472011-02-26 01:02:56 +00004586PyObject *
4587PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004588 Py_ssize_t size,
4589 const char *errors,
4590 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004591{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004593 Py_ssize_t startinpos;
4594 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004595 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004596 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004597 const char *errmsg = "";
4598 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004599 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 unsigned int base64bits = 0;
4601 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004602 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603 PyObject *errorHandler = NULL;
4604 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004605
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004606 if (size == 0) {
4607 if (consumed)
4608 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004609 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004610 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004611
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004612 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004613 _PyUnicodeWriter_Init(&writer);
4614 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004615
4616 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004617 e = s + size;
4618
4619 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004620 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004621 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004622 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004623
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 if (inShift) { /* in a base-64 section */
4625 if (IS_BASE64(ch)) { /* consume a base-64 character */
4626 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4627 base64bits += 6;
4628 s++;
4629 if (base64bits >= 16) {
4630 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004631 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004632 base64bits -= 16;
4633 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004634 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004635 if (surrogate) {
4636 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004637 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4638 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004639 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004640 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004641 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004642 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004643 }
4644 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004645 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004646 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004647 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004648 }
4649 }
Victor Stinner551ac952011-11-29 22:58:13 +01004650 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004651 /* first surrogate */
4652 surrogate = outCh;
4653 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004654 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004655 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004656 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657 }
4658 }
4659 }
4660 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004661 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004662 if (base64bits > 0) { /* left-over bits */
4663 if (base64bits >= 6) {
4664 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004665 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004666 errmsg = "partial character in shift sequence";
4667 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669 else {
4670 /* Some bits remain; they should be zero */
4671 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004672 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 errmsg = "non-zero padding bits in shift sequence";
4674 goto utf7Error;
4675 }
4676 }
4677 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004678 if (surrogate && DECODE_DIRECT(ch)) {
4679 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4680 goto onError;
4681 }
4682 surrogate = 0;
4683 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004684 /* '-' is absorbed; other terminating
4685 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004686 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004687 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004688 }
4689 }
4690 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004692 s++; /* consume '+' */
4693 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004694 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004695 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004696 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004698 else if (s < e && !IS_BASE64(*s)) {
4699 s++;
4700 errmsg = "ill-formed sequence";
4701 goto utf7Error;
4702 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004703 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004704 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004705 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004706 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004707 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004708 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004709 }
4710 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004711 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004712 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004713 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004714 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004715 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004716 else {
4717 startinpos = s-starts;
4718 s++;
4719 errmsg = "unexpected special character";
4720 goto utf7Error;
4721 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004722 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004723utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004724 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004725 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004726 errors, &errorHandler,
4727 "utf7", errmsg,
4728 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004729 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004731 }
4732
Antoine Pitrou244651a2009-05-04 18:56:13 +00004733 /* end of string */
4734
4735 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4736 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004737 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004738 if (surrogate ||
4739 (base64bits >= 6) ||
4740 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004741 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004742 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004743 errors, &errorHandler,
4744 "utf7", "unterminated shift sequence",
4745 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004746 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004747 goto onError;
4748 if (s < e)
4749 goto restart;
4750 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004752
4753 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004754 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004755 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004756 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004757 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004758 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004759 writer.kind, writer.data, shiftOutStart);
4760 Py_XDECREF(errorHandler);
4761 Py_XDECREF(exc);
4762 _PyUnicodeWriter_Dealloc(&writer);
4763 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004764 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004765 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004766 }
4767 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004768 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004769 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004770 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004771
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772 Py_XDECREF(errorHandler);
4773 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004774 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004775
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 Py_XDECREF(errorHandler);
4778 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004779 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004780 return NULL;
4781}
4782
4783
Alexander Belopolsky40018472011-02-26 01:02:56 +00004784PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004785_PyUnicode_EncodeUTF7(PyObject *str,
4786 int base64SetO,
4787 int base64WhiteSpace,
4788 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004789{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004790 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004791 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004792 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004793 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004794 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004795 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004796 unsigned int base64bits = 0;
4797 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004798 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004799 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004800
Benjamin Petersonbac79492012-01-14 13:34:47 -05004801 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004802 return NULL;
4803 kind = PyUnicode_KIND(str);
4804 data = PyUnicode_DATA(str);
4805 len = PyUnicode_GET_LENGTH(str);
4806
4807 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004808 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004809
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004810 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004811 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004812 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004813 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004814 if (v == NULL)
4815 return NULL;
4816
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004817 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004818 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004819 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004820
Antoine Pitrou244651a2009-05-04 18:56:13 +00004821 if (inShift) {
4822 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4823 /* shifting out */
4824 if (base64bits) { /* output remaining bits */
4825 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4826 base64buffer = 0;
4827 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004828 }
4829 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004830 /* Characters not in the BASE64 set implicitly unshift the sequence
4831 so no '-' is required, except if the character is itself a '-' */
4832 if (IS_BASE64(ch) || ch == '-') {
4833 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004834 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004835 *out++ = (char) ch;
4836 }
4837 else {
4838 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004839 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004840 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004841 else { /* not in a shift sequence */
4842 if (ch == '+') {
4843 *out++ = '+';
4844 *out++ = '-';
4845 }
4846 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4847 *out++ = (char) ch;
4848 }
4849 else {
4850 *out++ = '+';
4851 inShift = 1;
4852 goto encode_char;
4853 }
4854 }
4855 continue;
4856encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004857 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004858 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004859
Antoine Pitrou244651a2009-05-04 18:56:13 +00004860 /* code first surrogate */
4861 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004862 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004863 while (base64bits >= 6) {
4864 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4865 base64bits -= 6;
4866 }
4867 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004868 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004869 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004870 base64bits += 16;
4871 base64buffer = (base64buffer << 16) | ch;
4872 while (base64bits >= 6) {
4873 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4874 base64bits -= 6;
4875 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004876 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004877 if (base64bits)
4878 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4879 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004880 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004881 if (_PyBytes_Resize(&v, out - start) < 0)
4882 return NULL;
4883 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004884}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004885PyObject *
4886PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4887 Py_ssize_t size,
4888 int base64SetO,
4889 int base64WhiteSpace,
4890 const char *errors)
4891{
4892 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004893 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004894 if (tmp == NULL)
4895 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004896 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004897 base64WhiteSpace, errors);
4898 Py_DECREF(tmp);
4899 return result;
4900}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004901
Antoine Pitrou244651a2009-05-04 18:56:13 +00004902#undef IS_BASE64
4903#undef FROM_BASE64
4904#undef TO_BASE64
4905#undef DECODE_DIRECT
4906#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004907
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908/* --- UTF-8 Codec -------------------------------------------------------- */
4909
Alexander Belopolsky40018472011-02-26 01:02:56 +00004910PyObject *
4911PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004912 Py_ssize_t size,
4913 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914{
Walter Dörwald69652032004-09-07 20:24:22 +00004915 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4916}
4917
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918#include "stringlib/asciilib.h"
4919#include "stringlib/codecs.h"
4920#include "stringlib/undef.h"
4921
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004922#include "stringlib/ucs1lib.h"
4923#include "stringlib/codecs.h"
4924#include "stringlib/undef.h"
4925
4926#include "stringlib/ucs2lib.h"
4927#include "stringlib/codecs.h"
4928#include "stringlib/undef.h"
4929
4930#include "stringlib/ucs4lib.h"
4931#include "stringlib/codecs.h"
4932#include "stringlib/undef.h"
4933
Antoine Pitrouab868312009-01-10 15:40:25 +00004934/* Mask to quickly check whether a C 'long' contains a
4935 non-ASCII, UTF8-encoded char. */
4936#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004937# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004938#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004939# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004940#else
4941# error C 'long' size should be either 4 or 8!
4942#endif
4943
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004944static Py_ssize_t
4945ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004946{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004947 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004948 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004949
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004950 /*
4951 * Issue #17237: m68k is a bit different from most architectures in
4952 * that objects do not use "natural alignment" - for example, int and
4953 * long are only aligned at 2-byte boundaries. Therefore the assert()
4954 * won't work; also, tests have shown that skipping the "optimised
4955 * version" will even speed up m68k.
4956 */
4957#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004958#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004959 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4960 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961 /* Fast path, see in STRINGLIB(utf8_decode) for
4962 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004963 /* Help allocation */
4964 const char *_p = p;
4965 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004966 while (_p < aligned_end) {
4967 unsigned long value = *(const unsigned long *) _p;
4968 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004970 *((unsigned long *)q) = value;
4971 _p += SIZEOF_LONG;
4972 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004973 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004974 p = _p;
4975 while (p < end) {
4976 if ((unsigned char)*p & 0x80)
4977 break;
4978 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004980 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004983#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984 while (p < end) {
4985 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4986 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004987 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004988 /* Help allocation */
4989 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004990 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004991 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004992 if (value & ASCII_CHAR_MASK)
4993 break;
4994 _p += SIZEOF_LONG;
4995 }
4996 p = _p;
4997 if (_p == end)
4998 break;
4999 }
5000 if ((unsigned char)*p & 0x80)
5001 break;
5002 ++p;
5003 }
5004 memcpy(dest, start, p - start);
5005 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006}
Antoine Pitrouab868312009-01-10 15:40:25 +00005007
Victor Stinner709d23d2019-05-02 14:56:30 -04005008static PyObject *
5009unicode_decode_utf8(const char *s, Py_ssize_t size,
5010 _Py_error_handler error_handler, const char *errors,
5011 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005012{
Victor Stinner785938e2011-12-11 20:09:03 +01005013 if (size == 0) {
5014 if (consumed)
5015 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005016 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005017 }
5018
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005019 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5020 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005021 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005022 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005023 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005025 }
5026
Inada Naoki770847a2019-06-24 12:30:24 +09005027 const char *starts = s;
5028 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005029
Inada Naoki770847a2019-06-24 12:30:24 +09005030 // fast path: try ASCII string.
5031 PyObject *u = PyUnicode_New(size, 127);
5032 if (u == NULL) {
5033 return NULL;
5034 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005035 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005036 if (s == end) {
5037 return u;
5038 }
5039
5040 // Use _PyUnicodeWriter after fast path is failed.
5041 _PyUnicodeWriter writer;
5042 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5043 writer.pos = s - starts;
5044
5045 Py_ssize_t startinpos, endinpos;
5046 const char *errmsg = "";
5047 PyObject *error_handler_obj = NULL;
5048 PyObject *exc = NULL;
5049
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005050 while (s < end) {
5051 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005052 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005053
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005054 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005055 if (PyUnicode_IS_ASCII(writer.buffer))
5056 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005057 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005058 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005059 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005060 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005061 } else {
5062 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005063 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005064 }
5065
5066 switch (ch) {
5067 case 0:
5068 if (s == end || consumed)
5069 goto End;
5070 errmsg = "unexpected end of data";
5071 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005072 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005073 break;
5074 case 1:
5075 errmsg = "invalid start byte";
5076 startinpos = s - starts;
5077 endinpos = startinpos + 1;
5078 break;
5079 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005080 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5081 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5082 {
5083 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005084 goto End;
5085 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005086 /* fall through */
5087 case 3:
5088 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005089 errmsg = "invalid continuation byte";
5090 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005091 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005092 break;
5093 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005094 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005095 goto onError;
5096 continue;
5097 }
5098
Victor Stinner1d65d912015-10-05 13:43:50 +02005099 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005100 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005101
5102 switch (error_handler) {
5103 case _Py_ERROR_IGNORE:
5104 s += (endinpos - startinpos);
5105 break;
5106
5107 case _Py_ERROR_REPLACE:
5108 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5109 goto onError;
5110 s += (endinpos - startinpos);
5111 break;
5112
5113 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005114 {
5115 Py_ssize_t i;
5116
Victor Stinner1d65d912015-10-05 13:43:50 +02005117 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5118 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005119 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005120 ch = (Py_UCS4)(unsigned char)(starts[i]);
5121 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5122 ch + 0xdc00);
5123 writer.pos++;
5124 }
5125 s += (endinpos - startinpos);
5126 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005127 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005128
5129 default:
5130 if (unicode_decode_call_errorhandler_writer(
5131 errors, &error_handler_obj,
5132 "utf-8", errmsg,
5133 &starts, &end, &startinpos, &endinpos, &exc, &s,
5134 &writer))
5135 goto onError;
5136 }
Victor Stinner785938e2011-12-11 20:09:03 +01005137 }
5138
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005139End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005140 if (consumed)
5141 *consumed = s - starts;
5142
Victor Stinner1d65d912015-10-05 13:43:50 +02005143 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005144 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005145 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005146
5147onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005148 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005149 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005150 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005151 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005152}
5153
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005154
Victor Stinner709d23d2019-05-02 14:56:30 -04005155PyObject *
5156PyUnicode_DecodeUTF8Stateful(const char *s,
5157 Py_ssize_t size,
5158 const char *errors,
5159 Py_ssize_t *consumed)
5160{
5161 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5162}
5163
5164
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005165/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5166 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005167
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005168 On success, write a pointer to a newly allocated wide character string into
5169 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5170 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005171
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005172 On memory allocation failure, return -1.
5173
5174 On decoding error (if surrogateescape is zero), return -2. If wlen is
5175 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5176 is not NULL, write the decoding error message into *reason. */
5177int
5178_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005179 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005180{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005181 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005182 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005183 wchar_t *unicode;
5184 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005185
Victor Stinner3d4226a2018-08-29 22:21:32 +02005186 int surrogateescape = 0;
5187 int surrogatepass = 0;
5188 switch (errors)
5189 {
5190 case _Py_ERROR_STRICT:
5191 break;
5192 case _Py_ERROR_SURROGATEESCAPE:
5193 surrogateescape = 1;
5194 break;
5195 case _Py_ERROR_SURROGATEPASS:
5196 surrogatepass = 1;
5197 break;
5198 default:
5199 return -3;
5200 }
5201
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005202 /* Note: size will always be longer than the resulting Unicode
5203 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005204 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005205 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005206 }
5207
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005208 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005209 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005210 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005211 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005212
5213 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005214 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005215 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005216 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005217 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005218#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005219 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005220#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005221 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005222#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005223 if (ch > 0xFF) {
5224#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005225 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005226#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005227 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005228 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005229 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5230 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5231#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005232 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005233 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005234 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005235 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005236 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005237
5238 if (surrogateescape) {
5239 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5240 }
5241 else {
5242 /* Is it a valid three-byte code? */
5243 if (surrogatepass
5244 && (e - s) >= 3
5245 && (s[0] & 0xf0) == 0xe0
5246 && (s[1] & 0xc0) == 0x80
5247 && (s[2] & 0xc0) == 0x80)
5248 {
5249 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5250 s += 3;
5251 unicode[outpos++] = ch;
5252 }
5253 else {
5254 PyMem_RawFree(unicode );
5255 if (reason != NULL) {
5256 switch (ch) {
5257 case 0:
5258 *reason = "unexpected end of data";
5259 break;
5260 case 1:
5261 *reason = "invalid start byte";
5262 break;
5263 /* 2, 3, 4 */
5264 default:
5265 *reason = "invalid continuation byte";
5266 break;
5267 }
5268 }
5269 if (wlen != NULL) {
5270 *wlen = s - orig_s;
5271 }
5272 return -2;
5273 }
5274 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005275 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005276 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005277 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005278 if (wlen) {
5279 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005280 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005281 *wstr = unicode;
5282 return 0;
5283}
5284
Victor Stinner5f9cf232019-03-19 01:46:25 +01005285
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005286wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005287_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5288 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005289{
5290 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005291 int res = _Py_DecodeUTF8Ex(arg, arglen,
5292 &wstr, wlen,
5293 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005294 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005295 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5296 assert(res != -3);
5297 if (wlen) {
5298 *wlen = (size_t)res;
5299 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005300 return NULL;
5301 }
5302 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005303}
5304
Antoine Pitrouab868312009-01-10 15:40:25 +00005305
Victor Stinnere47e6982017-12-21 15:45:16 +01005306/* UTF-8 encoder using the surrogateescape error handler .
5307
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005308 On success, return 0 and write the newly allocated character string (use
5309 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005310
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005311 On encoding failure, return -2 and write the position of the invalid
5312 surrogate character into *error_pos (if error_pos is set) and the decoding
5313 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005314
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005315 On memory allocation failure, return -1. */
5316int
5317_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005318 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005319{
5320 const Py_ssize_t max_char_size = 4;
5321 Py_ssize_t len = wcslen(text);
5322
5323 assert(len >= 0);
5324
Victor Stinner3d4226a2018-08-29 22:21:32 +02005325 int surrogateescape = 0;
5326 int surrogatepass = 0;
5327 switch (errors)
5328 {
5329 case _Py_ERROR_STRICT:
5330 break;
5331 case _Py_ERROR_SURROGATEESCAPE:
5332 surrogateescape = 1;
5333 break;
5334 case _Py_ERROR_SURROGATEPASS:
5335 surrogatepass = 1;
5336 break;
5337 default:
5338 return -3;
5339 }
5340
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005341 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5342 return -1;
5343 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005344 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005345 if (raw_malloc) {
5346 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005347 }
5348 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005349 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005350 }
5351 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005352 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005353 }
5354
5355 char *p = bytes;
5356 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005357 for (i = 0; i < len; ) {
5358 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005359 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005360 i++;
5361#if Py_UNICODE_SIZE == 2
5362 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5363 && i < len
5364 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5365 {
5366 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5367 i++;
5368 }
5369#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005370
5371 if (ch < 0x80) {
5372 /* Encode ASCII */
5373 *p++ = (char) ch;
5374
5375 }
5376 else if (ch < 0x0800) {
5377 /* Encode Latin-1 */
5378 *p++ = (char)(0xc0 | (ch >> 6));
5379 *p++ = (char)(0x80 | (ch & 0x3f));
5380 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005381 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005382 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005383 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005384 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005385 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005386 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005387 if (reason != NULL) {
5388 *reason = "encoding error";
5389 }
5390 if (raw_malloc) {
5391 PyMem_RawFree(bytes);
5392 }
5393 else {
5394 PyMem_Free(bytes);
5395 }
5396 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005397 }
5398 *p++ = (char)(ch & 0xff);
5399 }
5400 else if (ch < 0x10000) {
5401 *p++ = (char)(0xe0 | (ch >> 12));
5402 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5403 *p++ = (char)(0x80 | (ch & 0x3f));
5404 }
5405 else { /* ch >= 0x10000 */
5406 assert(ch <= MAX_UNICODE);
5407 /* Encode UCS4 Unicode ordinals */
5408 *p++ = (char)(0xf0 | (ch >> 18));
5409 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5410 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5411 *p++ = (char)(0x80 | (ch & 0x3f));
5412 }
5413 }
5414 *p++ = '\0';
5415
5416 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005417 char *bytes2;
5418 if (raw_malloc) {
5419 bytes2 = PyMem_RawRealloc(bytes, final_size);
5420 }
5421 else {
5422 bytes2 = PyMem_Realloc(bytes, final_size);
5423 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005424 if (bytes2 == NULL) {
5425 if (error_pos != NULL) {
5426 *error_pos = (size_t)-1;
5427 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005428 if (raw_malloc) {
5429 PyMem_RawFree(bytes);
5430 }
5431 else {
5432 PyMem_Free(bytes);
5433 }
5434 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005435 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005436 *str = bytes2;
5437 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005438}
5439
5440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005441/* Primary internal function which creates utf8 encoded bytes objects.
5442
5443 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005444 and allocate exactly as much space needed at the end. Else allocate the
5445 maximum possible needed (4 result bytes per Unicode character), and return
5446 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005447*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005448static PyObject *
5449unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5450 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005452 if (!PyUnicode_Check(unicode)) {
5453 PyErr_BadArgument();
5454 return NULL;
5455 }
5456
5457 if (PyUnicode_READY(unicode) == -1)
5458 return NULL;
5459
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005460 if (PyUnicode_UTF8(unicode))
5461 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5462 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005463
Inada Naoki02a4d572020-02-27 13:48:59 +09005464 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005465 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005466 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5467
5468 _PyBytesWriter writer;
5469 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005470
Benjamin Petersonead6b532011-12-20 17:23:42 -06005471 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005472 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005473 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005474 case PyUnicode_1BYTE_KIND:
5475 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5476 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005477 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5478 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005479 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005480 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5481 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005482 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005483 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5484 break;
Tim Peters602f7402002-04-27 18:03:26 +00005485 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005486
5487 if (end == NULL) {
5488 _PyBytesWriter_Dealloc(&writer);
5489 return NULL;
5490 }
5491 return _PyBytesWriter_Finish(&writer, end);
5492}
5493
5494static int
5495unicode_fill_utf8(PyObject *unicode)
5496{
5497 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5498 assert(!PyUnicode_IS_ASCII(unicode));
5499
5500 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005501 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005502 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5503
5504 _PyBytesWriter writer;
5505 char *end;
5506
5507 switch (kind) {
5508 default:
5509 Py_UNREACHABLE();
5510 case PyUnicode_1BYTE_KIND:
5511 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5512 _Py_ERROR_STRICT, NULL);
5513 break;
5514 case PyUnicode_2BYTE_KIND:
5515 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5516 _Py_ERROR_STRICT, NULL);
5517 break;
5518 case PyUnicode_4BYTE_KIND:
5519 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5520 _Py_ERROR_STRICT, NULL);
5521 break;
5522 }
5523 if (end == NULL) {
5524 _PyBytesWriter_Dealloc(&writer);
5525 return -1;
5526 }
5527
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005528 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005529 PyBytes_AS_STRING(writer.buffer);
5530 Py_ssize_t len = end - start;
5531
5532 char *cache = PyObject_MALLOC(len + 1);
5533 if (cache == NULL) {
5534 _PyBytesWriter_Dealloc(&writer);
5535 PyErr_NoMemory();
5536 return -1;
5537 }
5538 _PyUnicode_UTF8(unicode) = cache;
5539 _PyUnicode_UTF8_LENGTH(unicode) = len;
5540 memcpy(cache, start, len);
5541 cache[len] = '\0';
5542 _PyBytesWriter_Dealloc(&writer);
5543 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544}
5545
Alexander Belopolsky40018472011-02-26 01:02:56 +00005546PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005547_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5548{
5549 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5550}
5551
5552
5553PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005554PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5555 Py_ssize_t size,
5556 const char *errors)
5557{
5558 PyObject *v, *unicode;
5559
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005560 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005561 if (unicode == NULL)
5562 return NULL;
5563 v = _PyUnicode_AsUTF8String(unicode, errors);
5564 Py_DECREF(unicode);
5565 return v;
5566}
5567
5568PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005569PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005571 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572}
5573
Walter Dörwald41980ca2007-08-16 21:55:45 +00005574/* --- UTF-32 Codec ------------------------------------------------------- */
5575
5576PyObject *
5577PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 Py_ssize_t size,
5579 const char *errors,
5580 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005581{
5582 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5583}
5584
5585PyObject *
5586PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 Py_ssize_t size,
5588 const char *errors,
5589 int *byteorder,
5590 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005591{
5592 const char *starts = s;
5593 Py_ssize_t startinpos;
5594 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005595 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005596 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005597 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005598 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005599 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005600 PyObject *errorHandler = NULL;
5601 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005602
Andy Lestere6be9b52020-02-11 20:28:35 -06005603 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005604 e = q + size;
5605
5606 if (byteorder)
5607 bo = *byteorder;
5608
5609 /* Check for BOM marks (U+FEFF) in the input and adjust current
5610 byte order setting accordingly. In native mode, the leading BOM
5611 mark is skipped, in all other modes, it is copied to the output
5612 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005613 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005614 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005615 if (bom == 0x0000FEFF) {
5616 bo = -1;
5617 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005618 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005619 else if (bom == 0xFFFE0000) {
5620 bo = 1;
5621 q += 4;
5622 }
5623 if (byteorder)
5624 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005625 }
5626
Victor Stinnere64322e2012-10-30 23:12:47 +01005627 if (q == e) {
5628 if (consumed)
5629 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005630 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005631 }
5632
Victor Stinnere64322e2012-10-30 23:12:47 +01005633#ifdef WORDS_BIGENDIAN
5634 le = bo < 0;
5635#else
5636 le = bo <= 0;
5637#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005638 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005639
Victor Stinner8f674cc2013-04-17 23:02:17 +02005640 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005641 writer.min_length = (e - q + 3) / 4;
5642 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005643 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005644
Victor Stinnere64322e2012-10-30 23:12:47 +01005645 while (1) {
5646 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005647 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005648
Victor Stinnere64322e2012-10-30 23:12:47 +01005649 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005650 enum PyUnicode_Kind kind = writer.kind;
5651 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005652 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005653 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005654 if (le) {
5655 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005656 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005657 if (ch > maxch)
5658 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005659 if (kind != PyUnicode_1BYTE_KIND &&
5660 Py_UNICODE_IS_SURROGATE(ch))
5661 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005662 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005663 q += 4;
5664 } while (q <= last);
5665 }
5666 else {
5667 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005668 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005669 if (ch > maxch)
5670 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005671 if (kind != PyUnicode_1BYTE_KIND &&
5672 Py_UNICODE_IS_SURROGATE(ch))
5673 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005674 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005675 q += 4;
5676 } while (q <= last);
5677 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005678 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005679 }
5680
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005681 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005682 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005683 startinpos = ((const char *)q) - starts;
5684 endinpos = startinpos + 4;
5685 }
5686 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005687 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005689 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005691 startinpos = ((const char *)q) - starts;
5692 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005694 else {
5695 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005696 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005697 goto onError;
5698 q += 4;
5699 continue;
5700 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005701 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005702 startinpos = ((const char *)q) - starts;
5703 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005705
5706 /* The remaining input chars are ignored if the callback
5707 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005708 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005710 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005712 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005714 }
5715
Walter Dörwald41980ca2007-08-16 21:55:45 +00005716 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005718
Walter Dörwald41980ca2007-08-16 21:55:45 +00005719 Py_XDECREF(errorHandler);
5720 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005721 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005722
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005724 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005725 Py_XDECREF(errorHandler);
5726 Py_XDECREF(exc);
5727 return NULL;
5728}
5729
5730PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005731_PyUnicode_EncodeUTF32(PyObject *str,
5732 const char *errors,
5733 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005734{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005735 enum PyUnicode_Kind kind;
5736 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005737 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005738 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005739 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005740#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005741 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005742#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005743 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005744#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005745 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005746 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005747 PyObject *errorHandler = NULL;
5748 PyObject *exc = NULL;
5749 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005750
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005751 if (!PyUnicode_Check(str)) {
5752 PyErr_BadArgument();
5753 return NULL;
5754 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005755 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005756 return NULL;
5757 kind = PyUnicode_KIND(str);
5758 data = PyUnicode_DATA(str);
5759 len = PyUnicode_GET_LENGTH(str);
5760
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005761 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005762 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005763 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005764 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005765 if (v == NULL)
5766 return NULL;
5767
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005768 /* output buffer is 4-bytes aligned */
5769 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005770 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005771 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005772 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005773 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005774 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005775
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005776 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005777 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005778 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005779 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005780 else
5781 encoding = "utf-32";
5782
5783 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005784 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5785 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005786 }
5787
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005788 pos = 0;
5789 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005790 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005791
5792 if (kind == PyUnicode_2BYTE_KIND) {
5793 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5794 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005795 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005796 else {
5797 assert(kind == PyUnicode_4BYTE_KIND);
5798 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5799 &out, native_ordering);
5800 }
5801 if (pos == len)
5802 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005803
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005804 rep = unicode_encode_call_errorhandler(
5805 errors, &errorHandler,
5806 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005807 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005808 if (!rep)
5809 goto error;
5810
5811 if (PyBytes_Check(rep)) {
5812 repsize = PyBytes_GET_SIZE(rep);
5813 if (repsize & 3) {
5814 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005815 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005816 "surrogates not allowed");
5817 goto error;
5818 }
5819 moreunits = repsize / 4;
5820 }
5821 else {
5822 assert(PyUnicode_Check(rep));
5823 if (PyUnicode_READY(rep) < 0)
5824 goto error;
5825 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5826 if (!PyUnicode_IS_ASCII(rep)) {
5827 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005828 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005829 "surrogates not allowed");
5830 goto error;
5831 }
5832 }
5833
5834 /* four bytes are reserved for each surrogate */
5835 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005836 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005837 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005838 /* integer overflow */
5839 PyErr_NoMemory();
5840 goto error;
5841 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005842 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005843 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005844 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005845 }
5846
5847 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005848 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005849 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005850 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005851 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005852 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5853 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005854 }
5855
5856 Py_CLEAR(rep);
5857 }
5858
5859 /* Cut back to size actually needed. This is necessary for, for example,
5860 encoding of a string containing isolated surrogates and the 'ignore'
5861 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005862 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005863 if (nsize != PyBytes_GET_SIZE(v))
5864 _PyBytes_Resize(&v, nsize);
5865 Py_XDECREF(errorHandler);
5866 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005867 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005868 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005869 error:
5870 Py_XDECREF(rep);
5871 Py_XDECREF(errorHandler);
5872 Py_XDECREF(exc);
5873 Py_XDECREF(v);
5874 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005875}
5876
Alexander Belopolsky40018472011-02-26 01:02:56 +00005877PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005878PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5879 Py_ssize_t size,
5880 const char *errors,
5881 int byteorder)
5882{
5883 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005884 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005885 if (tmp == NULL)
5886 return NULL;
5887 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5888 Py_DECREF(tmp);
5889 return result;
5890}
5891
5892PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005893PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005894{
Victor Stinnerb960b342011-11-20 19:12:52 +01005895 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005896}
5897
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898/* --- UTF-16 Codec ------------------------------------------------------- */
5899
Tim Peters772747b2001-08-09 22:21:55 +00005900PyObject *
5901PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005902 Py_ssize_t size,
5903 const char *errors,
5904 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905{
Walter Dörwald69652032004-09-07 20:24:22 +00005906 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5907}
5908
5909PyObject *
5910PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005911 Py_ssize_t size,
5912 const char *errors,
5913 int *byteorder,
5914 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005915{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005917 Py_ssize_t startinpos;
5918 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005919 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005920 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005921 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005922 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005923 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005924 PyObject *errorHandler = NULL;
5925 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005926 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927
Andy Lestere6be9b52020-02-11 20:28:35 -06005928 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005929 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930
5931 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005932 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005934 /* Check for BOM marks (U+FEFF) in the input and adjust current
5935 byte order setting accordingly. In native mode, the leading BOM
5936 mark is skipped, in all other modes, it is copied to the output
5937 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005938 if (bo == 0 && size >= 2) {
5939 const Py_UCS4 bom = (q[1] << 8) | q[0];
5940 if (bom == 0xFEFF) {
5941 q += 2;
5942 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005944 else if (bom == 0xFFFE) {
5945 q += 2;
5946 bo = 1;
5947 }
5948 if (byteorder)
5949 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005950 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951
Antoine Pitrou63065d72012-05-15 23:48:04 +02005952 if (q == e) {
5953 if (consumed)
5954 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005955 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005956 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005957
Christian Heimes743e0cd2012-10-17 23:52:17 +02005958#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005959 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005960 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005961#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005962 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005963 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005964#endif
Tim Peters772747b2001-08-09 22:21:55 +00005965
Antoine Pitrou63065d72012-05-15 23:48:04 +02005966 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005967 character count normally. Error handler will take care of
5968 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005969 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005970 writer.min_length = (e - q + 1) / 2;
5971 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005972 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005973
Antoine Pitrou63065d72012-05-15 23:48:04 +02005974 while (1) {
5975 Py_UCS4 ch = 0;
5976 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005977 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005978 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005979 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005980 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005981 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005982 native_ordering);
5983 else
5984 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005985 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005986 native_ordering);
5987 } else if (kind == PyUnicode_2BYTE_KIND) {
5988 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005989 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005990 native_ordering);
5991 } else {
5992 assert(kind == PyUnicode_4BYTE_KIND);
5993 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005994 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005995 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005996 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005997 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998
Antoine Pitrou63065d72012-05-15 23:48:04 +02005999 switch (ch)
6000 {
6001 case 0:
6002 /* remaining byte at the end? (size should be even) */
6003 if (q == e || consumed)
6004 goto End;
6005 errmsg = "truncated data";
6006 startinpos = ((const char *)q) - starts;
6007 endinpos = ((const char *)e) - starts;
6008 break;
6009 /* The remaining input chars are ignored if the callback
6010 chooses to skip the input */
6011 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006012 q -= 2;
6013 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006014 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006015 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006016 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006017 endinpos = ((const char *)e) - starts;
6018 break;
6019 case 2:
6020 errmsg = "illegal encoding";
6021 startinpos = ((const char *)q) - 2 - starts;
6022 endinpos = startinpos + 2;
6023 break;
6024 case 3:
6025 errmsg = "illegal UTF-16 surrogate";
6026 startinpos = ((const char *)q) - 4 - starts;
6027 endinpos = startinpos + 2;
6028 break;
6029 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006030 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006031 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 continue;
6033 }
6034
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006035 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006036 errors,
6037 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006038 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006039 &starts,
6040 (const char **)&e,
6041 &startinpos,
6042 &endinpos,
6043 &exc,
6044 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006045 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 }
6048
Antoine Pitrou63065d72012-05-15 23:48:04 +02006049End:
Walter Dörwald69652032004-09-07 20:24:22 +00006050 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006053 Py_XDECREF(errorHandler);
6054 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006055 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006058 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059 Py_XDECREF(errorHandler);
6060 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 return NULL;
6062}
6063
Tim Peters772747b2001-08-09 22:21:55 +00006064PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006065_PyUnicode_EncodeUTF16(PyObject *str,
6066 const char *errors,
6067 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006069 enum PyUnicode_Kind kind;
6070 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006071 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006072 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006073 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006074 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006075#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006076 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006077#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006078 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006079#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006080 const char *encoding;
6081 Py_ssize_t nsize, pos;
6082 PyObject *errorHandler = NULL;
6083 PyObject *exc = NULL;
6084 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006085
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006086 if (!PyUnicode_Check(str)) {
6087 PyErr_BadArgument();
6088 return NULL;
6089 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006090 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006091 return NULL;
6092 kind = PyUnicode_KIND(str);
6093 data = PyUnicode_DATA(str);
6094 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006095
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006096 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006097 if (kind == PyUnicode_4BYTE_KIND) {
6098 const Py_UCS4 *in = (const Py_UCS4 *)data;
6099 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006100 while (in < end) {
6101 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006102 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006103 }
6104 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006105 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006106 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006108 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006109 nsize = len + pairs + (byteorder == 0);
6110 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006111 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006113 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006115 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006116 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006117 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006118 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006119 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006120 }
6121 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006122 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006123 }
Tim Peters772747b2001-08-09 22:21:55 +00006124
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006125 if (kind == PyUnicode_1BYTE_KIND) {
6126 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6127 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006128 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006129
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006130 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006131 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006132 }
6133 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006134 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006135 }
6136 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006137 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006138 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006139
6140 pos = 0;
6141 while (pos < len) {
6142 Py_ssize_t repsize, moreunits;
6143
6144 if (kind == PyUnicode_2BYTE_KIND) {
6145 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6146 &out, native_ordering);
6147 }
6148 else {
6149 assert(kind == PyUnicode_4BYTE_KIND);
6150 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6151 &out, native_ordering);
6152 }
6153 if (pos == len)
6154 break;
6155
6156 rep = unicode_encode_call_errorhandler(
6157 errors, &errorHandler,
6158 encoding, "surrogates not allowed",
6159 str, &exc, pos, pos + 1, &pos);
6160 if (!rep)
6161 goto error;
6162
6163 if (PyBytes_Check(rep)) {
6164 repsize = PyBytes_GET_SIZE(rep);
6165 if (repsize & 1) {
6166 raise_encode_exception(&exc, encoding,
6167 str, pos - 1, pos,
6168 "surrogates not allowed");
6169 goto error;
6170 }
6171 moreunits = repsize / 2;
6172 }
6173 else {
6174 assert(PyUnicode_Check(rep));
6175 if (PyUnicode_READY(rep) < 0)
6176 goto error;
6177 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6178 if (!PyUnicode_IS_ASCII(rep)) {
6179 raise_encode_exception(&exc, encoding,
6180 str, pos - 1, pos,
6181 "surrogates not allowed");
6182 goto error;
6183 }
6184 }
6185
6186 /* two bytes are reserved for each surrogate */
6187 if (moreunits > 1) {
6188 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006189 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006190 /* integer overflow */
6191 PyErr_NoMemory();
6192 goto error;
6193 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006194 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006195 goto error;
6196 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6197 }
6198
6199 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006200 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006201 out += moreunits;
6202 } else /* rep is unicode */ {
6203 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6204 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6205 &out, native_ordering);
6206 }
6207
6208 Py_CLEAR(rep);
6209 }
6210
6211 /* Cut back to size actually needed. This is necessary for, for example,
6212 encoding of a string containing isolated surrogates and the 'ignore' handler
6213 is used. */
6214 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6215 if (nsize != PyBytes_GET_SIZE(v))
6216 _PyBytes_Resize(&v, nsize);
6217 Py_XDECREF(errorHandler);
6218 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006219 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006220 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006221 error:
6222 Py_XDECREF(rep);
6223 Py_XDECREF(errorHandler);
6224 Py_XDECREF(exc);
6225 Py_XDECREF(v);
6226 return NULL;
6227#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228}
6229
Alexander Belopolsky40018472011-02-26 01:02:56 +00006230PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006231PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6232 Py_ssize_t size,
6233 const char *errors,
6234 int byteorder)
6235{
6236 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006237 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006238 if (tmp == NULL)
6239 return NULL;
6240 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6241 Py_DECREF(tmp);
6242 return result;
6243}
6244
6245PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006246PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006248 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249}
6250
/* --- Unicode Escape Codec ----------------------------------------------- */
6252
Fredrik Lundh06d12682001-01-24 07:59:11 +00006253static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006254
Alexander Belopolsky40018472011-02-26 01:02:56 +00006255PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006256_PyUnicode_DecodeUnicodeEscape(const char *s,
6257 Py_ssize_t size,
6258 const char *errors,
6259 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006261 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006262 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006264 PyObject *errorHandler = NULL;
6265 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006266
Eric V. Smith42454af2016-10-31 09:22:08 -04006267 // so we can remember if we've seen an invalid escape char or not
6268 *first_invalid_escape = NULL;
6269
Victor Stinner62ec3312016-09-06 17:04:34 -07006270 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006271 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006272 }
6273 /* Escaped strings will always be longer than the resulting
6274 Unicode string, so we start with size here and then reduce the
6275 length after conversion to the true value.
6276 (but if the error callback returns a long replacement string
6277 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006278 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006279 writer.min_length = size;
6280 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6281 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006282 }
6283
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 end = s + size;
6285 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006286 unsigned char c = (unsigned char) *s++;
6287 Py_UCS4 ch;
6288 int count;
6289 Py_ssize_t startinpos;
6290 Py_ssize_t endinpos;
6291 const char *message;
6292
6293#define WRITE_ASCII_CHAR(ch) \
6294 do { \
6295 assert(ch <= 127); \
6296 assert(writer.pos < writer.size); \
6297 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6298 } while(0)
6299
6300#define WRITE_CHAR(ch) \
6301 do { \
6302 if (ch <= writer.maxchar) { \
6303 assert(writer.pos < writer.size); \
6304 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6305 } \
6306 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6307 goto onError; \
6308 } \
6309 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310
6311 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006312 if (c != '\\') {
6313 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 continue;
6315 }
6316
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 if (s >= end) {
6320 message = "\\ at end of string";
6321 goto error;
6322 }
6323 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006324
Victor Stinner62ec3312016-09-06 17:04:34 -07006325 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006326 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006329 case '\n': continue;
6330 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6331 case '\'': WRITE_ASCII_CHAR('\''); continue;
6332 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6333 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006334 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6336 case 't': WRITE_ASCII_CHAR('\t'); continue;
6337 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6338 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006339 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006341 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006342 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 case '0': case '1': case '2': case '3':
6346 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006347 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006348 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006349 ch = (ch<<3) + *s++ - '0';
6350 if (s < end && '0' <= *s && *s <= '7') {
6351 ch = (ch<<3) + *s++ - '0';
6352 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006354 WRITE_CHAR(ch);
6355 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 /* hex escapes */
6358 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006360 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006361 message = "truncated \\xXX escape";
6362 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006366 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006367 message = "truncated \\uXXXX escape";
6368 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006371 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006372 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006373 message = "truncated \\UXXXXXXXX escape";
6374 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006375 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006376 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006377 ch <<= 4;
6378 if (c >= '0' && c <= '9') {
6379 ch += c - '0';
6380 }
6381 else if (c >= 'a' && c <= 'f') {
6382 ch += c - ('a' - 10);
6383 }
6384 else if (c >= 'A' && c <= 'F') {
6385 ch += c - ('A' - 10);
6386 }
6387 else {
6388 break;
6389 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006390 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006391 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006392 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006393 }
6394
6395 /* when we get here, ch is a 32-bit unicode character */
6396 if (ch > MAX_UNICODE) {
6397 message = "illegal Unicode character";
6398 goto error;
6399 }
6400
6401 WRITE_CHAR(ch);
6402 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006403
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006405 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006406 if (ucnhash_CAPI == NULL) {
6407 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006408 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6409 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 if (ucnhash_CAPI == NULL) {
6411 PyErr_SetString(
6412 PyExc_UnicodeError,
6413 "\\N escapes not supported (can't load unicodedata module)"
6414 );
6415 goto onError;
6416 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006417 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006418
6419 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006420 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006421 const char *start = ++s;
6422 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006423 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006425 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 namelen = s - start;
6427 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006428 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006429 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006430 ch = 0xffffffff; /* in case 'getcode' messes up */
6431 if (namelen <= INT_MAX &&
6432 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6433 &ch, 0)) {
6434 assert(ch <= MAX_UNICODE);
6435 WRITE_CHAR(ch);
6436 continue;
6437 }
6438 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006439 }
6440 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006441 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006442
6443 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006444 if (*first_invalid_escape == NULL) {
6445 *first_invalid_escape = s-1; /* Back up one char, since we've
6446 already incremented s. */
6447 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006448 WRITE_ASCII_CHAR('\\');
6449 WRITE_CHAR(c);
6450 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006452
6453 error:
6454 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006455 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006456 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006457 errors, &errorHandler,
6458 "unicodeescape", message,
6459 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006460 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006461 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006462 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006463 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006464
6465#undef WRITE_ASCII_CHAR
6466#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006468
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006469 Py_XDECREF(errorHandler);
6470 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006471 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006472
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006474 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006475 Py_XDECREF(errorHandler);
6476 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 return NULL;
6478}
6479
Eric V. Smith42454af2016-10-31 09:22:08 -04006480PyObject *
6481PyUnicode_DecodeUnicodeEscape(const char *s,
6482 Py_ssize_t size,
6483 const char *errors)
6484{
6485 const char *first_invalid_escape;
6486 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6487 &first_invalid_escape);
6488 if (result == NULL)
6489 return NULL;
6490 if (first_invalid_escape != NULL) {
6491 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6492 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006493 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006494 Py_DECREF(result);
6495 return NULL;
6496 }
6497 }
6498 return result;
6499}
6500
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006501/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502
Alexander Belopolsky40018472011-02-26 01:02:56 +00006503PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006504PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006506 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006507 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006509 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006510 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006511 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512
Ezio Melottie7f90372012-10-05 03:33:31 +03006513 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006514 escape.
6515
Ezio Melottie7f90372012-10-05 03:33:31 +03006516 For UCS1 strings it's '\xxx', 4 bytes per source character.
6517 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6518 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006519 */
6520
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006521 if (!PyUnicode_Check(unicode)) {
6522 PyErr_BadArgument();
6523 return NULL;
6524 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006525 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006526 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006527 }
Victor Stinner358af132015-10-12 22:36:57 +02006528
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006529 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006530 if (len == 0) {
6531 return PyBytes_FromStringAndSize(NULL, 0);
6532 }
6533
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006534 kind = PyUnicode_KIND(unicode);
6535 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006536 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6537 bytes, and 1 byte characters 4. */
6538 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006539 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006540 return PyErr_NoMemory();
6541 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006542 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006543 if (repr == NULL) {
6544 return NULL;
6545 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006546
Victor Stinner62ec3312016-09-06 17:04:34 -07006547 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006548 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006549 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006550
Victor Stinner62ec3312016-09-06 17:04:34 -07006551 /* U+0000-U+00ff range */
6552 if (ch < 0x100) {
6553 if (ch >= ' ' && ch < 127) {
6554 if (ch != '\\') {
6555 /* Copy printable US ASCII as-is */
6556 *p++ = (char) ch;
6557 }
6558 /* Escape backslashes */
6559 else {
6560 *p++ = '\\';
6561 *p++ = '\\';
6562 }
6563 }
Victor Stinner358af132015-10-12 22:36:57 +02006564
Victor Stinner62ec3312016-09-06 17:04:34 -07006565 /* Map special whitespace to '\t', \n', '\r' */
6566 else if (ch == '\t') {
6567 *p++ = '\\';
6568 *p++ = 't';
6569 }
6570 else if (ch == '\n') {
6571 *p++ = '\\';
6572 *p++ = 'n';
6573 }
6574 else if (ch == '\r') {
6575 *p++ = '\\';
6576 *p++ = 'r';
6577 }
6578
6579 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6580 else {
6581 *p++ = '\\';
6582 *p++ = 'x';
6583 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6584 *p++ = Py_hexdigits[ch & 0x000F];
6585 }
Tim Petersced69f82003-09-16 20:30:58 +00006586 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006587 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006588 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 *p++ = '\\';
6590 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006591 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6592 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6593 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6594 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006596 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6597 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006598
Victor Stinner62ec3312016-09-06 17:04:34 -07006599 /* Make sure that the first two digits are zero */
6600 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006601 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006602 *p++ = 'U';
6603 *p++ = '0';
6604 *p++ = '0';
6605 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6606 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6607 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6608 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6609 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6610 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006611 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613
Victor Stinner62ec3312016-09-06 17:04:34 -07006614 assert(p - PyBytes_AS_STRING(repr) > 0);
6615 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6616 return NULL;
6617 }
6618 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619}
6620
Alexander Belopolsky40018472011-02-26 01:02:56 +00006621PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006622PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6623 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006625 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006626 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006627 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006629 }
6630
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006631 result = PyUnicode_AsUnicodeEscapeString(tmp);
6632 Py_DECREF(tmp);
6633 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634}
6635
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6637
Alexander Belopolsky40018472011-02-26 01:02:56 +00006638PyObject *
6639PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006640 Py_ssize_t size,
6641 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006643 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006644 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646 PyObject *errorHandler = NULL;
6647 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006648
Victor Stinner62ec3312016-09-06 17:04:34 -07006649 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006650 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006651 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006652
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 /* Escaped strings will always be longer than the resulting
6654 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006655 length after conversion to the true value. (But decoding error
6656 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006657 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006658 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006659 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6660 goto onError;
6661 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006662
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 end = s + size;
6664 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006665 unsigned char c = (unsigned char) *s++;
6666 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006667 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006668 Py_ssize_t startinpos;
6669 Py_ssize_t endinpos;
6670 const char *message;
6671
6672#define WRITE_CHAR(ch) \
6673 do { \
6674 if (ch <= writer.maxchar) { \
6675 assert(writer.pos < writer.size); \
6676 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6677 } \
6678 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6679 goto onError; \
6680 } \
6681 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006684 if (c != '\\' || s >= end) {
6685 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006687 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006688
Victor Stinner62ec3312016-09-06 17:04:34 -07006689 c = (unsigned char) *s++;
6690 if (c == 'u') {
6691 count = 4;
6692 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006694 else if (c == 'U') {
6695 count = 8;
6696 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006697 }
6698 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006699 assert(writer.pos < writer.size);
6700 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6701 WRITE_CHAR(c);
6702 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006703 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006704 startinpos = s - starts - 2;
6705
6706 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6707 for (ch = 0; count && s < end; ++s, --count) {
6708 c = (unsigned char)*s;
6709 ch <<= 4;
6710 if (c >= '0' && c <= '9') {
6711 ch += c - '0';
6712 }
6713 else if (c >= 'a' && c <= 'f') {
6714 ch += c - ('a' - 10);
6715 }
6716 else if (c >= 'A' && c <= 'F') {
6717 ch += c - ('A' - 10);
6718 }
6719 else {
6720 break;
6721 }
6722 }
6723 if (!count) {
6724 if (ch <= MAX_UNICODE) {
6725 WRITE_CHAR(ch);
6726 continue;
6727 }
6728 message = "\\Uxxxxxxxx out of range";
6729 }
6730
6731 endinpos = s-starts;
6732 writer.min_length = end - s + writer.pos;
6733 if (unicode_decode_call_errorhandler_writer(
6734 errors, &errorHandler,
6735 "rawunicodeescape", message,
6736 &starts, &end, &startinpos, &endinpos, &exc, &s,
6737 &writer)) {
6738 goto onError;
6739 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006740 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006741
6742#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006744 Py_XDECREF(errorHandler);
6745 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006746 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006747
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006749 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006750 Py_XDECREF(errorHandler);
6751 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006753
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754}
6755
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006756
Alexander Belopolsky40018472011-02-26 01:02:56 +00006757PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006758PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759{
Victor Stinner62ec3312016-09-06 17:04:34 -07006760 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006762 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006763 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006764 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006765 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006767 if (!PyUnicode_Check(unicode)) {
6768 PyErr_BadArgument();
6769 return NULL;
6770 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006771 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006772 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006773 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006774 kind = PyUnicode_KIND(unicode);
6775 data = PyUnicode_DATA(unicode);
6776 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006777 if (kind == PyUnicode_1BYTE_KIND) {
6778 return PyBytes_FromStringAndSize(data, len);
6779 }
Victor Stinner0e368262011-11-10 20:12:49 +01006780
Victor Stinner62ec3312016-09-06 17:04:34 -07006781 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6782 bytes, and 1 byte characters 4. */
6783 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006784
Victor Stinner62ec3312016-09-06 17:04:34 -07006785 if (len > PY_SSIZE_T_MAX / expandsize) {
6786 return PyErr_NoMemory();
6787 }
6788 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6789 if (repr == NULL) {
6790 return NULL;
6791 }
6792 if (len == 0) {
6793 return repr;
6794 }
6795
6796 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006797 for (pos = 0; pos < len; pos++) {
6798 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006799
Victor Stinner62ec3312016-09-06 17:04:34 -07006800 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6801 if (ch < 0x100) {
6802 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006803 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006804 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006805 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 *p++ = '\\';
6807 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006808 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6809 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6810 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6811 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006813 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6814 else {
6815 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6816 *p++ = '\\';
6817 *p++ = 'U';
6818 *p++ = '0';
6819 *p++ = '0';
6820 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6821 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6822 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6823 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6824 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6825 *p++ = Py_hexdigits[ch & 15];
6826 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006828
Victor Stinner62ec3312016-09-06 17:04:34 -07006829 assert(p > PyBytes_AS_STRING(repr));
6830 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6831 return NULL;
6832 }
6833 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834}
6835
Alexander Belopolsky40018472011-02-26 01:02:56 +00006836PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006837PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6838 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006840 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006841 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006842 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006843 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006844 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6845 Py_DECREF(tmp);
6846 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847}
6848
6849/* --- Latin-1 Codec ------------------------------------------------------ */
6850
Alexander Belopolsky40018472011-02-26 01:02:56 +00006851PyObject *
6852PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006853 Py_ssize_t size,
6854 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006857 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858}
6859
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006860/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006861static void
6862make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006863 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006864 PyObject *unicode,
6865 Py_ssize_t startpos, Py_ssize_t endpos,
6866 const char *reason)
6867{
6868 if (*exceptionObject == NULL) {
6869 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006870 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006871 encoding, unicode, startpos, endpos, reason);
6872 }
6873 else {
6874 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6875 goto onError;
6876 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6877 goto onError;
6878 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6879 goto onError;
6880 return;
6881 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006882 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006883 }
6884}
6885
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006886/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006887static void
6888raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006889 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006890 PyObject *unicode,
6891 Py_ssize_t startpos, Py_ssize_t endpos,
6892 const char *reason)
6893{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006894 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006895 encoding, unicode, startpos, endpos, reason);
6896 if (*exceptionObject != NULL)
6897 PyCodec_StrictErrors(*exceptionObject);
6898}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899
6900/* error handling callback helper:
6901 build arguments, call the callback and check the arguments,
6902 put the result into newpos and return the replacement string, which
6903 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006904static PyObject *
6905unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006906 PyObject **errorHandler,
6907 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006908 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006909 Py_ssize_t startpos, Py_ssize_t endpos,
6910 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006911{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006912 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006913 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006914 PyObject *restuple;
6915 PyObject *resunicode;
6916
6917 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006921 }
6922
Benjamin Petersonbac79492012-01-14 13:34:47 -05006923 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006924 return NULL;
6925 len = PyUnicode_GET_LENGTH(unicode);
6926
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006927 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006928 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006929 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006931
Petr Viktorinffd97532020-02-11 17:46:57 +01006932 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006933 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006935 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006936 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 Py_DECREF(restuple);
6938 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006939 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006940 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 &resunicode, newpos)) {
6942 Py_DECREF(restuple);
6943 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006944 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006945 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6946 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6947 Py_DECREF(restuple);
6948 return NULL;
6949 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006950 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006951 *newpos = len + *newpos;
6952 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006953 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 Py_DECREF(restuple);
6955 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006956 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006957 Py_INCREF(resunicode);
6958 Py_DECREF(restuple);
6959 return resunicode;
6960}
6961
Alexander Belopolsky40018472011-02-26 01:02:56 +00006962static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006963unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006964 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006965 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006966{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006967 /* input state */
6968 Py_ssize_t pos=0, size;
6969 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006970 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006971 /* pointer into the output */
6972 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006973 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6974 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006975 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006976 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006977 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006978 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006979 /* output object */
6980 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006981
Benjamin Petersonbac79492012-01-14 13:34:47 -05006982 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006983 return NULL;
6984 size = PyUnicode_GET_LENGTH(unicode);
6985 kind = PyUnicode_KIND(unicode);
6986 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006987 /* allocate enough for a simple encoding without
6988 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006989 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006990 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006991
6992 _PyBytesWriter_Init(&writer);
6993 str = _PyBytesWriter_Alloc(&writer, size);
6994 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006995 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006996
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006997 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006998 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006999
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007001 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007002 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007003 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007004 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007005 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007007 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007009 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007010 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007012
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007013 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007015
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007016 /* Only overallocate the buffer if it's not the last write */
7017 writer.overallocate = (collend < size);
7018
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007020 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007021 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007022
7023 switch (error_handler) {
7024 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007025 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007027
7028 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007029 memset(str, '?', collend - collstart);
7030 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007031 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007032 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007033 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007034 break;
Victor Stinner50149202015-09-22 00:26:54 +02007035
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007036 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007037 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007038 writer.min_size -= (collend - collstart);
7039 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007040 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007041 if (str == NULL)
7042 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007043 pos = collend;
7044 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007045
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007046 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007047 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007048 writer.min_size -= (collend - collstart);
7049 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007050 unicode, collstart, collend);
7051 if (str == NULL)
7052 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007053 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007054 break;
Victor Stinner50149202015-09-22 00:26:54 +02007055
Victor Stinnerc3713e92015-09-29 12:32:13 +02007056 case _Py_ERROR_SURROGATEESCAPE:
7057 for (i = collstart; i < collend; ++i) {
7058 ch = PyUnicode_READ(kind, data, i);
7059 if (ch < 0xdc80 || 0xdcff < ch) {
7060 /* Not a UTF-8b surrogate */
7061 break;
7062 }
7063 *str++ = (char)(ch - 0xdc00);
7064 ++pos;
7065 }
7066 if (i >= collend)
7067 break;
7068 collstart = pos;
7069 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007070 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007071
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007073 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7074 encoding, reason, unicode, &exc,
7075 collstart, collend, &newpos);
7076 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007078
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007079 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007080 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007081
Victor Stinner6bd525b2015-10-09 13:10:05 +02007082 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007083 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007084 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007085 PyBytes_AS_STRING(rep),
7086 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007087 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007088 else {
7089 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007090
Victor Stinner6bd525b2015-10-09 13:10:05 +02007091 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007093
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007094 if (limit == 256 ?
7095 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7096 !PyUnicode_IS_ASCII(rep))
7097 {
7098 /* Not all characters are smaller than limit */
7099 raise_encode_exception(&exc, encoding, unicode,
7100 collstart, collend, reason);
7101 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007102 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007103 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7104 str = _PyBytesWriter_WriteBytes(&writer, str,
7105 PyUnicode_DATA(rep),
7106 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007107 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007108 if (str == NULL)
7109 goto onError;
7110
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007111 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007112 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007113 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007114
7115 /* If overallocation was disabled, ensure that it was the last
7116 write. Otherwise, we missed an optimization */
7117 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007118 }
7119 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007120
Victor Stinner50149202015-09-22 00:26:54 +02007121 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007122 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007123 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007124
7125 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007126 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007127 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007128 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007129 Py_XDECREF(exc);
7130 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007131}
7132
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007133/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007134PyObject *
7135PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007136 Py_ssize_t size,
7137 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007139 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007140 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007141 if (unicode == NULL)
7142 return NULL;
7143 result = unicode_encode_ucs1(unicode, errors, 256);
7144 Py_DECREF(unicode);
7145 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146}
7147
Alexander Belopolsky40018472011-02-26 01:02:56 +00007148PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007149_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150{
7151 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007152 PyErr_BadArgument();
7153 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007155 if (PyUnicode_READY(unicode) == -1)
7156 return NULL;
7157 /* Fast path: if it is a one-byte string, construct
7158 bytes object directly. */
7159 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7160 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7161 PyUnicode_GET_LENGTH(unicode));
7162 /* Non-Latin-1 characters present. Defer to above function to
7163 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007164 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007165}
7166
7167PyObject*
7168PyUnicode_AsLatin1String(PyObject *unicode)
7169{
7170 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171}
7172
7173/* --- 7-bit ASCII Codec -------------------------------------------------- */
7174
/* Decode `size` bytes of 7-bit ASCII starting at `s`.

   Bytes below 128 are copied as-is; every other byte is handled according
   to the error handler named by `errors`.  Returns a str object, or NULL
   on error. */
PyObject *
PyUnicode_DecodeASCII(const char *s,
                      Py_ssize_t size,
                      const char *errors)
{
    const char *starts = s;
    const char *e = s + size;
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        return get_latin1_char((unsigned char)s[0]);
    }

    /* Shortcut for the simple case: decode straight into an ASCII-only
       string of the final length.
       NOTE(review): ascii_decode() presumably returns how many leading
       ASCII bytes it copied -- confirm against its definition. */
    PyObject *u = PyUnicode_New(size, 127);
    if (u == NULL) {
        return NULL;
    }
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
    if (outpos == size) {
        return u;
    }

    /* Slow path: continue from the first unprocessed byte, reusing `u` as
       the writer's buffer. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
    writer.pos = outpos;

    s += outpos;
    /* Local copies of the writer's kind/data; they are refreshed after
       every call that receives &writer below. */
    int kind = writer.kind;
    void *data = writer.data;
    Py_ssize_t startinpos, endinpos;

    while (s < e) {
        unsigned char c = (unsigned char)*s;
        if (c < 128) {
            PyUnicode_WRITE(kind, data, writer.pos, c);
            writer.pos++;
            ++s;
            continue;
        }

        /* byte outside range 0x00..0x7f: call the error handler */

        /* resolve the handler name only once, at the first error */
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler)
        {
        case _Py_ERROR_REPLACE:
        case _Py_ERROR_SURROGATEESCAPE:
            /* Fast-path: the error handler only writes one character,
               but we may switch to UCS2 at the first write */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            kind = writer.kind;
            data = writer.data;

            /* "replace" writes U+FFFD; "surrogateescape" writes the byte
               offset by 0xdc00 */
            if (error_handler == _Py_ERROR_REPLACE)
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
            else
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
            writer.pos++;
            ++s;
            break;

        case _Py_ERROR_IGNORE:
            /* drop the offending byte */
            ++s;
            break;

        default:
            /* any other handler: report the single byte at s */
            startinpos = s-starts;
            endinpos = startinpos + 1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "ascii", "ordinal not in range(128)",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            kind = writer.kind;
            data = writer.data;
        }
    }
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7273
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007274/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007275PyObject *
7276PyUnicode_EncodeASCII(const Py_UNICODE *p,
7277 Py_ssize_t size,
7278 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007280 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007281 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007282 if (unicode == NULL)
7283 return NULL;
7284 result = unicode_encode_ucs1(unicode, errors, 128);
7285 Py_DECREF(unicode);
7286 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287}
7288
Alexander Belopolsky40018472011-02-26 01:02:56 +00007289PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007290_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291{
7292 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 PyErr_BadArgument();
7294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007296 if (PyUnicode_READY(unicode) == -1)
7297 return NULL;
7298 /* Fast path: if it is an ASCII-only string, construct bytes object
7299 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007300 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007301 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7302 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007303 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007304}
7305
7306PyObject *
7307PyUnicode_AsASCIIString(PyObject *unicode)
7308{
7309 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310}
7311
Steve Dowercc16be82016-09-08 10:35:16 -07007312#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007313
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007314/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007315
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007316#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007317#define NEED_RETRY
7318#endif
7319
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7324#define DECODING_CHUNK_SIZE (INT_MAX/4)
7325
Victor Stinner3a50e702011-10-18 21:21:00 +02007326#ifndef WC_ERR_INVALID_CHARS
7327# define WC_ERR_INVALID_CHARS 0x0080
7328#endif
7329
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007330static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007331code_page_name(UINT code_page, PyObject **obj)
7332{
7333 *obj = NULL;
7334 if (code_page == CP_ACP)
7335 return "mbcs";
7336 if (code_page == CP_UTF7)
7337 return "CP_UTF7";
7338 if (code_page == CP_UTF8)
7339 return "CP_UTF8";
7340
7341 *obj = PyBytes_FromFormat("cp%u", code_page);
7342 if (*obj == NULL)
7343 return NULL;
7344 return PyBytes_AS_STRING(*obj);
7345}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007346
Victor Stinner3a50e702011-10-18 21:21:00 +02007347static DWORD
7348decode_code_page_flags(UINT code_page)
7349{
7350 if (code_page == CP_UTF7) {
7351 /* The CP_UTF7 decoder only supports flags=0 */
7352 return 0;
7353 }
7354 else
7355 return MB_ERR_INVALID_CHARS;
7356}
7357
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007358/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007359 * Decode a byte string from a Windows code page into unicode object in strict
7360 * mode.
7361 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007362 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7363 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007364 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007365static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007366decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007367 wchar_t **buf,
7368 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007369 const char *in,
7370 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007372 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007373 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007375
7376 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007377 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007378 while ((outsize = MultiByteToWideChar(code_page, flags,
7379 in, insize, NULL, 0)) <= 0)
7380 {
7381 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7382 goto error;
7383 }
7384 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7385 flags = 0;
7386 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007387
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007388 /* Extend a wchar_t* buffer */
7389 Py_ssize_t n = *bufsize; /* Get the current length */
7390 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7391 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007392 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007393 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007394
7395 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7397 if (outsize <= 0)
7398 goto error;
7399 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007400
Victor Stinner3a50e702011-10-18 21:21:00 +02007401error:
7402 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7403 return -2;
7404 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007405 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007406}
7407
Victor Stinner3a50e702011-10-18 21:21:00 +02007408/*
7409 * Decode a byte string from a code page into unicode object with an error
7410 * handler.
7411 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007412 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 * UnicodeDecodeError exception and returns -1 on error.
7414 */
7415static int
7416decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007417 wchar_t **buf,
7418 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007419 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007420 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007421{
7422 const char *startin = in;
7423 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007424 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 /* Ideally, we should get reason from FormatMessage. This is the Windows
7426 2000 English version of the message. */
7427 const char *reason = "No mapping for the Unicode character exists "
7428 "in the target code page.";
7429 /* each step cannot decode more than 1 character, but a character can be
7430 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007431 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007432 int insize;
7433 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 PyObject *errorHandler = NULL;
7435 PyObject *exc = NULL;
7436 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007437 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 DWORD err;
7439 int ret = -1;
7440
7441 assert(size > 0);
7442
7443 encoding = code_page_name(code_page, &encoding_obj);
7444 if (encoding == NULL)
7445 return -1;
7446
Victor Stinner7d00cc12014-03-17 23:08:06 +01007447 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7449 UnicodeDecodeError. */
7450 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7451 if (exc != NULL) {
7452 PyCodec_StrictErrors(exc);
7453 Py_CLEAR(exc);
7454 }
7455 goto error;
7456 }
7457
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007458 /* Extend a wchar_t* buffer */
7459 Py_ssize_t n = *bufsize; /* Get the current length */
7460 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7461 PyErr_NoMemory();
7462 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007464 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7465 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007467 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007468
7469 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 while (in < endin)
7471 {
7472 /* Decode a character */
7473 insize = 1;
7474 do
7475 {
7476 outsize = MultiByteToWideChar(code_page, flags,
7477 in, insize,
7478 buffer, Py_ARRAY_LENGTH(buffer));
7479 if (outsize > 0)
7480 break;
7481 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007482 if (err == ERROR_INVALID_FLAGS && flags) {
7483 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7484 flags = 0;
7485 continue;
7486 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 if (err != ERROR_NO_UNICODE_TRANSLATION
7488 && err != ERROR_INSUFFICIENT_BUFFER)
7489 {
7490 PyErr_SetFromWindowsErr(0);
7491 goto error;
7492 }
7493 insize++;
7494 }
7495 /* 4=maximum length of a UTF-8 sequence */
7496 while (insize <= 4 && (in + insize) <= endin);
7497
7498 if (outsize <= 0) {
7499 Py_ssize_t startinpos, endinpos, outpos;
7500
Victor Stinner7d00cc12014-03-17 23:08:06 +01007501 /* last character in partial decode? */
7502 if (in + insize >= endin && !final)
7503 break;
7504
Victor Stinner3a50e702011-10-18 21:21:00 +02007505 startinpos = in - startin;
7506 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007507 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007508 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 errors, &errorHandler,
7510 encoding, reason,
7511 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007512 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 {
7514 goto error;
7515 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007516 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007517 }
7518 else {
7519 in += insize;
7520 memcpy(out, buffer, outsize * sizeof(wchar_t));
7521 out += outsize;
7522 }
7523 }
7524
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007525 /* Shrink the buffer */
7526 assert(out - *buf <= *bufsize);
7527 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007528 /* (in - startin) <= size and size is an int */
7529 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007530
7531error:
7532 Py_XDECREF(encoding_obj);
7533 Py_XDECREF(errorHandler);
7534 Py_XDECREF(exc);
7535 return ret;
7536}
7537
Victor Stinner3a50e702011-10-18 21:21:00 +02007538static PyObject *
7539decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007540 const char *s, Py_ssize_t size,
7541 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007542{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007543 wchar_t *buf = NULL;
7544 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007545 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007546
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 if (code_page < 0) {
7548 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7549 return NULL;
7550 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007551 if (size < 0) {
7552 PyErr_BadInternalCall();
7553 return NULL;
7554 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007555
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007556 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007557 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007558
Victor Stinner76a31a62011-11-04 00:05:13 +01007559 do
7560 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007561#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007562 if (size > DECODING_CHUNK_SIZE) {
7563 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007564 final = 0;
7565 done = 0;
7566 }
7567 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007568#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007569 {
7570 chunk_size = (int)size;
7571 final = (consumed == NULL);
7572 done = 1;
7573 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007574
Victor Stinner76a31a62011-11-04 00:05:13 +01007575 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007576 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007577 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007578 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007579 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007580
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007581 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007582 s, chunk_size);
7583 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007584 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007585 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007586 errors, final);
7587 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007588
7589 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007590 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007591 return NULL;
7592 }
7593
7594 if (consumed)
7595 *consumed += converted;
7596
7597 s += converted;
7598 size -= converted;
7599 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007600
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007601 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7602 PyMem_Free(buf);
7603 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007604}
7605
Alexander Belopolsky40018472011-02-26 01:02:56 +00007606PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007607PyUnicode_DecodeCodePageStateful(int code_page,
7608 const char *s,
7609 Py_ssize_t size,
7610 const char *errors,
7611 Py_ssize_t *consumed)
7612{
7613 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7614}
7615
7616PyObject *
7617PyUnicode_DecodeMBCSStateful(const char *s,
7618 Py_ssize_t size,
7619 const char *errors,
7620 Py_ssize_t *consumed)
7621{
7622 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7623}
7624
7625PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007626PyUnicode_DecodeMBCS(const char *s,
7627 Py_ssize_t size,
7628 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007629{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007630 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7631}
7632
Victor Stinner3a50e702011-10-18 21:21:00 +02007633static DWORD
7634encode_code_page_flags(UINT code_page, const char *errors)
7635{
7636 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007637 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007638 }
7639 else if (code_page == CP_UTF7) {
7640 /* CP_UTF7 only supports flags=0 */
7641 return 0;
7642 }
7643 else {
7644 if (errors != NULL && strcmp(errors, "replace") == 0)
7645 return 0;
7646 else
7647 return WC_NO_BEST_FIT_CHARS;
7648 }
7649}
7650
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007651/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 * Encode a Unicode string to a Windows code page into a byte string in strict
7653 * mode.
7654 *
7655 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007656 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007657 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007658static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007659encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007660 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007661 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007662{
Victor Stinner554f3f02010-06-16 23:33:54 +00007663 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007664 BOOL *pusedDefaultChar = &usedDefaultChar;
7665 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007666 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007667 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007668 const DWORD flags = encode_code_page_flags(code_page, NULL);
7669 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007670 /* Create a substring so that we can get the UTF-16 representation
7671 of just the slice under consideration. */
7672 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007673
Martin v. Löwis3d325192011-11-04 18:23:06 +01007674 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007675
Victor Stinner3a50e702011-10-18 21:21:00 +02007676 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007677 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007678 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007679 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007680
Victor Stinner2fc507f2011-11-04 20:06:39 +01007681 substring = PyUnicode_Substring(unicode, offset, offset+len);
7682 if (substring == NULL)
7683 return -1;
7684 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7685 if (p == NULL) {
7686 Py_DECREF(substring);
7687 return -1;
7688 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007689 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007690
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007691 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007692 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007693 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007694 NULL, 0,
7695 NULL, pusedDefaultChar);
7696 if (outsize <= 0)
7697 goto error;
7698 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007699 if (pusedDefaultChar && *pusedDefaultChar) {
7700 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007701 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007702 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007703
Victor Stinner3a50e702011-10-18 21:21:00 +02007704 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007706 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007707 if (*outbytes == NULL) {
7708 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007710 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007711 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007712 }
7713 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007715 const Py_ssize_t n = PyBytes_Size(*outbytes);
7716 if (outsize > PY_SSIZE_T_MAX - n) {
7717 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007718 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007720 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007721 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7722 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007723 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007724 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007725 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007726 }
7727
7728 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007729 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007730 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007731 out, outsize,
7732 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007733 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007734 if (outsize <= 0)
7735 goto error;
7736 if (pusedDefaultChar && *pusedDefaultChar)
7737 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007738 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007739
Victor Stinner3a50e702011-10-18 21:21:00 +02007740error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007741 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007742 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7743 return -2;
7744 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007745 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007746}
7747
Victor Stinner3a50e702011-10-18 21:21:00 +02007748/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007749 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007750 * error handler.
7751 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007752 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007753 * -1 on other error.
7754 */
7755static int
7756encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007757 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007758 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007759{
Victor Stinner3a50e702011-10-18 21:21:00 +02007760 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007761 Py_ssize_t pos = unicode_offset;
7762 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007763 /* Ideally, we should get reason from FormatMessage. This is the Windows
7764 2000 English version of the message. */
7765 const char *reason = "invalid character";
7766 /* 4=maximum length of a UTF-8 sequence */
7767 char buffer[4];
7768 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7769 Py_ssize_t outsize;
7770 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007771 PyObject *errorHandler = NULL;
7772 PyObject *exc = NULL;
7773 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007774 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007775 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007776 PyObject *rep;
7777 int ret = -1;
7778
7779 assert(insize > 0);
7780
7781 encoding = code_page_name(code_page, &encoding_obj);
7782 if (encoding == NULL)
7783 return -1;
7784
7785 if (errors == NULL || strcmp(errors, "strict") == 0) {
7786 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7787 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007788 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007789 if (exc != NULL) {
7790 PyCodec_StrictErrors(exc);
7791 Py_DECREF(exc);
7792 }
7793 Py_XDECREF(encoding_obj);
7794 return -1;
7795 }
7796
7797 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7798 pusedDefaultChar = &usedDefaultChar;
7799 else
7800 pusedDefaultChar = NULL;
7801
7802 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7803 PyErr_NoMemory();
7804 goto error;
7805 }
7806 outsize = insize * Py_ARRAY_LENGTH(buffer);
7807
7808 if (*outbytes == NULL) {
7809 /* Create string object */
7810 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7811 if (*outbytes == NULL)
7812 goto error;
7813 out = PyBytes_AS_STRING(*outbytes);
7814 }
7815 else {
7816 /* Extend string object */
7817 Py_ssize_t n = PyBytes_Size(*outbytes);
7818 if (n > PY_SSIZE_T_MAX - outsize) {
7819 PyErr_NoMemory();
7820 goto error;
7821 }
7822 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7823 goto error;
7824 out = PyBytes_AS_STRING(*outbytes) + n;
7825 }
7826
7827 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007828 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007829 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007830 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7831 wchar_t chars[2];
7832 int charsize;
7833 if (ch < 0x10000) {
7834 chars[0] = (wchar_t)ch;
7835 charsize = 1;
7836 }
7837 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007838 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7839 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007840 charsize = 2;
7841 }
7842
Victor Stinner3a50e702011-10-18 21:21:00 +02007843 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007844 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007845 buffer, Py_ARRAY_LENGTH(buffer),
7846 NULL, pusedDefaultChar);
7847 if (outsize > 0) {
7848 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7849 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007850 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007851 memcpy(out, buffer, outsize);
7852 out += outsize;
7853 continue;
7854 }
7855 }
7856 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7857 PyErr_SetFromWindowsErr(0);
7858 goto error;
7859 }
7860
Victor Stinner3a50e702011-10-18 21:21:00 +02007861 rep = unicode_encode_call_errorhandler(
7862 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007863 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007864 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007865 if (rep == NULL)
7866 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007867 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007868
7869 if (PyBytes_Check(rep)) {
7870 outsize = PyBytes_GET_SIZE(rep);
7871 if (outsize != 1) {
7872 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7873 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7874 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7875 Py_DECREF(rep);
7876 goto error;
7877 }
7878 out = PyBytes_AS_STRING(*outbytes) + offset;
7879 }
7880 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7881 out += outsize;
7882 }
7883 else {
7884 Py_ssize_t i;
7885 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007886 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007887
Benjamin Petersonbac79492012-01-14 13:34:47 -05007888 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007889 Py_DECREF(rep);
7890 goto error;
7891 }
7892
7893 outsize = PyUnicode_GET_LENGTH(rep);
7894 if (outsize != 1) {
7895 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7896 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7897 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7898 Py_DECREF(rep);
7899 goto error;
7900 }
7901 out = PyBytes_AS_STRING(*outbytes) + offset;
7902 }
7903 kind = PyUnicode_KIND(rep);
7904 data = PyUnicode_DATA(rep);
7905 for (i=0; i < outsize; i++) {
7906 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7907 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007908 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007909 encoding, unicode,
7910 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007911 "unable to encode error handler result to ASCII");
7912 Py_DECREF(rep);
7913 goto error;
7914 }
7915 *out = (unsigned char)ch;
7916 out++;
7917 }
7918 }
7919 Py_DECREF(rep);
7920 }
7921 /* write a NUL byte */
7922 *out = 0;
7923 outsize = out - PyBytes_AS_STRING(*outbytes);
7924 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7925 if (_PyBytes_Resize(outbytes, outsize) < 0)
7926 goto error;
7927 ret = 0;
7928
7929error:
7930 Py_XDECREF(encoding_obj);
7931 Py_XDECREF(errorHandler);
7932 Py_XDECREF(exc);
7933 return ret;
7934}
7935
Victor Stinner3a50e702011-10-18 21:21:00 +02007936static PyObject *
7937encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007938 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007939 const char *errors)
7940{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007941 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007942 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007943 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007944 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007945
Victor Stinner29dacf22015-01-26 16:41:32 +01007946 if (!PyUnicode_Check(unicode)) {
7947 PyErr_BadArgument();
7948 return NULL;
7949 }
7950
Benjamin Petersonbac79492012-01-14 13:34:47 -05007951 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007952 return NULL;
7953 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007954
Victor Stinner3a50e702011-10-18 21:21:00 +02007955 if (code_page < 0) {
7956 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7957 return NULL;
7958 }
7959
Martin v. Löwis3d325192011-11-04 18:23:06 +01007960 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007961 return PyBytes_FromStringAndSize(NULL, 0);
7962
Victor Stinner7581cef2011-11-03 22:32:33 +01007963 offset = 0;
7964 do
7965 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007966#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007967 if (len > DECODING_CHUNK_SIZE) {
7968 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007969 done = 0;
7970 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007971 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007972#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007973 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007974 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007975 done = 1;
7976 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007977
Victor Stinner76a31a62011-11-04 00:05:13 +01007978 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007979 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007980 errors);
7981 if (ret == -2)
7982 ret = encode_code_page_errors(code_page, &outbytes,
7983 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007984 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007985 if (ret < 0) {
7986 Py_XDECREF(outbytes);
7987 return NULL;
7988 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007989
Victor Stinner7581cef2011-11-03 22:32:33 +01007990 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007991 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007992 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007993
Victor Stinner3a50e702011-10-18 21:21:00 +02007994 return outbytes;
7995}
7996
7997PyObject *
7998PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7999 Py_ssize_t size,
8000 const char *errors)
8001{
Victor Stinner7581cef2011-11-03 22:32:33 +01008002 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008003 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008004 if (unicode == NULL)
8005 return NULL;
8006 res = encode_code_page(CP_ACP, unicode, errors);
8007 Py_DECREF(unicode);
8008 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008009}
8010
8011PyObject *
8012PyUnicode_EncodeCodePage(int code_page,
8013 PyObject *unicode,
8014 const char *errors)
8015{
Victor Stinner7581cef2011-11-03 22:32:33 +01008016 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008017}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008018
Alexander Belopolsky40018472011-02-26 01:02:56 +00008019PyObject *
8020PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008021{
Victor Stinner7581cef2011-11-03 22:32:33 +01008022 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008023}
8024
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008025#undef NEED_RETRY
8026
Steve Dowercc16be82016-09-08 10:35:16 -07008027#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008028
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029/* --- Character Mapping Codec -------------------------------------------- */
8030
Victor Stinnerfb161b12013-04-18 01:44:27 +02008031static int
8032charmap_decode_string(const char *s,
8033 Py_ssize_t size,
8034 PyObject *mapping,
8035 const char *errors,
8036 _PyUnicodeWriter *writer)
8037{
8038 const char *starts = s;
8039 const char *e;
8040 Py_ssize_t startinpos, endinpos;
8041 PyObject *errorHandler = NULL, *exc = NULL;
8042 Py_ssize_t maplen;
8043 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008044 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008045 Py_UCS4 x;
8046 unsigned char ch;
8047
8048 if (PyUnicode_READY(mapping) == -1)
8049 return -1;
8050
8051 maplen = PyUnicode_GET_LENGTH(mapping);
8052 mapdata = PyUnicode_DATA(mapping);
8053 mapkind = PyUnicode_KIND(mapping);
8054
8055 e = s + size;
8056
8057 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8058 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8059 * is disabled in encoding aliases, latin1 is preferred because
8060 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008061 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008062 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8063 Py_UCS4 maxchar = writer->maxchar;
8064
8065 assert (writer->kind == PyUnicode_1BYTE_KIND);
8066 while (s < e) {
8067 ch = *s;
8068 x = mapdata_ucs1[ch];
8069 if (x > maxchar) {
8070 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8071 goto onError;
8072 maxchar = writer->maxchar;
8073 outdata = (Py_UCS1 *)writer->data;
8074 }
8075 outdata[writer->pos] = x;
8076 writer->pos++;
8077 ++s;
8078 }
8079 return 0;
8080 }
8081
8082 while (s < e) {
8083 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8084 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008085 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008086 if (outkind == PyUnicode_1BYTE_KIND) {
8087 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8088 Py_UCS4 maxchar = writer->maxchar;
8089 while (s < e) {
8090 ch = *s;
8091 x = mapdata_ucs2[ch];
8092 if (x > maxchar)
8093 goto Error;
8094 outdata[writer->pos] = x;
8095 writer->pos++;
8096 ++s;
8097 }
8098 break;
8099 }
8100 else if (outkind == PyUnicode_2BYTE_KIND) {
8101 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8102 while (s < e) {
8103 ch = *s;
8104 x = mapdata_ucs2[ch];
8105 if (x == 0xFFFE)
8106 goto Error;
8107 outdata[writer->pos] = x;
8108 writer->pos++;
8109 ++s;
8110 }
8111 break;
8112 }
8113 }
8114 ch = *s;
8115
8116 if (ch < maplen)
8117 x = PyUnicode_READ(mapkind, mapdata, ch);
8118 else
8119 x = 0xfffe; /* invalid value */
8120Error:
8121 if (x == 0xfffe)
8122 {
8123 /* undefined mapping */
8124 startinpos = s-starts;
8125 endinpos = startinpos+1;
8126 if (unicode_decode_call_errorhandler_writer(
8127 errors, &errorHandler,
8128 "charmap", "character maps to <undefined>",
8129 &starts, &e, &startinpos, &endinpos, &exc, &s,
8130 writer)) {
8131 goto onError;
8132 }
8133 continue;
8134 }
8135
8136 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8137 goto onError;
8138 ++s;
8139 }
8140 Py_XDECREF(errorHandler);
8141 Py_XDECREF(exc);
8142 return 0;
8143
8144onError:
8145 Py_XDECREF(errorHandler);
8146 Py_XDECREF(exc);
8147 return -1;
8148}
8149
8150static int
8151charmap_decode_mapping(const char *s,
8152 Py_ssize_t size,
8153 PyObject *mapping,
8154 const char *errors,
8155 _PyUnicodeWriter *writer)
8156{
8157 const char *starts = s;
8158 const char *e;
8159 Py_ssize_t startinpos, endinpos;
8160 PyObject *errorHandler = NULL, *exc = NULL;
8161 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008162 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008163
8164 e = s + size;
8165
8166 while (s < e) {
8167 ch = *s;
8168
8169 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8170 key = PyLong_FromLong((long)ch);
8171 if (key == NULL)
8172 goto onError;
8173
8174 item = PyObject_GetItem(mapping, key);
8175 Py_DECREF(key);
8176 if (item == NULL) {
8177 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8178 /* No mapping found means: mapping is undefined. */
8179 PyErr_Clear();
8180 goto Undefined;
8181 } else
8182 goto onError;
8183 }
8184
8185 /* Apply mapping */
8186 if (item == Py_None)
8187 goto Undefined;
8188 if (PyLong_Check(item)) {
8189 long value = PyLong_AS_LONG(item);
8190 if (value == 0xFFFE)
8191 goto Undefined;
8192 if (value < 0 || value > MAX_UNICODE) {
8193 PyErr_Format(PyExc_TypeError,
8194 "character mapping must be in range(0x%lx)",
8195 (unsigned long)MAX_UNICODE + 1);
8196 goto onError;
8197 }
8198
8199 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8200 goto onError;
8201 }
8202 else if (PyUnicode_Check(item)) {
8203 if (PyUnicode_READY(item) == -1)
8204 goto onError;
8205 if (PyUnicode_GET_LENGTH(item) == 1) {
8206 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8207 if (value == 0xFFFE)
8208 goto Undefined;
8209 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8210 goto onError;
8211 }
8212 else {
8213 writer->overallocate = 1;
8214 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8215 goto onError;
8216 }
8217 }
8218 else {
8219 /* wrong return value */
8220 PyErr_SetString(PyExc_TypeError,
8221 "character mapping must return integer, None or str");
8222 goto onError;
8223 }
8224 Py_CLEAR(item);
8225 ++s;
8226 continue;
8227
8228Undefined:
8229 /* undefined mapping */
8230 Py_CLEAR(item);
8231 startinpos = s-starts;
8232 endinpos = startinpos+1;
8233 if (unicode_decode_call_errorhandler_writer(
8234 errors, &errorHandler,
8235 "charmap", "character maps to <undefined>",
8236 &starts, &e, &startinpos, &endinpos, &exc, &s,
8237 writer)) {
8238 goto onError;
8239 }
8240 }
8241 Py_XDECREF(errorHandler);
8242 Py_XDECREF(exc);
8243 return 0;
8244
8245onError:
8246 Py_XDECREF(item);
8247 Py_XDECREF(errorHandler);
8248 Py_XDECREF(exc);
8249 return -1;
8250}
8251
Alexander Belopolsky40018472011-02-26 01:02:56 +00008252PyObject *
8253PyUnicode_DecodeCharmap(const char *s,
8254 Py_ssize_t size,
8255 PyObject *mapping,
8256 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008258 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008259
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 /* Default to Latin-1 */
8261 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008265 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008266 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008267 writer.min_length = size;
8268 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008270
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008271 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008272 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8273 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008274 }
8275 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008276 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8277 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008279 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008280
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008282 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 return NULL;
8284}
8285
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008286/* Charmap encoding: the lookup table */
8287
Alexander Belopolsky40018472011-02-26 01:02:56 +00008288struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 PyObject_HEAD
8290 unsigned char level1[32];
8291 int count2, count3;
8292 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008293};
8294
8295static PyObject*
8296encoding_map_size(PyObject *obj, PyObject* args)
8297{
8298 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008299 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008301}
8302
8303static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008304 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 PyDoc_STR("Return the size (in bytes) of this object") },
8306 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307};
8308
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008310 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 "EncodingMap", /*tp_name*/
8312 sizeof(struct encoding_map), /*tp_basicsize*/
8313 0, /*tp_itemsize*/
8314 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008315 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008316 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 0, /*tp_getattr*/
8318 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008319 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 0, /*tp_repr*/
8321 0, /*tp_as_number*/
8322 0, /*tp_as_sequence*/
8323 0, /*tp_as_mapping*/
8324 0, /*tp_hash*/
8325 0, /*tp_call*/
8326 0, /*tp_str*/
8327 0, /*tp_getattro*/
8328 0, /*tp_setattro*/
8329 0, /*tp_as_buffer*/
8330 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8331 0, /*tp_doc*/
8332 0, /*tp_traverse*/
8333 0, /*tp_clear*/
8334 0, /*tp_richcompare*/
8335 0, /*tp_weaklistoffset*/
8336 0, /*tp_iter*/
8337 0, /*tp_iternext*/
8338 encoding_map_methods, /*tp_methods*/
8339 0, /*tp_members*/
8340 0, /*tp_getset*/
8341 0, /*tp_base*/
8342 0, /*tp_dict*/
8343 0, /*tp_descr_get*/
8344 0, /*tp_descr_set*/
8345 0, /*tp_dictoffset*/
8346 0, /*tp_init*/
8347 0, /*tp_alloc*/
8348 0, /*tp_new*/
8349 0, /*tp_free*/
8350 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008351};
8352
8353PyObject*
8354PyUnicode_BuildEncodingMap(PyObject* string)
8355{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008356 PyObject *result;
8357 struct encoding_map *mresult;
8358 int i;
8359 int need_dict = 0;
8360 unsigned char level1[32];
8361 unsigned char level2[512];
8362 unsigned char *mlevel1, *mlevel2, *mlevel3;
8363 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008365 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008366 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008368
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008369 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008370 PyErr_BadArgument();
8371 return NULL;
8372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373 kind = PyUnicode_KIND(string);
8374 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008375 length = PyUnicode_GET_LENGTH(string);
8376 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008377 memset(level1, 0xFF, sizeof level1);
8378 memset(level2, 0xFF, sizeof level2);
8379
8380 /* If there isn't a one-to-one mapping of NULL to \0,
8381 or if there are non-BMP characters, we need to use
8382 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008384 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008385 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008386 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 ch = PyUnicode_READ(kind, data, i);
8388 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008389 need_dict = 1;
8390 break;
8391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008392 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008393 /* unmapped character */
8394 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008395 l1 = ch >> 11;
8396 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008397 if (level1[l1] == 0xFF)
8398 level1[l1] = count2++;
8399 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008400 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008401 }
8402
8403 if (count2 >= 0xFF || count3 >= 0xFF)
8404 need_dict = 1;
8405
8406 if (need_dict) {
8407 PyObject *result = PyDict_New();
8408 PyObject *key, *value;
8409 if (!result)
8410 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008411 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008412 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008413 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008414 if (!key || !value)
8415 goto failed1;
8416 if (PyDict_SetItem(result, key, value) == -1)
8417 goto failed1;
8418 Py_DECREF(key);
8419 Py_DECREF(value);
8420 }
8421 return result;
8422 failed1:
8423 Py_XDECREF(key);
8424 Py_XDECREF(value);
8425 Py_DECREF(result);
8426 return NULL;
8427 }
8428
8429 /* Create a three-level trie */
8430 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8431 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008432 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008433 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008434 }
8435
8436 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008437 mresult = (struct encoding_map*)result;
8438 mresult->count2 = count2;
8439 mresult->count3 = count3;
8440 mlevel1 = mresult->level1;
8441 mlevel2 = mresult->level23;
8442 mlevel3 = mresult->level23 + 16*count2;
8443 memcpy(mlevel1, level1, 32);
8444 memset(mlevel2, 0xFF, 16*count2);
8445 memset(mlevel3, 0, 128*count3);
8446 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008447 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008448 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008449 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8450 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008451 /* unmapped character */
8452 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008453 o1 = ch>>11;
8454 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008455 i2 = 16*mlevel1[o1] + o2;
8456 if (mlevel2[i2] == 0xFF)
8457 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008458 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008459 i3 = 128*mlevel2[i2] + o3;
8460 mlevel3[i3] = i;
8461 }
8462 return result;
8463}
8464
8465static int
Victor Stinner22168992011-11-20 17:09:18 +01008466encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008467{
8468 struct encoding_map *map = (struct encoding_map*)mapping;
8469 int l1 = c>>11;
8470 int l2 = (c>>7) & 0xF;
8471 int l3 = c & 0x7F;
8472 int i;
8473
Victor Stinner22168992011-11-20 17:09:18 +01008474 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008476 if (c == 0)
8477 return 0;
8478 /* level 1*/
8479 i = map->level1[l1];
8480 if (i == 0xFF) {
8481 return -1;
8482 }
8483 /* level 2*/
8484 i = map->level23[16*i+l2];
8485 if (i == 0xFF) {
8486 return -1;
8487 }
8488 /* level 3 */
8489 i = map->level23[16*map->count2 + 128*i + l3];
8490 if (i == 0) {
8491 return -1;
8492 }
8493 return i;
8494}
8495
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008496/* Lookup the character ch in the mapping. If the character
8497 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008498 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008499static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008500charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501{
Christian Heimes217cfd12007-12-02 14:31:20 +00008502 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008503 PyObject *x;
8504
8505 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507 x = PyObject_GetItem(mapping, w);
8508 Py_DECREF(w);
8509 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8511 /* No mapping found means: mapping is undefined. */
8512 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008513 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 } else
8515 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008517 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008519 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 long value = PyLong_AS_LONG(x);
8521 if (value < 0 || value > 255) {
8522 PyErr_SetString(PyExc_TypeError,
8523 "character mapping must be in range(256)");
8524 Py_DECREF(x);
8525 return NULL;
8526 }
8527 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008529 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 /* wrong return value */
8533 PyErr_Format(PyExc_TypeError,
8534 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008535 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 Py_DECREF(x);
8537 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 }
8539}
8540
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008541static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008542charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008543{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008544 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8545 /* exponentially overallocate to minimize reallocations */
8546 if (requiredsize < 2*outsize)
8547 requiredsize = 2*outsize;
8548 if (_PyBytes_Resize(outobj, requiredsize))
8549 return -1;
8550 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008551}
8552
/* Outcome of trying to encode one character with charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008557 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 space is available. Return a new reference to the object that
8559 was put in the output buffer, or Py_None, if the mapping was undefined
8560 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008561 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008562static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008563charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008564 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008566 PyObject *rep;
8567 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008568 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008569
Andy Lesterdffe4c02020-03-04 07:15:20 -06008570 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008571 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008573 if (res == -1)
8574 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 if (outsize<requiredsize)
8576 if (charmapencode_resize(outobj, outpos, requiredsize))
8577 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008578 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 outstart[(*outpos)++] = (char)res;
8580 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008581 }
8582
8583 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008584 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008586 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 Py_DECREF(rep);
8588 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008589 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 if (PyLong_Check(rep)) {
8591 Py_ssize_t requiredsize = *outpos+1;
8592 if (outsize<requiredsize)
8593 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8594 Py_DECREF(rep);
8595 return enc_EXCEPTION;
8596 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008597 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008599 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008600 else {
8601 const char *repchars = PyBytes_AS_STRING(rep);
8602 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8603 Py_ssize_t requiredsize = *outpos+repsize;
8604 if (outsize<requiredsize)
8605 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8606 Py_DECREF(rep);
8607 return enc_EXCEPTION;
8608 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008609 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 memcpy(outstart + *outpos, repchars, repsize);
8611 *outpos += repsize;
8612 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008613 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008614 Py_DECREF(rep);
8615 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616}
8617
8618/* handle an error in PyUnicode_EncodeCharmap
8619 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008620static int
8621charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008622 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008624 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008625 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626{
8627 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008628 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008629 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008630 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008631 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008632 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008634 Py_ssize_t collstartpos = *inpos;
8635 Py_ssize_t collendpos = *inpos+1;
8636 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008637 const char *encoding = "charmap";
8638 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008639 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008640 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008641 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642
Benjamin Petersonbac79492012-01-14 13:34:47 -05008643 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008644 return -1;
8645 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646 /* find all unencodable characters */
8647 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008648 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008649 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008650 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008651 val = encoding_map_lookup(ch, mapping);
8652 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 break;
8654 ++collendpos;
8655 continue;
8656 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008657
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008658 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8659 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 if (rep==NULL)
8661 return -1;
8662 else if (rep!=Py_None) {
8663 Py_DECREF(rep);
8664 break;
8665 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008666 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 }
8669 /* cache callback name lookup
8670 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008671 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008672 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008673
8674 switch (*error_handler) {
8675 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008676 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008677 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008678
8679 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008680 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 x = charmapencode_output('?', mapping, res, respos);
8682 if (x==enc_EXCEPTION) {
8683 return -1;
8684 }
8685 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008686 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 return -1;
8688 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008689 }
8690 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008691 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008692 *inpos = collendpos;
8693 break;
Victor Stinner50149202015-09-22 00:26:54 +02008694
8695 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008696 /* generate replacement (temporarily (mis)uses p) */
8697 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 char buffer[2+29+1+1];
8699 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008700 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 for (cp = buffer; *cp; ++cp) {
8702 x = charmapencode_output(*cp, mapping, res, respos);
8703 if (x==enc_EXCEPTION)
8704 return -1;
8705 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008706 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008707 return -1;
8708 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008709 }
8710 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008711 *inpos = collendpos;
8712 break;
Victor Stinner50149202015-09-22 00:26:54 +02008713
Benjamin Peterson14339b62009-01-31 16:36:08 +00008714 default:
Victor Stinner50149202015-09-22 00:26:54 +02008715 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008716 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008718 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008720 if (PyBytes_Check(repunicode)) {
8721 /* Directly copy bytes result to output. */
8722 Py_ssize_t outsize = PyBytes_Size(*res);
8723 Py_ssize_t requiredsize;
8724 repsize = PyBytes_Size(repunicode);
8725 requiredsize = *respos + repsize;
8726 if (requiredsize > outsize)
8727 /* Make room for all additional bytes. */
8728 if (charmapencode_resize(res, respos, requiredsize)) {
8729 Py_DECREF(repunicode);
8730 return -1;
8731 }
8732 memcpy(PyBytes_AsString(*res) + *respos,
8733 PyBytes_AsString(repunicode), repsize);
8734 *respos += repsize;
8735 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008736 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008737 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008738 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008739 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008740 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008741 Py_DECREF(repunicode);
8742 return -1;
8743 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008744 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008745 data = PyUnicode_DATA(repunicode);
8746 kind = PyUnicode_KIND(repunicode);
8747 for (index = 0; index < repsize; index++) {
8748 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8749 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008751 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 return -1;
8753 }
8754 else if (x==enc_FAILED) {
8755 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008756 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008757 return -1;
8758 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008759 }
8760 *inpos = newpos;
8761 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008762 }
8763 return 0;
8764}
8765
Alexander Belopolsky40018472011-02-26 01:02:56 +00008766PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008767_PyUnicode_EncodeCharmap(PyObject *unicode,
8768 PyObject *mapping,
8769 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008771 /* output object */
8772 PyObject *res = NULL;
8773 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008774 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008775 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008776 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008777 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008778 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008779 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008780 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008781 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008782 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008783
Benjamin Petersonbac79492012-01-14 13:34:47 -05008784 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008785 return NULL;
8786 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008787 data = PyUnicode_DATA(unicode);
8788 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008789
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790 /* Default to Latin-1 */
8791 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008792 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008794 /* allocate enough for a simple encoding without
8795 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008796 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008797 if (res == NULL)
8798 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008799 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008802 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008803 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008805 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008806 if (x==enc_EXCEPTION) /* error */
8807 goto onError;
8808 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008809 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008810 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008811 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008812 &res, &respos)) {
8813 goto onError;
8814 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008815 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 else
8817 /* done with this character => adjust input position */
8818 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008820
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008821 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008822 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008823 if (_PyBytes_Resize(&res, respos) < 0)
8824 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008825
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008826 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008827 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008828 return res;
8829
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008831 Py_XDECREF(res);
8832 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008833 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008834 return NULL;
8835}
8836
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008837/* Deprecated */
8838PyObject *
8839PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8840 Py_ssize_t size,
8841 PyObject *mapping,
8842 const char *errors)
8843{
8844 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008845 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008846 if (unicode == NULL)
8847 return NULL;
8848 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8849 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008850 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008851}
8852
Alexander Belopolsky40018472011-02-26 01:02:56 +00008853PyObject *
8854PyUnicode_AsCharmapString(PyObject *unicode,
8855 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856{
8857 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008858 PyErr_BadArgument();
8859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008861 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862}
8863
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008864/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008865static void
8866make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008868 Py_ssize_t startpos, Py_ssize_t endpos,
8869 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008871 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 *exceptionObject = _PyUnicodeTranslateError_Create(
8873 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 }
8875 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8877 goto onError;
8878 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8879 goto onError;
8880 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8881 goto onError;
8882 return;
8883 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008884 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885 }
8886}
8887
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008888/* error handling callback helper:
8889 build arguments, call the callback and check the arguments,
8890 put the result into newpos and return the replacement string, which
8891 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008892static PyObject *
8893unicode_translate_call_errorhandler(const char *errors,
8894 PyObject **errorHandler,
8895 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008897 Py_ssize_t startpos, Py_ssize_t endpos,
8898 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008899{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008900 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008901
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008902 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008903 PyObject *restuple;
8904 PyObject *resunicode;
8905
8906 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008908 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008910 }
8911
8912 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008914 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008916
Petr Viktorinffd97532020-02-11 17:46:57 +01008917 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008918 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008919 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008920 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008921 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 Py_DECREF(restuple);
8923 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008924 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008925 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 &resunicode, &i_newpos)) {
8927 Py_DECREF(restuple);
8928 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008929 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008930 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008932 else
8933 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008935 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008936 Py_DECREF(restuple);
8937 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008938 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008939 Py_INCREF(resunicode);
8940 Py_DECREF(restuple);
8941 return resunicode;
8942}
8943
8944/* Lookup the character ch in the mapping and put the result in result,
8945 which must be decrefed by the caller.
8946 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008947static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008949{
Christian Heimes217cfd12007-12-02 14:31:20 +00008950 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008951 PyObject *x;
8952
8953 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008955 x = PyObject_GetItem(mapping, w);
8956 Py_DECREF(w);
8957 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008958 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8959 /* No mapping found means: use 1:1 mapping. */
8960 PyErr_Clear();
8961 *result = NULL;
8962 return 0;
8963 } else
8964 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008965 }
8966 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008967 *result = x;
8968 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008969 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008970 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008972 if (value < 0 || value > MAX_UNICODE) {
8973 PyErr_Format(PyExc_ValueError,
8974 "character mapping must be in range(0x%x)",
8975 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008976 Py_DECREF(x);
8977 return -1;
8978 }
8979 *result = x;
8980 return 0;
8981 }
8982 else if (PyUnicode_Check(x)) {
8983 *result = x;
8984 return 0;
8985 }
8986 else {
8987 /* wrong return value */
8988 PyErr_SetString(PyExc_TypeError,
8989 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008990 Py_DECREF(x);
8991 return -1;
8992 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008993}
Victor Stinner1194ea02014-04-04 19:37:40 +02008994
8995/* lookup the character, write the result into the writer.
8996 Return 1 if the result was written into the writer, return 0 if the mapping
8997 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008998static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008999charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9000 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009001{
Victor Stinner1194ea02014-04-04 19:37:40 +02009002 PyObject *item;
9003
9004 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009006
9007 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009008 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009009 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009011 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009012 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009013 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009014
9015 if (item == Py_None) {
9016 Py_DECREF(item);
9017 return 0;
9018 }
9019
9020 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009021 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9022 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9023 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009024 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9025 Py_DECREF(item);
9026 return -1;
9027 }
9028 Py_DECREF(item);
9029 return 1;
9030 }
9031
9032 if (!PyUnicode_Check(item)) {
9033 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009035 }
9036
9037 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9038 Py_DECREF(item);
9039 return -1;
9040 }
9041
9042 Py_DECREF(item);
9043 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009044}
9045
Victor Stinner89a76ab2014-04-05 11:44:04 +02009046static int
9047unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9048 Py_UCS1 *translate)
9049{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009050 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009051 int ret = 0;
9052
Victor Stinner89a76ab2014-04-05 11:44:04 +02009053 if (charmaptranslate_lookup(ch, mapping, &item)) {
9054 return -1;
9055 }
9056
9057 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009058 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009059 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009060 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009061 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009062 /* not found => default to 1:1 mapping */
9063 translate[ch] = ch;
9064 return 1;
9065 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009066 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009067 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009068 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9069 used it */
9070 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009071 /* invalid character or character outside ASCII:
9072 skip the fast translate */
9073 goto exit;
9074 }
9075 translate[ch] = (Py_UCS1)replace;
9076 }
9077 else if (PyUnicode_Check(item)) {
9078 Py_UCS4 replace;
9079
9080 if (PyUnicode_READY(item) == -1) {
9081 Py_DECREF(item);
9082 return -1;
9083 }
9084 if (PyUnicode_GET_LENGTH(item) != 1)
9085 goto exit;
9086
9087 replace = PyUnicode_READ_CHAR(item, 0);
9088 if (replace > 127)
9089 goto exit;
9090 translate[ch] = (Py_UCS1)replace;
9091 }
9092 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009093 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009094 goto exit;
9095 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009096 ret = 1;
9097
Benjamin Peterson1365de72014-04-07 20:15:41 -04009098 exit:
9099 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009100 return ret;
9101}
9102
9103/* Fast path for ascii => ascii translation. Return 1 if the whole string
9104 was translated into writer, return 0 if the input string was partially
9105 translated into writer, raise an exception and return -1 on error. */
9106static int
9107unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009108 _PyUnicodeWriter *writer, int ignore,
9109 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009110{
Victor Stinner872b2912014-04-05 14:27:07 +02009111 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009112 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009113 const Py_UCS1 *in, *end;
9114 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009115 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009116
Victor Stinner89a76ab2014-04-05 11:44:04 +02009117 len = PyUnicode_GET_LENGTH(input);
9118
Victor Stinner872b2912014-04-05 14:27:07 +02009119 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009120
9121 in = PyUnicode_1BYTE_DATA(input);
9122 end = in + len;
9123
9124 assert(PyUnicode_IS_ASCII(writer->buffer));
9125 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9126 out = PyUnicode_1BYTE_DATA(writer->buffer);
9127
Victor Stinner872b2912014-04-05 14:27:07 +02009128 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009129 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009130 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009131 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009132 int translate = unicode_fast_translate_lookup(mapping, ch,
9133 ascii_table);
9134 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009135 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009136 if (translate == 0)
9137 goto exit;
9138 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009139 }
Victor Stinner872b2912014-04-05 14:27:07 +02009140 if (ch2 == 0xfe) {
9141 if (ignore)
9142 continue;
9143 goto exit;
9144 }
9145 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009146 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009147 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009148 }
Victor Stinner872b2912014-04-05 14:27:07 +02009149 res = 1;
9150
9151exit:
9152 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009153 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009154 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009155}
9156
Victor Stinner3222da22015-10-01 22:07:32 +02009157static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158_PyUnicode_TranslateCharmap(PyObject *input,
9159 PyObject *mapping,
9160 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009162 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009163 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164 Py_ssize_t size, i;
9165 int kind;
9166 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009167 _PyUnicodeWriter writer;
9168 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009169 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009170 PyObject *errorHandler = NULL;
9171 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009172 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009173 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009174
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009176 PyErr_BadArgument();
9177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 if (PyUnicode_READY(input) == -1)
9181 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009182 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183 kind = PyUnicode_KIND(input);
9184 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009186 if (size == 0)
9187 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009189 /* allocate enough for a simple 1:1 translation without
9190 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009191 _PyUnicodeWriter_Init(&writer);
9192 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009194
Victor Stinner872b2912014-04-05 14:27:07 +02009195 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9196
Victor Stinner33798672016-03-01 21:59:58 +01009197 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009198 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009199 if (PyUnicode_IS_ASCII(input)) {
9200 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9201 if (res < 0) {
9202 _PyUnicodeWriter_Dealloc(&writer);
9203 return NULL;
9204 }
9205 if (res == 1)
9206 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009207 }
Victor Stinner33798672016-03-01 21:59:58 +01009208 else {
9209 i = 0;
9210 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009213 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009214 int translate;
9215 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9216 Py_ssize_t newpos;
9217 /* startpos for collecting untranslatable chars */
9218 Py_ssize_t collstart;
9219 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009220 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221
Victor Stinner1194ea02014-04-04 19:37:40 +02009222 ch = PyUnicode_READ(kind, data, i);
9223 translate = charmaptranslate_output(ch, mapping, &writer);
9224 if (translate < 0)
9225 goto onError;
9226
9227 if (translate != 0) {
9228 /* it worked => adjust input pointer */
9229 ++i;
9230 continue;
9231 }
9232
9233 /* untranslatable character */
9234 collstart = i;
9235 collend = i+1;
9236
9237 /* find all untranslatable characters */
9238 while (collend < size) {
9239 PyObject *x;
9240 ch = PyUnicode_READ(kind, data, collend);
9241 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009242 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009243 Py_XDECREF(x);
9244 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009245 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009246 ++collend;
9247 }
9248
9249 if (ignore) {
9250 i = collend;
9251 }
9252 else {
9253 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9254 reason, input, &exc,
9255 collstart, collend, &newpos);
9256 if (repunicode == NULL)
9257 goto onError;
9258 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009259 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009260 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009261 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009262 Py_DECREF(repunicode);
9263 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009264 }
9265 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009266 Py_XDECREF(exc);
9267 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009268 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009269
Benjamin Peterson29060642009-01-31 22:14:21 +00009270 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009271 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009272 Py_XDECREF(exc);
9273 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009274 return NULL;
9275}
9276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277/* Deprecated. Use PyUnicode_Translate instead. */
9278PyObject *
9279PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9280 Py_ssize_t size,
9281 PyObject *mapping,
9282 const char *errors)
9283{
Christian Heimes5f520f42012-09-11 14:03:25 +02009284 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009285 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 if (!unicode)
9287 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009288 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9289 Py_DECREF(unicode);
9290 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291}
9292
Alexander Belopolsky40018472011-02-26 01:02:56 +00009293PyObject *
9294PyUnicode_Translate(PyObject *str,
9295 PyObject *mapping,
9296 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009298 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009299 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009300 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009301}
Tim Petersced69f82003-09-16 20:30:58 +00009302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009303PyObject *
9304_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9305{
9306 if (!PyUnicode_Check(unicode)) {
9307 PyErr_BadInternalCall();
9308 return NULL;
9309 }
9310 if (PyUnicode_READY(unicode) == -1)
9311 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009312 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 /* If the string is already ASCII, just return the same string */
9314 Py_INCREF(unicode);
9315 return unicode;
9316 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009317
9318 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9319 PyObject *result = PyUnicode_New(len, 127);
9320 if (result == NULL) {
9321 return NULL;
9322 }
9323
9324 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9325 int kind = PyUnicode_KIND(unicode);
9326 const void *data = PyUnicode_DATA(unicode);
9327 Py_ssize_t i;
9328 for (i = 0; i < len; ++i) {
9329 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9330 if (ch < 127) {
9331 out[i] = ch;
9332 }
9333 else if (Py_UNICODE_ISSPACE(ch)) {
9334 out[i] = ' ';
9335 }
9336 else {
9337 int decimal = Py_UNICODE_TODECIMAL(ch);
9338 if (decimal < 0) {
9339 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009340 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009341 _PyUnicode_LENGTH(result) = i + 1;
9342 break;
9343 }
9344 out[i] = '0' + decimal;
9345 }
9346 }
9347
INADA Naoki16dfca42018-07-14 12:06:43 +09009348 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009349 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350}
9351
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009352PyObject *
9353PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9354 Py_ssize_t length)
9355{
Victor Stinnerf0124502011-11-21 23:12:56 +01009356 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009357 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009358 Py_UCS4 maxchar;
9359 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009360 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009361
Victor Stinner99d7ad02012-02-22 13:37:39 +01009362 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009363 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009364 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009365 if (ch > 127) {
9366 int decimal = Py_UNICODE_TODECIMAL(ch);
9367 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009368 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009369 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009370 }
9371 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009372
9373 /* Copy to a new string */
9374 decimal = PyUnicode_New(length, maxchar);
9375 if (decimal == NULL)
9376 return decimal;
9377 kind = PyUnicode_KIND(decimal);
9378 data = PyUnicode_DATA(decimal);
9379 /* Iterate over code points */
9380 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009381 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009382 if (ch > 127) {
9383 int decimal = Py_UNICODE_TODECIMAL(ch);
9384 if (decimal >= 0)
9385 ch = '0' + decimal;
9386 }
9387 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009389 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009390}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009391/* --- Decimal Encoder ---------------------------------------------------- */
9392
Alexander Belopolsky40018472011-02-26 01:02:56 +00009393int
9394PyUnicode_EncodeDecimal(Py_UNICODE *s,
9395 Py_ssize_t length,
9396 char *output,
9397 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009398{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009399 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009400 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009401 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009402 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009403
9404 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009405 PyErr_BadArgument();
9406 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009407 }
9408
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009409 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009410 if (unicode == NULL)
9411 return -1;
9412
Victor Stinner42bf7752011-11-21 22:52:58 +01009413 kind = PyUnicode_KIND(unicode);
9414 data = PyUnicode_DATA(unicode);
9415
Victor Stinnerb84d7232011-11-22 01:50:07 +01009416 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009417 PyObject *exc;
9418 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009419 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009420 Py_ssize_t startpos;
9421
9422 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009423
Benjamin Peterson29060642009-01-31 22:14:21 +00009424 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009425 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009426 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009427 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009428 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 decimal = Py_UNICODE_TODECIMAL(ch);
9430 if (decimal >= 0) {
9431 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009432 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009433 continue;
9434 }
9435 if (0 < ch && ch < 256) {
9436 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009437 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009438 continue;
9439 }
Victor Stinner6345be92011-11-25 20:09:01 +01009440
Victor Stinner42bf7752011-11-21 22:52:58 +01009441 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009442 exc = NULL;
9443 raise_encode_exception(&exc, "decimal", unicode,
9444 startpos, startpos+1,
9445 "invalid decimal Unicode string");
9446 Py_XDECREF(exc);
9447 Py_DECREF(unicode);
9448 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009449 }
9450 /* 0-terminate the output string */
9451 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009452 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009453 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009454}
9455
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456/* --- Helpers ------------------------------------------------------------ */
9457
/* Helper macro to fix up start/end slice values for a sequence of length
   len.  start and end must be modifiable lvalues; they are updated in
   place:
     - end greater than len is clipped to len;
     - a negative end or start is taken relative to len and then clipped
       to 0 if it is still negative.
   A start greater than len is left unchanged.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and can be used safely in an unbraced if/else.  The
   arguments are evaluated more than once. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009474any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009476 Py_ssize_t end,
9477 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009479 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009480 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 Py_ssize_t len1, len2, result;
9482
9483 kind1 = PyUnicode_KIND(s1);
9484 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009485 if (kind1 < kind2)
9486 return -1;
9487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 len1 = PyUnicode_GET_LENGTH(s1);
9489 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009490 ADJUST_INDICES(start, end, len1);
9491 if (end - start < len2)
9492 return -1;
9493
9494 buf1 = PyUnicode_DATA(s1);
9495 buf2 = PyUnicode_DATA(s2);
9496 if (len2 == 1) {
9497 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9498 result = findchar((const char *)buf1 + kind1*start,
9499 kind1, end - start, ch, direction);
9500 if (result == -1)
9501 return -1;
9502 else
9503 return start + result;
9504 }
9505
9506 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009507 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009508 if (!buf2)
9509 return -2;
9510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511
Victor Stinner794d5672011-10-10 03:21:36 +02009512 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009513 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009514 case PyUnicode_1BYTE_KIND:
9515 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9516 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9517 else
9518 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9519 break;
9520 case PyUnicode_2BYTE_KIND:
9521 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9522 break;
9523 case PyUnicode_4BYTE_KIND:
9524 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9525 break;
9526 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009527 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009528 }
9529 }
9530 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009531 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009532 case PyUnicode_1BYTE_KIND:
9533 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9534 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9535 else
9536 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9537 break;
9538 case PyUnicode_2BYTE_KIND:
9539 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9540 break;
9541 case PyUnicode_4BYTE_KIND:
9542 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9543 break;
9544 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009545 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547 }
9548
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009549 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009550 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009551 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552
9553 return result;
9554}
9555
Victor Stinner59423e32018-11-26 13:40:01 +01009556/* _PyUnicode_InsertThousandsGrouping() helper functions */
9557#include "stringlib/localeutil.h"
9558
9559/**
9560 * InsertThousandsGrouping:
9561 * @writer: Unicode writer.
9562 * @n_buffer: Number of characters in @buffer.
9563 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9564 * @d_pos: Start of digits string.
9565 * @n_digits: The number of digits in the string, in which we want
9566 * to put the grouping chars.
9567 * @min_width: The minimum width of the digits in the output string.
9568 * Output will be zero-padded on the left to fill.
9569 * @grouping: see definition in localeconv().
9570 * @thousands_sep: see definition in localeconv().
9571 *
9572 * There are 2 modes: counting and filling. If @writer is NULL,
9573 * we are in counting mode, else filling mode.
9574 * If counting, the required buffer size is returned.
9575 * If filling, we know the buffer will be large enough, so we don't
9576 * need to pass in the buffer size.
9577 * Inserts thousand grouping characters (as defined by grouping and
9578 * thousands_sep) into @writer.
9579 *
9580 * Return value: -1 on error, number of characters otherwise.
9581 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009583_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009584 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009585 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009586 PyObject *digits,
9587 Py_ssize_t d_pos,
9588 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009589 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009590 const char *grouping,
9591 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009592 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593{
Xtreak3f7983a2019-01-07 20:39:14 +05309594 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009595 if (writer) {
9596 assert(digits != NULL);
9597 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009598 }
9599 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009600 assert(digits == NULL);
9601 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009602 }
Victor Stinner59423e32018-11-26 13:40:01 +01009603 assert(0 <= d_pos);
9604 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009605 assert(grouping != NULL);
9606
9607 if (digits != NULL) {
9608 if (PyUnicode_READY(digits) == -1) {
9609 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009610 }
Victor Stinner59423e32018-11-26 13:40:01 +01009611 }
9612 if (PyUnicode_READY(thousands_sep) == -1) {
9613 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009614 }
9615
Victor Stinner59423e32018-11-26 13:40:01 +01009616 Py_ssize_t count = 0;
9617 Py_ssize_t n_zeros;
9618 int loop_broken = 0;
9619 int use_separator = 0; /* First time through, don't append the
9620 separator. They only go between
9621 groups. */
9622 Py_ssize_t buffer_pos;
9623 Py_ssize_t digits_pos;
9624 Py_ssize_t len;
9625 Py_ssize_t n_chars;
9626 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9627 be looked at */
9628 /* A generator that returns all of the grouping widths, until it
9629 returns 0. */
9630 GroupGenerator groupgen;
9631 GroupGenerator_init(&groupgen, grouping);
9632 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9633
9634 /* if digits are not grouped, thousands separator
9635 should be an empty string */
9636 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9637
9638 digits_pos = d_pos + n_digits;
9639 if (writer) {
9640 buffer_pos = writer->pos + n_buffer;
9641 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9642 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009643 }
Victor Stinner59423e32018-11-26 13:40:01 +01009644 else {
9645 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009646 }
Victor Stinner59423e32018-11-26 13:40:01 +01009647
9648 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009649 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009650 }
Victor Stinner59423e32018-11-26 13:40:01 +01009651
9652 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9653 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9654 n_zeros = Py_MAX(0, len - remaining);
9655 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9656
9657 /* Use n_zero zero's and n_chars chars */
9658
9659 /* Count only, don't do anything. */
9660 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9661
9662 /* Copy into the writer. */
9663 InsertThousandsGrouping_fill(writer, &buffer_pos,
9664 digits, &digits_pos,
9665 n_chars, n_zeros,
9666 use_separator ? thousands_sep : NULL,
9667 thousands_sep_len, maxchar);
9668
9669 /* Use a separator next time. */
9670 use_separator = 1;
9671
9672 remaining -= n_chars;
9673 min_width -= len;
9674
9675 if (remaining <= 0 && min_width <= 0) {
9676 loop_broken = 1;
9677 break;
9678 }
9679 min_width -= thousands_sep_len;
9680 }
9681 if (!loop_broken) {
9682 /* We left the loop without using a break statement. */
9683
9684 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9685 n_zeros = Py_MAX(0, len - remaining);
9686 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9687
9688 /* Use n_zero zero's and n_chars chars */
9689 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9690
9691 /* Copy into the writer. */
9692 InsertThousandsGrouping_fill(writer, &buffer_pos,
9693 digits, &digits_pos,
9694 n_chars, n_zeros,
9695 use_separator ? thousands_sep : NULL,
9696 thousands_sep_len, maxchar);
9697 }
9698 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699}
9700
9701
Alexander Belopolsky40018472011-02-26 01:02:56 +00009702Py_ssize_t
9703PyUnicode_Count(PyObject *str,
9704 PyObject *substr,
9705 Py_ssize_t start,
9706 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009707{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009708 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009709 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009710 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009712
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009713 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009714 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009715
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009716 kind1 = PyUnicode_KIND(str);
9717 kind2 = PyUnicode_KIND(substr);
9718 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009719 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009720
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009721 len1 = PyUnicode_GET_LENGTH(str);
9722 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009724 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009725 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009726
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009727 buf1 = PyUnicode_DATA(str);
9728 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009729 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009730 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009731 if (!buf2)
9732 goto onError;
9733 }
9734
9735 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009737 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009738 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009739 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009740 buf2, len2, PY_SSIZE_T_MAX
9741 );
9742 else
9743 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009744 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009745 buf2, len2, PY_SSIZE_T_MAX
9746 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747 break;
9748 case PyUnicode_2BYTE_KIND:
9749 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009750 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751 buf2, len2, PY_SSIZE_T_MAX
9752 );
9753 break;
9754 case PyUnicode_4BYTE_KIND:
9755 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009756 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009757 buf2, len2, PY_SSIZE_T_MAX
9758 );
9759 break;
9760 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009761 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009762 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009763
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009764 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009765 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009766 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009770 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9771 if (kind2 != kind1)
9772 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774}
9775
Alexander Belopolsky40018472011-02-26 01:02:56 +00009776Py_ssize_t
9777PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009778 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009779 Py_ssize_t start,
9780 Py_ssize_t end,
9781 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009782{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009783 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009784 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009785
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009786 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787}
9788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789Py_ssize_t
9790PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9791 Py_ssize_t start, Py_ssize_t end,
9792 int direction)
9793{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009795 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 if (PyUnicode_READY(str) == -1)
9797 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009798 len = PyUnicode_GET_LENGTH(str);
9799 ADJUST_INDICES(start, end, len);
9800 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009801 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009803 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9804 kind, end-start, ch, direction);
9805 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009806 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009807 else
9808 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809}
9810
Alexander Belopolsky40018472011-02-26 01:02:56 +00009811static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009812tailmatch(PyObject *self,
9813 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009814 Py_ssize_t start,
9815 Py_ssize_t end,
9816 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 int kind_self;
9819 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009820 const void *data_self;
9821 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 Py_ssize_t offset;
9823 Py_ssize_t i;
9824 Py_ssize_t end_sub;
9825
9826 if (PyUnicode_READY(self) == -1 ||
9827 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009828 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9831 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009833 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009835 if (PyUnicode_GET_LENGTH(substring) == 0)
9836 return 1;
9837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 kind_self = PyUnicode_KIND(self);
9839 data_self = PyUnicode_DATA(self);
9840 kind_sub = PyUnicode_KIND(substring);
9841 data_sub = PyUnicode_DATA(substring);
9842 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9843
9844 if (direction > 0)
9845 offset = end;
9846 else
9847 offset = start;
9848
9849 if (PyUnicode_READ(kind_self, data_self, offset) ==
9850 PyUnicode_READ(kind_sub, data_sub, 0) &&
9851 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9852 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9853 /* If both are of the same kind, memcmp is sufficient */
9854 if (kind_self == kind_sub) {
9855 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009856 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 data_sub,
9858 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009859 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009860 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009861 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 else {
9863 /* We do not need to compare 0 and len(substring)-1 because
9864 the if statement above ensured already that they are equal
9865 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 for (i = 1; i < end_sub; ++i) {
9867 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9868 PyUnicode_READ(kind_sub, data_sub, i))
9869 return 0;
9870 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009871 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873 }
9874
9875 return 0;
9876}
9877
Alexander Belopolsky40018472011-02-26 01:02:56 +00009878Py_ssize_t
9879PyUnicode_Tailmatch(PyObject *str,
9880 PyObject *substr,
9881 Py_ssize_t start,
9882 Py_ssize_t end,
9883 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009885 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009886 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009887
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009888 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889}
9890
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009891static PyObject *
9892ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009894 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009895 const char *data = PyUnicode_DATA(self);
9896 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009897 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009898
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009899 res = PyUnicode_New(len, 127);
9900 if (res == NULL)
9901 return NULL;
9902 resdata = PyUnicode_DATA(res);
9903 if (lower)
9904 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009906 _Py_bytes_upper(resdata, data, len);
9907 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009908}
9909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009911handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009913 Py_ssize_t j;
9914 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009915 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009916 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009917
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009918 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9919
9920 where ! is a negation and \p{xxx} is a character with property xxx.
9921 */
9922 for (j = i - 1; j >= 0; j--) {
9923 c = PyUnicode_READ(kind, data, j);
9924 if (!_PyUnicode_IsCaseIgnorable(c))
9925 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009927 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9928 if (final_sigma) {
9929 for (j = i + 1; j < length; j++) {
9930 c = PyUnicode_READ(kind, data, j);
9931 if (!_PyUnicode_IsCaseIgnorable(c))
9932 break;
9933 }
9934 final_sigma = j == length || !_PyUnicode_IsCased(c);
9935 }
9936 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937}
9938
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009939static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009940lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009941 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009943 /* Obscure special case. */
9944 if (c == 0x3A3) {
9945 mapped[0] = handle_capital_sigma(kind, data, length, i);
9946 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009948 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949}
9950
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009951static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009952do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009954 Py_ssize_t i, k = 0;
9955 int n_res, j;
9956 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009957
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009958 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009959 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009960 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009961 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009962 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009963 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009964 for (i = 1; i < length; i++) {
9965 c = PyUnicode_READ(kind, data, i);
9966 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9967 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009968 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009969 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009970 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009971 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009972 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009973}
9974
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009975static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009976do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009977 Py_ssize_t i, k = 0;
9978
9979 for (i = 0; i < length; i++) {
9980 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9981 int n_res, j;
9982 if (Py_UNICODE_ISUPPER(c)) {
9983 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9984 }
9985 else if (Py_UNICODE_ISLOWER(c)) {
9986 n_res = _PyUnicode_ToUpperFull(c, mapped);
9987 }
9988 else {
9989 n_res = 1;
9990 mapped[0] = c;
9991 }
9992 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009993 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009994 res[k++] = mapped[j];
9995 }
9996 }
9997 return k;
9998}
9999
10000static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010001do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010002 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010004 Py_ssize_t i, k = 0;
10005
10006 for (i = 0; i < length; i++) {
10007 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10008 int n_res, j;
10009 if (lower)
10010 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10011 else
10012 n_res = _PyUnicode_ToUpperFull(c, mapped);
10013 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010014 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010015 res[k++] = mapped[j];
10016 }
10017 }
10018 return k;
10019}
10020
10021static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010022do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010023{
10024 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10025}
10026
10027static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010028do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010029{
10030 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10031}
10032
Benjamin Petersone51757f2012-01-12 21:10:29 -050010033static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010034do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010035{
10036 Py_ssize_t i, k = 0;
10037
10038 for (i = 0; i < length; i++) {
10039 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10040 Py_UCS4 mapped[3];
10041 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10042 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010043 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010044 res[k++] = mapped[j];
10045 }
10046 }
10047 return k;
10048}
10049
10050static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010051do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010052{
10053 Py_ssize_t i, k = 0;
10054 int previous_is_cased;
10055
10056 previous_is_cased = 0;
10057 for (i = 0; i < length; i++) {
10058 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10059 Py_UCS4 mapped[3];
10060 int n_res, j;
10061
10062 if (previous_is_cased)
10063 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10064 else
10065 n_res = _PyUnicode_ToTitleFull(c, mapped);
10066
10067 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010068 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010069 res[k++] = mapped[j];
10070 }
10071
10072 previous_is_cased = _PyUnicode_IsCased(c);
10073 }
10074 return k;
10075}
10076
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010077static PyObject *
10078case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010079 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010080{
10081 PyObject *res = NULL;
10082 Py_ssize_t length, newlength = 0;
10083 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010084 const void *data;
10085 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010086 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10087
Benjamin Petersoneea48462012-01-16 14:28:50 -050010088 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010089
10090 kind = PyUnicode_KIND(self);
10091 data = PyUnicode_DATA(self);
10092 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010093 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010094 PyErr_SetString(PyExc_OverflowError, "string is too long");
10095 return NULL;
10096 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010097 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010098 if (tmp == NULL)
10099 return PyErr_NoMemory();
10100 newlength = perform(kind, data, length, tmp, &maxchar);
10101 res = PyUnicode_New(newlength, maxchar);
10102 if (res == NULL)
10103 goto leave;
10104 tmpend = tmp + newlength;
10105 outdata = PyUnicode_DATA(res);
10106 outkind = PyUnicode_KIND(res);
10107 switch (outkind) {
10108 case PyUnicode_1BYTE_KIND:
10109 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10110 break;
10111 case PyUnicode_2BYTE_KIND:
10112 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10113 break;
10114 case PyUnicode_4BYTE_KIND:
10115 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10116 break;
10117 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010118 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010119 }
10120 leave:
10121 PyMem_FREE(tmp);
10122 return res;
10123}
10124
Tim Peters8ce9f162004-08-27 01:49:32 +000010125PyObject *
10126PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010128 PyObject *res;
10129 PyObject *fseq;
10130 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010131 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010133 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010134 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010135 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010136 }
10137
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010138 /* NOTE: the following code can't call back into Python code,
10139 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010140 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010141
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010142 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010143 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010144 res = _PyUnicode_JoinArray(separator, items, seqlen);
10145 Py_DECREF(fseq);
10146 return res;
10147}
10148
10149PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010150_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010151{
10152 PyObject *res = NULL; /* the result */
10153 PyObject *sep = NULL;
10154 Py_ssize_t seplen;
10155 PyObject *item;
10156 Py_ssize_t sz, i, res_offset;
10157 Py_UCS4 maxchar;
10158 Py_UCS4 item_maxchar;
10159 int use_memcpy;
10160 unsigned char *res_data = NULL, *sep_data = NULL;
10161 PyObject *last_obj;
10162 unsigned int kind = 0;
10163
Tim Peters05eba1f2004-08-27 21:32:02 +000010164 /* If empty sequence, return u"". */
10165 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010166 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010167 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010168
Tim Peters05eba1f2004-08-27 21:32:02 +000010169 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010170 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010171 if (seqlen == 1) {
10172 if (PyUnicode_CheckExact(items[0])) {
10173 res = items[0];
10174 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010175 return res;
10176 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010177 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010178 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010179 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010180 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010181 /* Set up sep and seplen */
10182 if (separator == NULL) {
10183 /* fall back to a blank space separator */
10184 sep = PyUnicode_FromOrdinal(' ');
10185 if (!sep)
10186 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010187 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010188 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010189 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010190 else {
10191 if (!PyUnicode_Check(separator)) {
10192 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010193 "separator: expected str instance,"
10194 " %.80s found",
10195 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010196 goto onError;
10197 }
10198 if (PyUnicode_READY(separator))
10199 goto onError;
10200 sep = separator;
10201 seplen = PyUnicode_GET_LENGTH(separator);
10202 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10203 /* inc refcount to keep this code path symmetric with the
10204 above case of a blank separator */
10205 Py_INCREF(sep);
10206 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010207 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010208 }
10209
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010210 /* There are at least two things to join, or else we have a subclass
10211 * of str in the sequence.
10212 * Do a pre-pass to figure out the total amount of space we'll
10213 * need (sz), and see whether all argument are strings.
10214 */
10215 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010216#ifdef Py_DEBUG
10217 use_memcpy = 0;
10218#else
10219 use_memcpy = 1;
10220#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010221 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010222 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010223 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010224 if (!PyUnicode_Check(item)) {
10225 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010226 "sequence item %zd: expected str instance,"
10227 " %.80s found",
10228 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010229 goto onError;
10230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 if (PyUnicode_READY(item) == -1)
10232 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010233 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010235 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010236 if (i != 0) {
10237 add_sz += seplen;
10238 }
10239 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010240 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010241 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010242 goto onError;
10243 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010244 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010245 if (use_memcpy && last_obj != NULL) {
10246 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10247 use_memcpy = 0;
10248 }
10249 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010250 }
Tim Petersced69f82003-09-16 20:30:58 +000010251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010253 if (res == NULL)
10254 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010255
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010256 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010257#ifdef Py_DEBUG
10258 use_memcpy = 0;
10259#else
10260 if (use_memcpy) {
10261 res_data = PyUnicode_1BYTE_DATA(res);
10262 kind = PyUnicode_KIND(res);
10263 if (seplen != 0)
10264 sep_data = PyUnicode_1BYTE_DATA(sep);
10265 }
10266#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010267 if (use_memcpy) {
10268 for (i = 0; i < seqlen; ++i) {
10269 Py_ssize_t itemlen;
10270 item = items[i];
10271
10272 /* Copy item, and maybe the separator. */
10273 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010274 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010275 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010276 kind * seplen);
10277 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010278 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010279
10280 itemlen = PyUnicode_GET_LENGTH(item);
10281 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010282 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010283 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010284 kind * itemlen);
10285 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010286 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010287 }
10288 assert(res_data == PyUnicode_1BYTE_DATA(res)
10289 + kind * PyUnicode_GET_LENGTH(res));
10290 }
10291 else {
10292 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10293 Py_ssize_t itemlen;
10294 item = items[i];
10295
10296 /* Copy item, and maybe the separator. */
10297 if (i && seplen != 0) {
10298 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10299 res_offset += seplen;
10300 }
10301
10302 itemlen = PyUnicode_GET_LENGTH(item);
10303 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010304 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010305 res_offset += itemlen;
10306 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010307 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010308 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010309 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010312 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314
Benjamin Peterson29060642009-01-31 22:14:21 +000010315 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010317 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318 return NULL;
10319}
10320
Victor Stinnerd3f08822012-05-29 12:57:52 +020010321void
10322_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10323 Py_UCS4 fill_char)
10324{
10325 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010326 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010327 assert(PyUnicode_IS_READY(unicode));
10328 assert(unicode_modifiable(unicode));
10329 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10330 assert(start >= 0);
10331 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010332 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010333}
10334
Victor Stinner3fe55312012-01-04 00:33:50 +010010335Py_ssize_t
10336PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10337 Py_UCS4 fill_char)
10338{
10339 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010340
10341 if (!PyUnicode_Check(unicode)) {
10342 PyErr_BadInternalCall();
10343 return -1;
10344 }
10345 if (PyUnicode_READY(unicode) == -1)
10346 return -1;
10347 if (unicode_check_modifiable(unicode))
10348 return -1;
10349
Victor Stinnerd3f08822012-05-29 12:57:52 +020010350 if (start < 0) {
10351 PyErr_SetString(PyExc_IndexError, "string index out of range");
10352 return -1;
10353 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010354 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10355 PyErr_SetString(PyExc_ValueError,
10356 "fill character is bigger than "
10357 "the string maximum character");
10358 return -1;
10359 }
10360
10361 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10362 length = Py_MIN(maxlen, length);
10363 if (length <= 0)
10364 return 0;
10365
Victor Stinnerd3f08822012-05-29 12:57:52 +020010366 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010367 return length;
10368}
10369
Victor Stinner9310abb2011-10-05 00:59:23 +020010370static PyObject *
10371pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010372 Py_ssize_t left,
10373 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 PyObject *u;
10377 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010378 int kind;
10379 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380
10381 if (left < 0)
10382 left = 0;
10383 if (right < 0)
10384 right = 0;
10385
Victor Stinnerc4b49542011-12-11 22:44:26 +010010386 if (left == 0 && right == 0)
10387 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10390 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010391 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10392 return NULL;
10393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010395 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010397 if (!u)
10398 return NULL;
10399
10400 kind = PyUnicode_KIND(u);
10401 data = PyUnicode_DATA(u);
10402 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010403 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010404 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010405 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010406 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010407 assert(_PyUnicode_CheckConsistency(u, 1));
10408 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010409}
10410
Alexander Belopolsky40018472011-02-26 01:02:56 +000010411PyObject *
10412PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010413{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010416 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010417 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418
Benjamin Petersonead6b532011-12-20 17:23:42 -060010419 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010421 if (PyUnicode_IS_ASCII(string))
10422 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010423 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010424 PyUnicode_GET_LENGTH(string), keepends);
10425 else
10426 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010427 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010428 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 break;
10430 case PyUnicode_2BYTE_KIND:
10431 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010432 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 PyUnicode_GET_LENGTH(string), keepends);
10434 break;
10435 case PyUnicode_4BYTE_KIND:
10436 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010437 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 PyUnicode_GET_LENGTH(string), keepends);
10439 break;
10440 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010441 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444}
10445
Alexander Belopolsky40018472011-02-26 01:02:56 +000010446static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010447split(PyObject *self,
10448 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010449 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010450{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010451 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010452 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 Py_ssize_t len1, len2;
10454 PyObject* out;
10455
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010457 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 if (PyUnicode_READY(self) == -1)
10460 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010463 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010465 if (PyUnicode_IS_ASCII(self))
10466 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010467 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010468 PyUnicode_GET_LENGTH(self), maxcount
10469 );
10470 else
10471 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010472 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010473 PyUnicode_GET_LENGTH(self), maxcount
10474 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 case PyUnicode_2BYTE_KIND:
10476 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010477 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 PyUnicode_GET_LENGTH(self), maxcount
10479 );
10480 case PyUnicode_4BYTE_KIND:
10481 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010482 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 PyUnicode_GET_LENGTH(self), maxcount
10484 );
10485 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010486 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 }
10488
10489 if (PyUnicode_READY(substring) == -1)
10490 return NULL;
10491
10492 kind1 = PyUnicode_KIND(self);
10493 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 len1 = PyUnicode_GET_LENGTH(self);
10495 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010496 if (kind1 < kind2 || len1 < len2) {
10497 out = PyList_New(1);
10498 if (out == NULL)
10499 return NULL;
10500 Py_INCREF(self);
10501 PyList_SET_ITEM(out, 0, self);
10502 return out;
10503 }
10504 buf1 = PyUnicode_DATA(self);
10505 buf2 = PyUnicode_DATA(substring);
10506 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010507 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010508 if (!buf2)
10509 return NULL;
10510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010512 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010514 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10515 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010516 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010517 else
10518 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010519 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 break;
10521 case PyUnicode_2BYTE_KIND:
10522 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010523 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 break;
10525 case PyUnicode_4BYTE_KIND:
10526 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010527 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 break;
10529 default:
10530 out = NULL;
10531 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010532 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010533 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010534 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536}
10537
Alexander Belopolsky40018472011-02-26 01:02:56 +000010538static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010539rsplit(PyObject *self,
10540 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010541 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010542{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010543 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010544 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 Py_ssize_t len1, len2;
10546 PyObject* out;
10547
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010548 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010549 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 if (PyUnicode_READY(self) == -1)
10552 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010555 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010557 if (PyUnicode_IS_ASCII(self))
10558 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010559 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010560 PyUnicode_GET_LENGTH(self), maxcount
10561 );
10562 else
10563 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010564 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010565 PyUnicode_GET_LENGTH(self), maxcount
10566 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 case PyUnicode_2BYTE_KIND:
10568 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010569 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 PyUnicode_GET_LENGTH(self), maxcount
10571 );
10572 case PyUnicode_4BYTE_KIND:
10573 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010574 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 PyUnicode_GET_LENGTH(self), maxcount
10576 );
10577 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010578 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 }
10580
10581 if (PyUnicode_READY(substring) == -1)
10582 return NULL;
10583
10584 kind1 = PyUnicode_KIND(self);
10585 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 len1 = PyUnicode_GET_LENGTH(self);
10587 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010588 if (kind1 < kind2 || len1 < len2) {
10589 out = PyList_New(1);
10590 if (out == NULL)
10591 return NULL;
10592 Py_INCREF(self);
10593 PyList_SET_ITEM(out, 0, self);
10594 return out;
10595 }
10596 buf1 = PyUnicode_DATA(self);
10597 buf2 = PyUnicode_DATA(substring);
10598 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010599 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010600 if (!buf2)
10601 return NULL;
10602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010604 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010606 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10607 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010608 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010609 else
10610 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010611 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 break;
10613 case PyUnicode_2BYTE_KIND:
10614 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010615 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 break;
10617 case PyUnicode_4BYTE_KIND:
10618 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010619 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 break;
10621 default:
10622 out = NULL;
10623 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010624 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010625 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010626 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 return out;
10628}
10629
10630static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010631anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10632 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010634 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010636 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10637 return asciilib_find(buf1, len1, buf2, len2, offset);
10638 else
10639 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 case PyUnicode_2BYTE_KIND:
10641 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10642 case PyUnicode_4BYTE_KIND:
10643 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10644 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010645 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646}
10647
10648static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010649anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10650 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010652 switch (kind) {
10653 case PyUnicode_1BYTE_KIND:
10654 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10655 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10656 else
10657 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10658 case PyUnicode_2BYTE_KIND:
10659 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10660 case PyUnicode_4BYTE_KIND:
10661 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10662 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010663 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010664}
10665
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010666static void
10667replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10668 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10669{
10670 int kind = PyUnicode_KIND(u);
10671 void *data = PyUnicode_DATA(u);
10672 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10673 if (kind == PyUnicode_1BYTE_KIND) {
10674 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10675 (Py_UCS1 *)data + len,
10676 u1, u2, maxcount);
10677 }
10678 else if (kind == PyUnicode_2BYTE_KIND) {
10679 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10680 (Py_UCS2 *)data + len,
10681 u1, u2, maxcount);
10682 }
10683 else {
10684 assert(kind == PyUnicode_4BYTE_KIND);
10685 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10686 (Py_UCS4 *)data + len,
10687 u1, u2, maxcount);
10688 }
10689}
10690
Alexander Belopolsky40018472011-02-26 01:02:56 +000010691static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692replace(PyObject *self, PyObject *str1,
10693 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010696 const char *sbuf = PyUnicode_DATA(self);
10697 const void *buf1 = PyUnicode_DATA(str1);
10698 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 int srelease = 0, release1 = 0, release2 = 0;
10700 int skind = PyUnicode_KIND(self);
10701 int kind1 = PyUnicode_KIND(str1);
10702 int kind2 = PyUnicode_KIND(str2);
10703 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10704 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10705 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010706 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010707 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010709 if (slen < len1)
10710 goto nothing;
10711
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010713 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010714 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010715 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716
Victor Stinner59de0ee2011-10-07 10:01:28 +020010717 if (str1 == str2)
10718 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719
Victor Stinner49a0a212011-10-12 23:46:10 +020010720 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010721 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10722 if (maxchar < maxchar_str1)
10723 /* substring too wide to be present */
10724 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010725 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10726 /* Replacing str1 with str2 may cause a maxchar reduction in the
10727 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010728 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010729 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010732 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010734 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010736 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010737 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010738 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010739
Victor Stinner69ed0f42013-04-09 21:48:24 +020010740 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010741 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010742 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010743 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010744 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010746 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010748
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010749 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10750 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010751 }
10752 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 int rkind = skind;
10754 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010755 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 if (kind1 < rkind) {
10758 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010759 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 if (!buf1) goto error;
10761 release1 = 1;
10762 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010763 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010764 if (i < 0)
10765 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 if (rkind > kind2) {
10767 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010768 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 if (!buf2) goto error;
10770 release2 = 1;
10771 }
10772 else if (rkind < kind2) {
10773 /* widen self and buf1 */
10774 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010775 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010776 assert(buf1 != PyUnicode_DATA(str1));
10777 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010778 buf1 = PyUnicode_DATA(str1);
10779 release1 = 0;
10780 }
10781 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 if (!sbuf) goto error;
10783 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010784 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 if (!buf1) goto error;
10786 release1 = 1;
10787 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010788 u = PyUnicode_New(slen, maxchar);
10789 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010791 assert(PyUnicode_KIND(u) == rkind);
10792 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010793
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010794 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010795 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010796 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010798 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010800
10801 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010802 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010803 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010804 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010805 if (i == -1)
10806 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010807 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010809 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010811 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010813 }
10814 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010816 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 int rkind = skind;
10818 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010821 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010822 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 if (!buf1) goto error;
10824 release1 = 1;
10825 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010826 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010827 if (n == 0)
10828 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010830 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010831 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 if (!buf2) goto error;
10833 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010836 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010838 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 if (!sbuf) goto error;
10840 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010841 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010842 assert(buf1 != PyUnicode_DATA(str1));
10843 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010844 buf1 = PyUnicode_DATA(str1);
10845 release1 = 0;
10846 }
10847 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 if (!buf1) goto error;
10849 release1 = 1;
10850 }
10851 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10852 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010853 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 PyErr_SetString(PyExc_OverflowError,
10855 "replace string is too long");
10856 goto error;
10857 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010858 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010859 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020010860 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020010861 goto done;
10862 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010863 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 PyErr_SetString(PyExc_OverflowError,
10865 "replace string is too long");
10866 goto error;
10867 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010868 u = PyUnicode_New(new_size, maxchar);
10869 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010871 assert(PyUnicode_KIND(u) == rkind);
10872 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 ires = i = 0;
10874 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010875 while (n-- > 0) {
10876 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010877 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010878 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010879 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010880 if (j == -1)
10881 break;
10882 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010883 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010884 memcpy(res + rkind * ires,
10885 sbuf + rkind * i,
10886 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010887 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010888 }
10889 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010890 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010891 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010893 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010897 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010898 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010899 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010900 memcpy(res + rkind * ires,
10901 sbuf + rkind * i,
10902 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010903 }
10904 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010905 /* interleave */
10906 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010907 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010909 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010911 if (--n <= 0)
10912 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010913 memcpy(res + rkind * ires,
10914 sbuf + rkind * i,
10915 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 ires++;
10917 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010918 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010919 memcpy(res + rkind * ires,
10920 sbuf + rkind * i,
10921 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010922 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010923 }
10924
10925 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010926 unicode_adjust_maxchar(&u);
10927 if (u == NULL)
10928 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010930
10931 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010932 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10933 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10934 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010936 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010938 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010940 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010941 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010943
Benjamin Peterson29060642009-01-31 22:14:21 +000010944 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010945 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010946 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10947 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10948 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010950 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010952 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010954 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010955 return unicode_result_unchanged(self);
10956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010958 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10959 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10960 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10961 if (srelease)
10962 PyMem_FREE((void *)sbuf);
10963 if (release1)
10964 PyMem_FREE((void *)buf1);
10965 if (release2)
10966 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968}
10969
10970/* --- Unicode Object Methods --------------------------------------------- */
10971
INADA Naoki3ae20562017-01-16 20:41:20 +090010972/*[clinic input]
10973str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974
INADA Naoki3ae20562017-01-16 20:41:20 +090010975Return a version of the string where each word is titlecased.
10976
10977More specifically, words start with uppercased characters and all remaining
10978cased characters have lower case.
10979[clinic start generated code]*/
10980
10981static PyObject *
10982unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010983/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010985 if (PyUnicode_READY(self) == -1)
10986 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010987 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988}
10989
INADA Naoki3ae20562017-01-16 20:41:20 +090010990/*[clinic input]
10991str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992
INADA Naoki3ae20562017-01-16 20:41:20 +090010993Return a capitalized version of the string.
10994
10995More specifically, make the first character have upper case and the rest lower
10996case.
10997[clinic start generated code]*/
10998
10999static PyObject *
11000unicode_capitalize_impl(PyObject *self)
11001/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011003 if (PyUnicode_READY(self) == -1)
11004 return NULL;
11005 if (PyUnicode_GET_LENGTH(self) == 0)
11006 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011007 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008}
11009
INADA Naoki3ae20562017-01-16 20:41:20 +090011010/*[clinic input]
11011str.casefold as unicode_casefold
11012
11013Return a version of the string suitable for caseless comparisons.
11014[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011015
11016static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011017unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011018/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011019{
11020 if (PyUnicode_READY(self) == -1)
11021 return NULL;
11022 if (PyUnicode_IS_ASCII(self))
11023 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011024 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011025}
11026
11027
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011028/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011029
11030static int
11031convert_uc(PyObject *obj, void *addr)
11032{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011034
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011035 if (!PyUnicode_Check(obj)) {
11036 PyErr_Format(PyExc_TypeError,
11037 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011038 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011039 return 0;
11040 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011041 if (PyUnicode_READY(obj) < 0)
11042 return 0;
11043 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011044 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011045 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011046 return 0;
11047 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011048 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011049 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011050}
11051
INADA Naoki3ae20562017-01-16 20:41:20 +090011052/*[clinic input]
11053str.center as unicode_center
11054
11055 width: Py_ssize_t
11056 fillchar: Py_UCS4 = ' '
11057 /
11058
11059Return a centered string of length width.
11060
11061Padding is done using the specified fill character (default is a space).
11062[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063
11064static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011065unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11066/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011068 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
Benjamin Petersonbac79492012-01-14 13:34:47 -050011070 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071 return NULL;
11072
Victor Stinnerc4b49542011-12-11 22:44:26 +010011073 if (PyUnicode_GET_LENGTH(self) >= width)
11074 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075
Victor Stinnerc4b49542011-12-11 22:44:26 +010011076 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077 left = marg / 2 + (marg & width & 1);
11078
Victor Stinner9310abb2011-10-05 00:59:23 +020011079 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080}
11081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082/* This function assumes that str1 and str2 are readied by the caller. */
11083
Marc-André Lemburge5034372000-08-08 08:04:29 +000011084static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011085unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011086{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011087#define COMPARE(TYPE1, TYPE2) \
11088 do { \
11089 TYPE1* p1 = (TYPE1 *)data1; \
11090 TYPE2* p2 = (TYPE2 *)data2; \
11091 TYPE1* end = p1 + len; \
11092 Py_UCS4 c1, c2; \
11093 for (; p1 != end; p1++, p2++) { \
11094 c1 = *p1; \
11095 c2 = *p2; \
11096 if (c1 != c2) \
11097 return (c1 < c2) ? -1 : 1; \
11098 } \
11099 } \
11100 while (0)
11101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011103 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011104 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011106 kind1 = PyUnicode_KIND(str1);
11107 kind2 = PyUnicode_KIND(str2);
11108 data1 = PyUnicode_DATA(str1);
11109 data2 = PyUnicode_DATA(str2);
11110 len1 = PyUnicode_GET_LENGTH(str1);
11111 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011112 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011113
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011114 switch(kind1) {
11115 case PyUnicode_1BYTE_KIND:
11116 {
11117 switch(kind2) {
11118 case PyUnicode_1BYTE_KIND:
11119 {
11120 int cmp = memcmp(data1, data2, len);
11121 /* normalize result of memcmp() into the range [-1; 1] */
11122 if (cmp < 0)
11123 return -1;
11124 if (cmp > 0)
11125 return 1;
11126 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011127 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011128 case PyUnicode_2BYTE_KIND:
11129 COMPARE(Py_UCS1, Py_UCS2);
11130 break;
11131 case PyUnicode_4BYTE_KIND:
11132 COMPARE(Py_UCS1, Py_UCS4);
11133 break;
11134 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011135 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011136 }
11137 break;
11138 }
11139 case PyUnicode_2BYTE_KIND:
11140 {
11141 switch(kind2) {
11142 case PyUnicode_1BYTE_KIND:
11143 COMPARE(Py_UCS2, Py_UCS1);
11144 break;
11145 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011146 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011147 COMPARE(Py_UCS2, Py_UCS2);
11148 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011149 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011150 case PyUnicode_4BYTE_KIND:
11151 COMPARE(Py_UCS2, Py_UCS4);
11152 break;
11153 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011154 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011155 }
11156 break;
11157 }
11158 case PyUnicode_4BYTE_KIND:
11159 {
11160 switch(kind2) {
11161 case PyUnicode_1BYTE_KIND:
11162 COMPARE(Py_UCS4, Py_UCS1);
11163 break;
11164 case PyUnicode_2BYTE_KIND:
11165 COMPARE(Py_UCS4, Py_UCS2);
11166 break;
11167 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011168 {
11169#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11170 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11171 /* normalize result of wmemcmp() into the range [-1; 1] */
11172 if (cmp < 0)
11173 return -1;
11174 if (cmp > 0)
11175 return 1;
11176#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011177 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011178#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011179 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011180 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011181 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011182 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011183 }
11184 break;
11185 }
11186 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011187 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011188 }
11189
Victor Stinner770e19e2012-10-04 22:59:45 +020011190 if (len1 == len2)
11191 return 0;
11192 if (len1 < len2)
11193 return -1;
11194 else
11195 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011196
11197#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011198}
11199
Benjamin Peterson621b4302016-09-09 13:54:34 -070011200static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011201unicode_compare_eq(PyObject *str1, PyObject *str2)
11202{
11203 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011204 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011205 Py_ssize_t len;
11206 int cmp;
11207
Victor Stinnere5567ad2012-10-23 02:48:49 +020011208 len = PyUnicode_GET_LENGTH(str1);
11209 if (PyUnicode_GET_LENGTH(str2) != len)
11210 return 0;
11211 kind = PyUnicode_KIND(str1);
11212 if (PyUnicode_KIND(str2) != kind)
11213 return 0;
11214 data1 = PyUnicode_DATA(str1);
11215 data2 = PyUnicode_DATA(str2);
11216
11217 cmp = memcmp(data1, data2, len * kind);
11218 return (cmp == 0);
11219}
11220
11221
Alexander Belopolsky40018472011-02-26 01:02:56 +000011222int
11223PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11226 if (PyUnicode_READY(left) == -1 ||
11227 PyUnicode_READY(right) == -1)
11228 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011229
11230 /* a string is equal to itself */
11231 if (left == right)
11232 return 0;
11233
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011234 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011235 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011236 PyErr_Format(PyExc_TypeError,
11237 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011238 Py_TYPE(left)->tp_name,
11239 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 return -1;
11241}
11242
Martin v. Löwis5b222132007-06-10 09:51:05 +000011243int
11244PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 Py_ssize_t i;
11247 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011249 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250
Victor Stinner910337b2011-10-03 03:20:16 +020011251 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011252 if (!PyUnicode_IS_READY(uni)) {
11253 const wchar_t *ws = _PyUnicode_WSTR(uni);
11254 /* Compare Unicode string and source character set string */
11255 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11256 if (chr != ustr[i])
11257 return (chr < ustr[i]) ? -1 : 1;
11258 }
11259 /* This check keeps Python strings that end in '\0' from comparing equal
11260 to C strings identical up to that point. */
11261 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11262 return 1; /* uni is longer */
11263 if (ustr[i])
11264 return -1; /* str is longer */
11265 return 0;
11266 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011268 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011269 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011270 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011271 size_t len, len2 = strlen(str);
11272 int cmp;
11273
11274 len = Py_MIN(len1, len2);
11275 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011276 if (cmp != 0) {
11277 if (cmp < 0)
11278 return -1;
11279 else
11280 return 1;
11281 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011282 if (len1 > len2)
11283 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011284 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011285 return -1; /* str is longer */
11286 return 0;
11287 }
11288 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011289 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011290 /* Compare Unicode string and source character set string */
11291 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011292 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011293 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11294 /* This check keeps Python strings that end in '\0' from comparing equal
11295 to C strings identical up to that point. */
11296 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11297 return 1; /* uni is longer */
11298 if (str[i])
11299 return -1; /* str is longer */
11300 return 0;
11301 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011302}
11303
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011304static int
11305non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11306{
11307 size_t i, len;
11308 const wchar_t *p;
11309 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11310 if (strlen(str) != len)
11311 return 0;
11312 p = _PyUnicode_WSTR(unicode);
11313 assert(p);
11314 for (i = 0; i < len; i++) {
11315 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011316 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011317 return 0;
11318 }
11319 return 1;
11320}
11321
11322int
11323_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11324{
11325 size_t len;
11326 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011327 assert(str);
11328#ifndef NDEBUG
11329 for (const char *p = str; *p; p++) {
11330 assert((unsigned char)*p < 128);
11331 }
11332#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011333 if (PyUnicode_READY(unicode) == -1) {
11334 /* Memory error or bad data */
11335 PyErr_Clear();
11336 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11337 }
11338 if (!PyUnicode_IS_ASCII(unicode))
11339 return 0;
11340 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11341 return strlen(str) == len &&
11342 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11343}
11344
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011345int
11346_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11347{
11348 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011349
11350 assert(_PyUnicode_CHECK(left));
11351 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011352#ifndef NDEBUG
11353 for (const char *p = right->string; *p; p++) {
11354 assert((unsigned char)*p < 128);
11355 }
11356#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011357
11358 if (PyUnicode_READY(left) == -1) {
11359 /* memory error or bad data */
11360 PyErr_Clear();
11361 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11362 }
11363
11364 if (!PyUnicode_IS_ASCII(left))
11365 return 0;
11366
11367 right_uni = _PyUnicode_FromId(right); /* borrowed */
11368 if (right_uni == NULL) {
11369 /* memory error or bad data */
11370 PyErr_Clear();
11371 return _PyUnicode_EqualToASCIIString(left, right->string);
11372 }
11373
11374 if (left == right_uni)
11375 return 1;
11376
11377 if (PyUnicode_CHECK_INTERNED(left))
11378 return 0;
11379
Victor Stinner607b1022020-05-05 18:50:30 +020011380#ifdef INTERNED_STRINGS
INADA Naoki7cc95f52018-01-28 02:07:09 +090011381 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011382 Py_hash_t hash = _PyUnicode_HASH(left);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011383 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11384 return 0;
Victor Stinner607b1022020-05-05 18:50:30 +020011385#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011386
11387 return unicode_compare_eq(left, right_uni);
11388}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011389
Alexander Belopolsky40018472011-02-26 01:02:56 +000011390PyObject *
11391PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011392{
11393 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011394
Victor Stinnere5567ad2012-10-23 02:48:49 +020011395 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11396 Py_RETURN_NOTIMPLEMENTED;
11397
11398 if (PyUnicode_READY(left) == -1 ||
11399 PyUnicode_READY(right) == -1)
11400 return NULL;
11401
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011402 if (left == right) {
11403 switch (op) {
11404 case Py_EQ:
11405 case Py_LE:
11406 case Py_GE:
11407 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011408 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011409 case Py_NE:
11410 case Py_LT:
11411 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011412 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011413 default:
11414 PyErr_BadArgument();
11415 return NULL;
11416 }
11417 }
11418 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011419 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011420 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011421 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011422 }
11423 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011424 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011425 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011426 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011427}
11428
Alexander Belopolsky40018472011-02-26 01:02:56 +000011429int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011430_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11431{
11432 return unicode_eq(aa, bb);
11433}
11434
11435int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011436PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011437{
Victor Stinner77282cb2013-04-14 19:22:47 +020011438 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011439 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011441 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011442
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011443 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011445 "'in <string>' requires string as left operand, not %.100s",
11446 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011447 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011448 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011449 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011450 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011451 if (ensure_unicode(str) < 0)
11452 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011455 kind2 = PyUnicode_KIND(substr);
11456 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011457 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011459 len2 = PyUnicode_GET_LENGTH(substr);
11460 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011461 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011462 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011463 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011464 if (len2 == 1) {
11465 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11466 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011467 return result;
11468 }
11469 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011470 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011471 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011472 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474
Victor Stinner77282cb2013-04-14 19:22:47 +020011475 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 case PyUnicode_1BYTE_KIND:
11477 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11478 break;
11479 case PyUnicode_2BYTE_KIND:
11480 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11481 break;
11482 case PyUnicode_4BYTE_KIND:
11483 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11484 break;
11485 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011486 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011488
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011489 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011490 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011491 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492
Guido van Rossum403d68b2000-03-13 15:55:09 +000011493 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011494}
11495
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496/* Concat to string or Unicode object giving a new Unicode object. */
11497
Alexander Belopolsky40018472011-02-26 01:02:56 +000011498PyObject *
11499PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011501 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011502 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011503 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011505 if (ensure_unicode(left) < 0)
11506 return NULL;
11507
11508 if (!PyUnicode_Check(right)) {
11509 PyErr_Format(PyExc_TypeError,
11510 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011511 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011512 return NULL;
11513 }
11514 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011515 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
11517 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011518 PyObject *empty = unicode_get_empty(); // Borrowed reference
11519 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011520 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011521 }
11522 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011523 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011524 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011526 left_len = PyUnicode_GET_LENGTH(left);
11527 right_len = PyUnicode_GET_LENGTH(right);
11528 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011529 PyErr_SetString(PyExc_OverflowError,
11530 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011531 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011532 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011533 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011534
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011535 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11536 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011537 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011540 result = PyUnicode_New(new_len, maxchar);
11541 if (result == NULL)
11542 return NULL;
11543 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11544 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11545 assert(_PyUnicode_CheckConsistency(result, 1));
11546 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547}
11548
Walter Dörwald1ab83302007-05-18 17:15:44 +000011549void
Victor Stinner23e56682011-10-03 03:54:37 +020011550PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011551{
Victor Stinner23e56682011-10-03 03:54:37 +020011552 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011553 Py_UCS4 maxchar, maxchar2;
11554 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011555
11556 if (p_left == NULL) {
11557 if (!PyErr_Occurred())
11558 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011559 return;
11560 }
Victor Stinner23e56682011-10-03 03:54:37 +020011561 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011562 if (right == NULL || left == NULL
11563 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011564 if (!PyErr_Occurred())
11565 PyErr_BadInternalCall();
11566 goto error;
11567 }
11568
Benjamin Petersonbac79492012-01-14 13:34:47 -050011569 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011570 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011571 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011572 goto error;
11573
Victor Stinner488fa492011-12-12 00:01:39 +010011574 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011575 PyObject *empty = unicode_get_empty(); // Borrowed reference
11576 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011577 Py_DECREF(left);
11578 Py_INCREF(right);
11579 *p_left = right;
11580 return;
11581 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011582 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011583 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011584 }
Victor Stinner488fa492011-12-12 00:01:39 +010011585
11586 left_len = PyUnicode_GET_LENGTH(left);
11587 right_len = PyUnicode_GET_LENGTH(right);
11588 if (left_len > PY_SSIZE_T_MAX - right_len) {
11589 PyErr_SetString(PyExc_OverflowError,
11590 "strings are too large to concat");
11591 goto error;
11592 }
11593 new_len = left_len + right_len;
11594
11595 if (unicode_modifiable(left)
11596 && PyUnicode_CheckExact(right)
11597 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011598 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11599 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011600 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011601 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011602 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11603 {
11604 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011605 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011606 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011607
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011608 /* copy 'right' into the newly allocated area of 'left' */
11609 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011610 }
Victor Stinner488fa492011-12-12 00:01:39 +010011611 else {
11612 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11613 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011614 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011615
Victor Stinner488fa492011-12-12 00:01:39 +010011616 /* Concat the two Unicode strings */
11617 res = PyUnicode_New(new_len, maxchar);
11618 if (res == NULL)
11619 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011620 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11621 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011622 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011623 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011624 }
11625 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011626 return;
11627
11628error:
Victor Stinner488fa492011-12-12 00:01:39 +010011629 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011630}
11631
11632void
11633PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11634{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011635 PyUnicode_Append(pleft, right);
11636 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011637}
11638
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011639/*
11640Wraps stringlib_parse_args_finds() and additionally ensures that the
11641first argument is a unicode object.
11642*/
11643
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011644static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011645parse_args_finds_unicode(const char * function_name, PyObject *args,
11646 PyObject **substring,
11647 Py_ssize_t *start, Py_ssize_t *end)
11648{
11649 if(stringlib_parse_args_finds(function_name, args, substring,
11650 start, end)) {
11651 if (ensure_unicode(*substring) < 0)
11652 return 0;
11653 return 1;
11654 }
11655 return 0;
11656}
11657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011658PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011659 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011661Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011662string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011663interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664
11665static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011666unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011668 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011669 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011670 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011672 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011673 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011676 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 kind1 = PyUnicode_KIND(self);
11680 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011681 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011682 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 len1 = PyUnicode_GET_LENGTH(self);
11685 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011687 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011688 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011689
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011690 buf1 = PyUnicode_DATA(self);
11691 buf2 = PyUnicode_DATA(substring);
11692 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011693 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011694 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011695 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011696 }
11697 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 case PyUnicode_1BYTE_KIND:
11699 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011700 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 buf2, len2, PY_SSIZE_T_MAX
11702 );
11703 break;
11704 case PyUnicode_2BYTE_KIND:
11705 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011706 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 buf2, len2, PY_SSIZE_T_MAX
11708 );
11709 break;
11710 case PyUnicode_4BYTE_KIND:
11711 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011712 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 buf2, len2, PY_SSIZE_T_MAX
11714 );
11715 break;
11716 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011717 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 }
11719
11720 result = PyLong_FromSsize_t(iresult);
11721
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011722 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011723 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011724 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726 return result;
11727}
11728
INADA Naoki3ae20562017-01-16 20:41:20 +090011729/*[clinic input]
11730str.encode as unicode_encode
11731
11732 encoding: str(c_default="NULL") = 'utf-8'
11733 The encoding in which to encode the string.
11734 errors: str(c_default="NULL") = 'strict'
11735 The error handling scheme to use for encoding errors.
11736 The default is 'strict' meaning that encoding errors raise a
11737 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11738 'xmlcharrefreplace' as well as any other name registered with
11739 codecs.register_error that can handle UnicodeEncodeErrors.
11740
11741Encode the string using the codec registered for encoding.
11742[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743
11744static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011745unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011746/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011748 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011749}
11750
INADA Naoki3ae20562017-01-16 20:41:20 +090011751/*[clinic input]
11752str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753
INADA Naoki3ae20562017-01-16 20:41:20 +090011754 tabsize: int = 8
11755
11756Return a copy where all tab characters are expanded using spaces.
11757
11758If tabsize is not given, a tab size of 8 characters is assumed.
11759[clinic start generated code]*/
11760
11761static PyObject *
11762unicode_expandtabs_impl(PyObject *self, int tabsize)
11763/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011765 Py_ssize_t i, j, line_pos, src_len, incr;
11766 Py_UCS4 ch;
11767 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011768 const void *src_data;
11769 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011770 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011771 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772
Antoine Pitrou22425222011-10-04 19:10:51 +020011773 if (PyUnicode_READY(self) == -1)
11774 return NULL;
11775
Thomas Wouters7e474022000-07-16 12:04:32 +000011776 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011777 src_len = PyUnicode_GET_LENGTH(self);
11778 i = j = line_pos = 0;
11779 kind = PyUnicode_KIND(self);
11780 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011781 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011782 for (; i < src_len; i++) {
11783 ch = PyUnicode_READ(kind, src_data, i);
11784 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011785 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011787 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011789 goto overflow;
11790 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011792 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011795 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011796 goto overflow;
11797 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011799 if (ch == '\n' || ch == '\r')
11800 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011802 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011803 if (!found)
11804 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011805
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011807 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808 if (!u)
11809 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011810 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811
Antoine Pitroue71d5742011-10-04 15:55:09 +020011812 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813
Antoine Pitroue71d5742011-10-04 15:55:09 +020011814 for (; i < src_len; i++) {
11815 ch = PyUnicode_READ(kind, src_data, i);
11816 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011817 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011818 incr = tabsize - (line_pos % tabsize);
11819 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011820 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011821 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011822 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011823 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011825 line_pos++;
11826 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011827 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011828 if (ch == '\n' || ch == '\r')
11829 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011831 }
11832 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011833 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011834
Antoine Pitroue71d5742011-10-04 15:55:09 +020011835 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011836 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11837 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838}
11839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011840PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842\n\
11843Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011844such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845arguments start and end are interpreted as in slice notation.\n\
11846\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011847Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848
11849static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011852 /* initialize variables to prevent gcc warning */
11853 PyObject *substring = NULL;
11854 Py_ssize_t start = 0;
11855 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011856 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011858 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011861 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011864 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 if (result == -2)
11867 return NULL;
11868
Christian Heimes217cfd12007-12-02 14:31:20 +000011869 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870}
11871
11872static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011873unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011875 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011876 enum PyUnicode_Kind kind;
11877 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011878
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011879 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011880 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011882 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011883 if (PyUnicode_READY(self) == -1) {
11884 return NULL;
11885 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011886 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11887 PyErr_SetString(PyExc_IndexError, "string index out of range");
11888 return NULL;
11889 }
11890 kind = PyUnicode_KIND(self);
11891 data = PyUnicode_DATA(self);
11892 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011893 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894}
11895
Guido van Rossumc2504932007-09-18 19:42:40 +000011896/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011897 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011898static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011899unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011901 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011902
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011903#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011904 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011905#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 if (_PyUnicode_HASH(self) != -1)
11907 return _PyUnicode_HASH(self);
11908 if (PyUnicode_READY(self) == -1)
11909 return -1;
animalizea1d14252019-01-02 20:16:06 +080011910
Christian Heimes985ecdc2013-11-20 11:46:18 +010011911 x = _Py_HashBytes(PyUnicode_DATA(self),
11912 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011914 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915}
11916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011917PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919\n\
oldkaa0735f2018-02-02 16:52:55 +080011920Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011921such that sub is contained within S[start:end]. Optional\n\
11922arguments start and end are interpreted as in slice notation.\n\
11923\n\
11924Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925
11926static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011929 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011930 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011931 PyObject *substring = NULL;
11932 Py_ssize_t start = 0;
11933 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011935 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011938 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011941 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 if (result == -2)
11944 return NULL;
11945
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946 if (result < 0) {
11947 PyErr_SetString(PyExc_ValueError, "substring not found");
11948 return NULL;
11949 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011950
Christian Heimes217cfd12007-12-02 14:31:20 +000011951 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952}
11953
INADA Naoki3ae20562017-01-16 20:41:20 +090011954/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011955str.isascii as unicode_isascii
11956
11957Return True if all characters in the string are ASCII, False otherwise.
11958
11959ASCII characters have code points in the range U+0000-U+007F.
11960Empty string is ASCII too.
11961[clinic start generated code]*/
11962
11963static PyObject *
11964unicode_isascii_impl(PyObject *self)
11965/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11966{
11967 if (PyUnicode_READY(self) == -1) {
11968 return NULL;
11969 }
11970 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11971}
11972
11973/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011974str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975
INADA Naoki3ae20562017-01-16 20:41:20 +090011976Return True if the string is a lowercase string, False otherwise.
11977
11978A string is lowercase if all cased characters in the string are lowercase and
11979there is at least one cased character in the string.
11980[clinic start generated code]*/
11981
11982static PyObject *
11983unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011984/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 Py_ssize_t i, length;
11987 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011988 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989 int cased;
11990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 if (PyUnicode_READY(self) == -1)
11992 return NULL;
11993 length = PyUnicode_GET_LENGTH(self);
11994 kind = PyUnicode_KIND(self);
11995 data = PyUnicode_DATA(self);
11996
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 if (length == 1)
11999 return PyBool_FromLong(
12000 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012002 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012004 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012005
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 for (i = 0; i < length; i++) {
12008 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012009
Benjamin Peterson29060642009-01-31 22:14:21 +000012010 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012011 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012012 else if (!cased && Py_UNICODE_ISLOWER(ch))
12013 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012015 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016}
12017
INADA Naoki3ae20562017-01-16 20:41:20 +090012018/*[clinic input]
12019str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020
INADA Naoki3ae20562017-01-16 20:41:20 +090012021Return True if the string is an uppercase string, False otherwise.
12022
12023A string is uppercase if all cased characters in the string are uppercase and
12024there is at least one cased character in the string.
12025[clinic start generated code]*/
12026
12027static PyObject *
12028unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012029/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 Py_ssize_t i, length;
12032 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012033 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034 int cased;
12035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 if (PyUnicode_READY(self) == -1)
12037 return NULL;
12038 length = PyUnicode_GET_LENGTH(self);
12039 kind = PyUnicode_KIND(self);
12040 data = PyUnicode_DATA(self);
12041
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 if (length == 1)
12044 return PyBool_FromLong(
12045 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012047 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012049 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012050
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 for (i = 0; i < length; i++) {
12053 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012054
Benjamin Peterson29060642009-01-31 22:14:21 +000012055 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012056 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012057 else if (!cased && Py_UNICODE_ISUPPER(ch))
12058 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012060 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061}
12062
INADA Naoki3ae20562017-01-16 20:41:20 +090012063/*[clinic input]
12064str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065
INADA Naoki3ae20562017-01-16 20:41:20 +090012066Return True if the string is a title-cased string, False otherwise.
12067
12068In a title-cased string, upper- and title-case characters may only
12069follow uncased characters and lowercase characters only cased ones.
12070[clinic start generated code]*/
12071
12072static PyObject *
12073unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012074/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 Py_ssize_t i, length;
12077 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012078 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079 int cased, previous_is_cased;
12080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 if (PyUnicode_READY(self) == -1)
12082 return NULL;
12083 length = PyUnicode_GET_LENGTH(self);
12084 kind = PyUnicode_KIND(self);
12085 data = PyUnicode_DATA(self);
12086
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 if (length == 1) {
12089 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12090 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12091 (Py_UNICODE_ISUPPER(ch) != 0));
12092 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012094 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012096 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012097
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098 cased = 0;
12099 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 for (i = 0; i < length; i++) {
12101 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012102
Benjamin Peterson29060642009-01-31 22:14:21 +000012103 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12104 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012105 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012106 previous_is_cased = 1;
12107 cased = 1;
12108 }
12109 else if (Py_UNICODE_ISLOWER(ch)) {
12110 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012111 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 previous_is_cased = 1;
12113 cased = 1;
12114 }
12115 else
12116 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012118 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119}
12120
INADA Naoki3ae20562017-01-16 20:41:20 +090012121/*[clinic input]
12122str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123
INADA Naoki3ae20562017-01-16 20:41:20 +090012124Return True if the string is a whitespace string, False otherwise.
12125
12126A string is whitespace if all characters in the string are whitespace and there
12127is at least one character in the string.
12128[clinic start generated code]*/
12129
12130static PyObject *
12131unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012132/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 Py_ssize_t i, length;
12135 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012136 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137
12138 if (PyUnicode_READY(self) == -1)
12139 return NULL;
12140 length = PyUnicode_GET_LENGTH(self);
12141 kind = PyUnicode_KIND(self);
12142 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 if (length == 1)
12146 return PyBool_FromLong(
12147 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012149 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012151 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 for (i = 0; i < length; i++) {
12154 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012155 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012156 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012158 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159}
12160
INADA Naoki3ae20562017-01-16 20:41:20 +090012161/*[clinic input]
12162str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012163
INADA Naoki3ae20562017-01-16 20:41:20 +090012164Return True if the string is an alphabetic string, False otherwise.
12165
12166A string is alphabetic if all characters in the string are alphabetic and there
12167is at least one character in the string.
12168[clinic start generated code]*/
12169
12170static PyObject *
12171unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012172/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 Py_ssize_t i, length;
12175 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012176 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177
12178 if (PyUnicode_READY(self) == -1)
12179 return NULL;
12180 length = PyUnicode_GET_LENGTH(self);
12181 kind = PyUnicode_KIND(self);
12182 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012183
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012184 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 if (length == 1)
12186 return PyBool_FromLong(
12187 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012188
12189 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012191 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 for (i = 0; i < length; i++) {
12194 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012195 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012196 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012197 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012198}
12199
INADA Naoki3ae20562017-01-16 20:41:20 +090012200/*[clinic input]
12201str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012202
INADA Naoki3ae20562017-01-16 20:41:20 +090012203Return True if the string is an alpha-numeric string, False otherwise.
12204
12205A string is alpha-numeric if all characters in the string are alpha-numeric and
12206there is at least one character in the string.
12207[clinic start generated code]*/
12208
12209static PyObject *
12210unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012211/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012212{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012214 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 Py_ssize_t len, i;
12216
12217 if (PyUnicode_READY(self) == -1)
12218 return NULL;
12219
12220 kind = PyUnicode_KIND(self);
12221 data = PyUnicode_DATA(self);
12222 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012223
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012224 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 if (len == 1) {
12226 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12227 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12228 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012229
12230 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012232 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 for (i = 0; i < len; i++) {
12235 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012236 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012237 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012238 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012239 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012240}
12241
INADA Naoki3ae20562017-01-16 20:41:20 +090012242/*[clinic input]
12243str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244
INADA Naoki3ae20562017-01-16 20:41:20 +090012245Return True if the string is a decimal string, False otherwise.
12246
12247A string is a decimal string if all characters in the string are decimal and
12248there is at least one character in the string.
12249[clinic start generated code]*/
12250
12251static PyObject *
12252unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012253/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255 Py_ssize_t i, length;
12256 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012257 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258
12259 if (PyUnicode_READY(self) == -1)
12260 return NULL;
12261 length = PyUnicode_GET_LENGTH(self);
12262 kind = PyUnicode_KIND(self);
12263 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 if (length == 1)
12267 return PyBool_FromLong(
12268 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012270 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012272 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 for (i = 0; i < length; i++) {
12275 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012276 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012278 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279}
12280
INADA Naoki3ae20562017-01-16 20:41:20 +090012281/*[clinic input]
12282str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283
INADA Naoki3ae20562017-01-16 20:41:20 +090012284Return True if the string is a digit string, False otherwise.
12285
12286A string is a digit string if all characters in the string are digits and there
12287is at least one character in the string.
12288[clinic start generated code]*/
12289
12290static PyObject *
12291unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012292/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 Py_ssize_t i, length;
12295 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012296 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297
12298 if (PyUnicode_READY(self) == -1)
12299 return NULL;
12300 length = PyUnicode_GET_LENGTH(self);
12301 kind = PyUnicode_KIND(self);
12302 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 if (length == 1) {
12306 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12307 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012310 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012312 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 for (i = 0; i < length; i++) {
12315 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012316 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012318 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319}
12320
INADA Naoki3ae20562017-01-16 20:41:20 +090012321/*[clinic input]
12322str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323
INADA Naoki3ae20562017-01-16 20:41:20 +090012324Return True if the string is a numeric string, False otherwise.
12325
12326A string is numeric if all characters in the string are numeric and there is at
12327least one character in the string.
12328[clinic start generated code]*/
12329
12330static PyObject *
12331unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012332/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 Py_ssize_t i, length;
12335 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012336 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337
12338 if (PyUnicode_READY(self) == -1)
12339 return NULL;
12340 length = PyUnicode_GET_LENGTH(self);
12341 kind = PyUnicode_KIND(self);
12342 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343
Guido van Rossumd57fd912000-03-10 22:53:23 +000012344 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 if (length == 1)
12346 return PyBool_FromLong(
12347 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012348
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012349 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012351 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353 for (i = 0; i < length; i++) {
12354 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012355 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012357 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358}
12359
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012360Py_ssize_t
12361_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012364 if (PyUnicode_READY(self) == -1)
12365 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012366
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012367 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012368 if (len == 0) {
12369 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012370 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 }
12372
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012373 int kind = PyUnicode_KIND(self);
12374 const void *data = PyUnicode_DATA(self);
12375 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012376 /* PEP 3131 says that the first character must be in
12377 XID_Start and subsequent characters in XID_Continue,
12378 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012379 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012380 letters, digits, underscore). However, given the current
12381 definition of XID_Start and XID_Continue, it is sufficient
12382 to check just for these, except that _ must be allowed
12383 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012384 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012385 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012386 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012387
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012388 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012389 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012390 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012391 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012392 }
12393 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012394 return i;
12395}
12396
12397int
12398PyUnicode_IsIdentifier(PyObject *self)
12399{
12400 if (PyUnicode_IS_READY(self)) {
12401 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12402 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12403 /* an empty string is not a valid identifier */
12404 return len && i == len;
12405 }
12406 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012407_Py_COMP_DIAG_PUSH
12408_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012409 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012410 if (len == 0) {
12411 /* an empty string is not a valid identifier */
12412 return 0;
12413 }
12414
12415 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012416 Py_UCS4 ch = wstr[i++];
12417#if SIZEOF_WCHAR_T == 2
12418 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12419 && i < len
12420 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12421 {
12422 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12423 i++;
12424 }
12425#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012426 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12427 return 0;
12428 }
12429
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012430 while (i < len) {
12431 ch = wstr[i++];
12432#if SIZEOF_WCHAR_T == 2
12433 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12434 && i < len
12435 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12436 {
12437 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12438 i++;
12439 }
12440#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012441 if (!_PyUnicode_IsXidContinue(ch)) {
12442 return 0;
12443 }
12444 }
12445 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012446_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012447 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012448}
12449
INADA Naoki3ae20562017-01-16 20:41:20 +090012450/*[clinic input]
12451str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012452
INADA Naoki3ae20562017-01-16 20:41:20 +090012453Return True if the string is a valid Python identifier, False otherwise.
12454
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012455Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012456such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012457[clinic start generated code]*/
12458
12459static PyObject *
12460unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012461/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012462{
12463 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12464}
12465
INADA Naoki3ae20562017-01-16 20:41:20 +090012466/*[clinic input]
12467str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012468
INADA Naoki3ae20562017-01-16 20:41:20 +090012469Return True if the string is printable, False otherwise.
12470
12471A string is printable if all of its characters are considered printable in
12472repr() or if it is empty.
12473[clinic start generated code]*/
12474
12475static PyObject *
12476unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012477/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 Py_ssize_t i, length;
12480 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012481 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482
12483 if (PyUnicode_READY(self) == -1)
12484 return NULL;
12485 length = PyUnicode_GET_LENGTH(self);
12486 kind = PyUnicode_KIND(self);
12487 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012488
12489 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 if (length == 1)
12491 return PyBool_FromLong(
12492 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 for (i = 0; i < length; i++) {
12495 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012496 Py_RETURN_FALSE;
12497 }
12498 }
12499 Py_RETURN_TRUE;
12500}
12501
INADA Naoki3ae20562017-01-16 20:41:20 +090012502/*[clinic input]
12503str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504
INADA Naoki3ae20562017-01-16 20:41:20 +090012505 iterable: object
12506 /
12507
12508Concatenate any number of strings.
12509
Martin Panter91a88662017-01-24 00:30:06 +000012510The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012511The result is returned as a new string.
12512
12513Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12514[clinic start generated code]*/
12515
12516static PyObject *
12517unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012518/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519{
INADA Naoki3ae20562017-01-16 20:41:20 +090012520 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521}
12522
Martin v. Löwis18e16552006-02-15 17:27:45 +000012523static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012524unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 if (PyUnicode_READY(self) == -1)
12527 return -1;
12528 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529}
12530
INADA Naoki3ae20562017-01-16 20:41:20 +090012531/*[clinic input]
12532str.ljust as unicode_ljust
12533
12534 width: Py_ssize_t
12535 fillchar: Py_UCS4 = ' '
12536 /
12537
12538Return a left-justified string of length width.
12539
12540Padding is done using the specified fill character (default is a space).
12541[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542
12543static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012544unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12545/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012547 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549
Victor Stinnerc4b49542011-12-11 22:44:26 +010012550 if (PyUnicode_GET_LENGTH(self) >= width)
12551 return unicode_result_unchanged(self);
12552
12553 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554}
12555
INADA Naoki3ae20562017-01-16 20:41:20 +090012556/*[clinic input]
12557str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558
INADA Naoki3ae20562017-01-16 20:41:20 +090012559Return a copy of the string converted to lowercase.
12560[clinic start generated code]*/
12561
12562static PyObject *
12563unicode_lower_impl(PyObject *self)
12564/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012566 if (PyUnicode_READY(self) == -1)
12567 return NULL;
12568 if (PyUnicode_IS_ASCII(self))
12569 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012570 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571}
12572
/* Strip modes accepted by the strip helpers.  The values double as indices
   into stripfuncnames below, so the two must stay in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above; used to build error
   messages.  Both the strings and the pointer slots are const so the table
   cannot be modified after initialization. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name for strip mode i.  i must be LEFTSTRIP, RIGHTSTRIP or
   BOTHSTRIP; no bounds check is performed. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012581
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012582/* externally visible for str.strip(unicode) */
12583PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012584_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012585{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012586 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 int kind;
12588 Py_ssize_t i, j, len;
12589 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012590 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012592 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12593 return NULL;
12594
12595 kind = PyUnicode_KIND(self);
12596 data = PyUnicode_DATA(self);
12597 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012598 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12600 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012601 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012602
Benjamin Peterson14339b62009-01-31 16:36:08 +000012603 i = 0;
12604 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012605 while (i < len) {
12606 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12607 if (!BLOOM(sepmask, ch))
12608 break;
12609 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12610 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012611 i++;
12612 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012613 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012614
Benjamin Peterson14339b62009-01-31 16:36:08 +000012615 j = len;
12616 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012617 j--;
12618 while (j >= i) {
12619 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12620 if (!BLOOM(sepmask, ch))
12621 break;
12622 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12623 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012624 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012625 }
12626
Benjamin Peterson29060642009-01-31 22:14:21 +000012627 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012628 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012629
Victor Stinner7931d9a2011-11-04 00:22:48 +010012630 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631}
12632
12633PyObject*
12634PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12635{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012636 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012638 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639
Victor Stinnerde636f32011-10-01 03:55:54 +020012640 if (PyUnicode_READY(self) == -1)
12641 return NULL;
12642
Victor Stinner684d5fd2012-05-03 02:32:34 +020012643 length = PyUnicode_GET_LENGTH(self);
12644 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012645
Victor Stinner684d5fd2012-05-03 02:32:34 +020012646 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012647 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648
Victor Stinnerde636f32011-10-01 03:55:54 +020012649 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012650 PyErr_SetString(PyExc_IndexError, "string index out of range");
12651 return NULL;
12652 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012653 if (start >= length || end < start)
12654 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012655
Victor Stinner684d5fd2012-05-03 02:32:34 +020012656 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012657 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012658 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012659 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012660 }
12661 else {
12662 kind = PyUnicode_KIND(self);
12663 data = PyUnicode_1BYTE_DATA(self);
12664 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012665 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012666 length);
12667 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669
12670static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012671do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 Py_ssize_t len, i, j;
12674
12675 if (PyUnicode_READY(self) == -1)
12676 return NULL;
12677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012679
Victor Stinnercc7af722013-04-09 22:39:24 +020012680 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012681 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012682
12683 i = 0;
12684 if (striptype != RIGHTSTRIP) {
12685 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012686 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012687 if (!_Py_ascii_whitespace[ch])
12688 break;
12689 i++;
12690 }
12691 }
12692
12693 j = len;
12694 if (striptype != LEFTSTRIP) {
12695 j--;
12696 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012697 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012698 if (!_Py_ascii_whitespace[ch])
12699 break;
12700 j--;
12701 }
12702 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012703 }
12704 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012705 else {
12706 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012707 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012708
Victor Stinnercc7af722013-04-09 22:39:24 +020012709 i = 0;
12710 if (striptype != RIGHTSTRIP) {
12711 while (i < len) {
12712 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12713 if (!Py_UNICODE_ISSPACE(ch))
12714 break;
12715 i++;
12716 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012717 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012718
12719 j = len;
12720 if (striptype != LEFTSTRIP) {
12721 j--;
12722 while (j >= i) {
12723 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12724 if (!Py_UNICODE_ISSPACE(ch))
12725 break;
12726 j--;
12727 }
12728 j++;
12729 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012730 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012731
Victor Stinner7931d9a2011-11-04 00:22:48 +010012732 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733}
12734
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012735
12736static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012737do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012738{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012739 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012740 if (PyUnicode_Check(sep))
12741 return _PyUnicode_XStrip(self, striptype, sep);
12742 else {
12743 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 "%s arg must be None or str",
12745 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012746 return NULL;
12747 }
12748 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012749
Benjamin Peterson14339b62009-01-31 16:36:08 +000012750 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012751}
12752
12753
INADA Naoki3ae20562017-01-16 20:41:20 +090012754/*[clinic input]
12755str.strip as unicode_strip
12756
12757 chars: object = None
12758 /
12759
Zachary Ware09895c22019-10-09 16:09:00 -050012760Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012761
12762If chars is given and not None, remove characters in chars instead.
12763[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012764
12765static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012766unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012767/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012768{
INADA Naoki3ae20562017-01-16 20:41:20 +090012769 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012770}
12771
12772
INADA Naoki3ae20562017-01-16 20:41:20 +090012773/*[clinic input]
12774str.lstrip as unicode_lstrip
12775
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012776 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012777 /
12778
12779Return a copy of the string with leading whitespace removed.
12780
12781If chars is given and not None, remove characters in chars instead.
12782[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012783
12784static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012785unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012786/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012787{
INADA Naoki3ae20562017-01-16 20:41:20 +090012788 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012789}
12790
12791
INADA Naoki3ae20562017-01-16 20:41:20 +090012792/*[clinic input]
12793str.rstrip as unicode_rstrip
12794
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012795 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012796 /
12797
12798Return a copy of the string with trailing whitespace removed.
12799
12800If chars is given and not None, remove characters in chars instead.
12801[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012802
12803static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012804unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012805/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012806{
INADA Naoki3ae20562017-01-16 20:41:20 +090012807 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012808}
12809
12810
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012812unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012814 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816
Serhiy Storchaka05997252013-01-26 12:14:02 +020012817 if (len < 1)
12818 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819
Victor Stinnerc4b49542011-12-11 22:44:26 +010012820 /* no repeat, return original string */
12821 if (len == 1)
12822 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012823
Benjamin Petersonbac79492012-01-14 13:34:47 -050012824 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 return NULL;
12826
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012827 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012828 PyErr_SetString(PyExc_OverflowError,
12829 "repeated string is too long");
12830 return NULL;
12831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012833
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012834 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835 if (!u)
12836 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012837 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012839 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012840 int kind = PyUnicode_KIND(str);
12841 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012842 if (kind == PyUnicode_1BYTE_KIND) {
12843 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012844 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012845 }
12846 else if (kind == PyUnicode_2BYTE_KIND) {
12847 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012848 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012849 ucs2[n] = fill_char;
12850 } else {
12851 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12852 assert(kind == PyUnicode_4BYTE_KIND);
12853 for (n = 0; n < len; ++n)
12854 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856 }
12857 else {
12858 /* number of characters copied this far */
12859 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012860 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012862 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012865 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012866 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012867 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012868 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869 }
12870
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012871 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012872 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873}
12874
Alexander Belopolsky40018472011-02-26 01:02:56 +000012875PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012876PyUnicode_Replace(PyObject *str,
12877 PyObject *substr,
12878 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012879 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012881 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12882 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012883 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012884 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885}
12886
INADA Naoki3ae20562017-01-16 20:41:20 +090012887/*[clinic input]
12888str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889
INADA Naoki3ae20562017-01-16 20:41:20 +090012890 old: unicode
12891 new: unicode
12892 count: Py_ssize_t = -1
12893 Maximum number of occurrences to replace.
12894 -1 (the default value) means replace all occurrences.
12895 /
12896
12897Return a copy with all occurrences of substring old replaced by new.
12898
12899If the optional argument count is given, only the first count occurrences are
12900replaced.
12901[clinic start generated code]*/
12902
12903static PyObject *
12904unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12905 Py_ssize_t count)
12906/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012907{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012908 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012909 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012910 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012911}
12912
sweeneydea81849b2020-04-22 17:05:48 -040012913/*[clinic input]
12914str.removeprefix as unicode_removeprefix
12915
12916 prefix: unicode
12917 /
12918
12919Return a str with the given prefix string removed if present.
12920
12921If the string starts with the prefix string, return string[len(prefix):].
12922Otherwise, return a copy of the original string.
12923[clinic start generated code]*/
12924
12925static PyObject *
12926unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12927/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12928{
12929 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12930 if (match == -1) {
12931 return NULL;
12932 }
12933 if (match) {
12934 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12935 PyUnicode_GET_LENGTH(self));
12936 }
12937 return unicode_result_unchanged(self);
12938}
12939
12940/*[clinic input]
12941str.removesuffix as unicode_removesuffix
12942
12943 suffix: unicode
12944 /
12945
12946Return a str with the given suffix string removed if present.
12947
12948If the string ends with the suffix string and that suffix is not empty,
12949return string[:-len(suffix)]. Otherwise, return a copy of the original
12950string.
12951[clinic start generated code]*/
12952
12953static PyObject *
12954unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12955/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12956{
12957 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12958 if (match == -1) {
12959 return NULL;
12960 }
12961 if (match) {
12962 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12963 - PyUnicode_GET_LENGTH(suffix));
12964 }
12965 return unicode_result_unchanged(self);
12966}
12967
Alexander Belopolsky40018472011-02-26 01:02:56 +000012968static PyObject *
12969unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012971 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 Py_ssize_t isize;
12973 Py_ssize_t osize, squote, dquote, i, o;
12974 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012975 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012976 const void *idata;
12977 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012980 return NULL;
12981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 isize = PyUnicode_GET_LENGTH(unicode);
12983 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 /* Compute length of output, quote characters, and
12986 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012987 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988 max = 127;
12989 squote = dquote = 0;
12990 ikind = PyUnicode_KIND(unicode);
12991 for (i = 0; i < isize; i++) {
12992 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012993 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012995 case '\'': squote++; break;
12996 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012998 incr = 2;
12999 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 default:
13001 /* Fast-path ASCII */
13002 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013003 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013005 ;
13006 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013009 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013010 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013011 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013013 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013015 if (osize > PY_SSIZE_T_MAX - incr) {
13016 PyErr_SetString(PyExc_OverflowError,
13017 "string is too long to generate repr");
13018 return NULL;
13019 }
13020 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 }
13022
13023 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013024 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013026 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 if (dquote)
13028 /* Both squote and dquote present. Use squote,
13029 and escape them */
13030 osize += squote;
13031 else
13032 quote = '"';
13033 }
Victor Stinner55c08782013-04-14 18:45:39 +020013034 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035
13036 repr = PyUnicode_New(osize, max);
13037 if (repr == NULL)
13038 return NULL;
13039 okind = PyUnicode_KIND(repr);
13040 odata = PyUnicode_DATA(repr);
13041
13042 PyUnicode_WRITE(okind, odata, 0, quote);
13043 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013044 if (unchanged) {
13045 _PyUnicode_FastCopyCharacters(repr, 1,
13046 unicode, 0,
13047 isize);
13048 }
13049 else {
13050 for (i = 0, o = 1; i < isize; i++) {
13051 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013052
Victor Stinner55c08782013-04-14 18:45:39 +020013053 /* Escape quotes and backslashes */
13054 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013055 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013056 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013057 continue;
13058 }
13059
13060 /* Map special whitespace to '\t', \n', '\r' */
13061 if (ch == '\t') {
13062 PyUnicode_WRITE(okind, odata, o++, '\\');
13063 PyUnicode_WRITE(okind, odata, o++, 't');
13064 }
13065 else if (ch == '\n') {
13066 PyUnicode_WRITE(okind, odata, o++, '\\');
13067 PyUnicode_WRITE(okind, odata, o++, 'n');
13068 }
13069 else if (ch == '\r') {
13070 PyUnicode_WRITE(okind, odata, o++, '\\');
13071 PyUnicode_WRITE(okind, odata, o++, 'r');
13072 }
13073
13074 /* Map non-printable US ASCII to '\xhh' */
13075 else if (ch < ' ' || ch == 0x7F) {
13076 PyUnicode_WRITE(okind, odata, o++, '\\');
13077 PyUnicode_WRITE(okind, odata, o++, 'x');
13078 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13079 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13080 }
13081
13082 /* Copy ASCII characters as-is */
13083 else if (ch < 0x7F) {
13084 PyUnicode_WRITE(okind, odata, o++, ch);
13085 }
13086
13087 /* Non-ASCII characters */
13088 else {
13089 /* Map Unicode whitespace and control characters
13090 (categories Z* and C* except ASCII space)
13091 */
13092 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13093 PyUnicode_WRITE(okind, odata, o++, '\\');
13094 /* Map 8-bit characters to '\xhh' */
13095 if (ch <= 0xff) {
13096 PyUnicode_WRITE(okind, odata, o++, 'x');
13097 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13098 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13099 }
13100 /* Map 16-bit characters to '\uxxxx' */
13101 else if (ch <= 0xffff) {
13102 PyUnicode_WRITE(okind, odata, o++, 'u');
13103 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13104 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13105 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13106 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13107 }
13108 /* Map 21-bit characters to '\U00xxxxxx' */
13109 else {
13110 PyUnicode_WRITE(okind, odata, o++, 'U');
13111 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13112 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13113 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13114 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13115 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13116 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13117 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13118 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13119 }
13120 }
13121 /* Copy characters as-is */
13122 else {
13123 PyUnicode_WRITE(okind, odata, o++, ch);
13124 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013125 }
13126 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013128 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013129 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013130 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131}
13132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013133PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013134 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135\n\
13136Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013137such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138arguments start and end are interpreted as in slice notation.\n\
13139\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013140Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141
13142static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013145 /* initialize variables to prevent gcc warning */
13146 PyObject *substring = NULL;
13147 Py_ssize_t start = 0;
13148 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013149 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013151 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013154 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013155 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013157 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013159 if (result == -2)
13160 return NULL;
13161
Christian Heimes217cfd12007-12-02 14:31:20 +000013162 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013163}
13164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013165PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013166 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013168Return the highest index in S where substring sub is found,\n\
13169such that sub is contained within S[start:end]. Optional\n\
13170arguments start and end are interpreted as in slice notation.\n\
13171\n\
13172Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013173
13174static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013175unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013176{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013177 /* initialize variables to prevent gcc warning */
13178 PyObject *substring = NULL;
13179 Py_ssize_t start = 0;
13180 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013181 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013182
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013183 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013186 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013187 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013188
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013189 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191 if (result == -2)
13192 return NULL;
13193
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194 if (result < 0) {
13195 PyErr_SetString(PyExc_ValueError, "substring not found");
13196 return NULL;
13197 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198
Christian Heimes217cfd12007-12-02 14:31:20 +000013199 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200}
13201
INADA Naoki3ae20562017-01-16 20:41:20 +090013202/*[clinic input]
13203str.rjust as unicode_rjust
13204
13205 width: Py_ssize_t
13206 fillchar: Py_UCS4 = ' '
13207 /
13208
13209Return a right-justified string of length width.
13210
13211Padding is done using the specified fill character (default is a space).
13212[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013213
13214static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013215unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13216/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013218 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219 return NULL;
13220
Victor Stinnerc4b49542011-12-11 22:44:26 +010013221 if (PyUnicode_GET_LENGTH(self) >= width)
13222 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223
Victor Stinnerc4b49542011-12-11 22:44:26 +010013224 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013225}
13226
Alexander Belopolsky40018472011-02-26 01:02:56 +000013227PyObject *
13228PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013229{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013230 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013232
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013233 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234}
13235
INADA Naoki3ae20562017-01-16 20:41:20 +090013236/*[clinic input]
13237str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238
INADA Naoki3ae20562017-01-16 20:41:20 +090013239 sep: object = None
13240 The delimiter according which to split the string.
13241 None (the default value) means split according to any whitespace,
13242 and discard empty strings from the result.
13243 maxsplit: Py_ssize_t = -1
13244 Maximum number of splits to do.
13245 -1 (the default value) means no limit.
13246
13247Return a list of the words in the string, using sep as the delimiter string.
13248[clinic start generated code]*/
13249
13250static PyObject *
13251unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13252/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253{
INADA Naoki3ae20562017-01-16 20:41:20 +090013254 if (sep == Py_None)
13255 return split(self, NULL, maxsplit);
13256 if (PyUnicode_Check(sep))
13257 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013258
Victor Stinner998b8062018-09-12 00:23:25 +020013259 PyErr_Format(PyExc_TypeError,
13260 "must be str or None, not %.100s",
13261 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013263}
13264
Thomas Wouters477c8d52006-05-27 19:21:47 +000013265PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013266PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013267{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013268 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013269 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013270 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013271 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013272
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013273 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013274 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013275
Victor Stinner14f8f022011-10-05 20:58:25 +020013276 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013278 len1 = PyUnicode_GET_LENGTH(str_obj);
13279 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013280 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013281 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013282 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013283 }
13284 buf1 = PyUnicode_DATA(str_obj);
13285 buf2 = PyUnicode_DATA(sep_obj);
13286 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013287 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013288 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013289 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013290 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013291
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013292 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013294 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13295 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13296 else
13297 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298 break;
13299 case PyUnicode_2BYTE_KIND:
13300 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13301 break;
13302 case PyUnicode_4BYTE_KIND:
13303 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13304 break;
13305 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013306 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013307 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013308
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013309 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013310 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013311 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013312
13313 return out;
13314}
13315
13316
13317PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013318PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013319{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013320 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013321 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013322 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013323 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013324
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013325 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013326 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013327
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013328 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013329 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013330 len1 = PyUnicode_GET_LENGTH(str_obj);
13331 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013332 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013333 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013334 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013335 }
13336 buf1 = PyUnicode_DATA(str_obj);
13337 buf2 = PyUnicode_DATA(sep_obj);
13338 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013339 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013340 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013341 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013342 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013343
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013344 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013345 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013346 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13347 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13348 else
13349 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013350 break;
13351 case PyUnicode_2BYTE_KIND:
13352 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13353 break;
13354 case PyUnicode_4BYTE_KIND:
13355 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13356 break;
13357 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013358 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013359 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013360
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013361 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013362 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013363 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013364
13365 return out;
13366}
13367
INADA Naoki3ae20562017-01-16 20:41:20 +090013368/*[clinic input]
13369str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013370
INADA Naoki3ae20562017-01-16 20:41:20 +090013371 sep: object
13372 /
13373
13374Partition the string into three parts using the given separator.
13375
13376This will search for the separator in the string. If the separator is found,
13377returns a 3-tuple containing the part before the separator, the separator
13378itself, and the part after it.
13379
13380If the separator is not found, returns a 3-tuple containing the original string
13381and two empty strings.
13382[clinic start generated code]*/
13383
13384static PyObject *
13385unicode_partition(PyObject *self, PyObject *sep)
13386/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013387{
INADA Naoki3ae20562017-01-16 20:41:20 +090013388 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013389}
13390
INADA Naoki3ae20562017-01-16 20:41:20 +090013391/*[clinic input]
13392str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013393
INADA Naoki3ae20562017-01-16 20:41:20 +090013394Partition the string into three parts using the given separator.
13395
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013396This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013397the separator is found, returns a 3-tuple containing the part before the
13398separator, the separator itself, and the part after it.
13399
13400If the separator is not found, returns a 3-tuple containing two empty strings
13401and the original string.
13402[clinic start generated code]*/
13403
13404static PyObject *
13405unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013406/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013407{
INADA Naoki3ae20562017-01-16 20:41:20 +090013408 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013409}
13410
Alexander Belopolsky40018472011-02-26 01:02:56 +000013411PyObject *
13412PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013413{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013414 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013415 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013416
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013417 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013418}
13419
INADA Naoki3ae20562017-01-16 20:41:20 +090013420/*[clinic input]
13421str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013422
INADA Naoki3ae20562017-01-16 20:41:20 +090013423Return a list of the words in the string, using sep as the delimiter string.
13424
13425Splits are done starting at the end of the string and working to the front.
13426[clinic start generated code]*/
13427
13428static PyObject *
13429unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13430/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013431{
INADA Naoki3ae20562017-01-16 20:41:20 +090013432 if (sep == Py_None)
13433 return rsplit(self, NULL, maxsplit);
13434 if (PyUnicode_Check(sep))
13435 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013436
Victor Stinner998b8062018-09-12 00:23:25 +020013437 PyErr_Format(PyExc_TypeError,
13438 "must be str or None, not %.100s",
13439 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013440 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013441}
13442
INADA Naoki3ae20562017-01-16 20:41:20 +090013443/*[clinic input]
13444str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013446 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013447
13448Return a list of the lines in the string, breaking at line boundaries.
13449
13450Line breaks are not included in the resulting list unless keepends is given and
13451true.
13452[clinic start generated code]*/
13453
13454static PyObject *
13455unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013456/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013457{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013458 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013459}
13460
13461static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013462PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013463{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013464 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013465}
13466
INADA Naoki3ae20562017-01-16 20:41:20 +090013467/*[clinic input]
13468str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013469
INADA Naoki3ae20562017-01-16 20:41:20 +090013470Convert uppercase characters to lowercase and lowercase characters to uppercase.
13471[clinic start generated code]*/
13472
13473static PyObject *
13474unicode_swapcase_impl(PyObject *self)
13475/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013476{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013477 if (PyUnicode_READY(self) == -1)
13478 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013479 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013480}
13481
Larry Hastings61272b72014-01-07 12:41:53 -080013482/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013483
Larry Hastings31826802013-10-19 00:09:25 -070013484@staticmethod
13485str.maketrans as unicode_maketrans
13486
13487 x: object
13488
13489 y: unicode=NULL
13490
13491 z: unicode=NULL
13492
13493 /
13494
13495Return a translation table usable for str.translate().
13496
13497If there is only one argument, it must be a dictionary mapping Unicode
13498ordinals (integers) or characters to Unicode ordinals, strings or None.
13499Character keys will be then converted to ordinals.
13500If there are two arguments, they must be strings of equal length, and
13501in the resulting dictionary, each character in x will be mapped to the
13502character at the same position in y. If there is a third argument, it
13503must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013504[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013505
Larry Hastings31826802013-10-19 00:09:25 -070013506static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013507unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013508/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013509{
Georg Brandlceee0772007-11-27 23:48:05 +000013510 PyObject *new = NULL, *key, *value;
13511 Py_ssize_t i = 0;
13512 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013513
Georg Brandlceee0772007-11-27 23:48:05 +000013514 new = PyDict_New();
13515 if (!new)
13516 return NULL;
13517 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013518 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013519 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013520
Georg Brandlceee0772007-11-27 23:48:05 +000013521 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013522 if (!PyUnicode_Check(x)) {
13523 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13524 "be a string if there is a second argument");
13525 goto err;
13526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013527 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013528 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13529 "arguments must have equal length");
13530 goto err;
13531 }
13532 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013533 x_kind = PyUnicode_KIND(x);
13534 y_kind = PyUnicode_KIND(y);
13535 x_data = PyUnicode_DATA(x);
13536 y_data = PyUnicode_DATA(y);
13537 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13538 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013539 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013540 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013541 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013542 if (!value) {
13543 Py_DECREF(key);
13544 goto err;
13545 }
Georg Brandlceee0772007-11-27 23:48:05 +000013546 res = PyDict_SetItem(new, key, value);
13547 Py_DECREF(key);
13548 Py_DECREF(value);
13549 if (res < 0)
13550 goto err;
13551 }
13552 /* create entries for deleting chars in z */
13553 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013554 z_kind = PyUnicode_KIND(z);
13555 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013556 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013557 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013558 if (!key)
13559 goto err;
13560 res = PyDict_SetItem(new, key, Py_None);
13561 Py_DECREF(key);
13562 if (res < 0)
13563 goto err;
13564 }
13565 }
13566 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013567 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013568 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013569
Georg Brandlceee0772007-11-27 23:48:05 +000013570 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013571 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013572 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13573 "to maketrans it must be a dict");
13574 goto err;
13575 }
13576 /* copy entries into the new dict, converting string keys to int keys */
13577 while (PyDict_Next(x, &i, &key, &value)) {
13578 if (PyUnicode_Check(key)) {
13579 /* convert string keys to integer keys */
13580 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013581 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013582 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13583 "table must be of length 1");
13584 goto err;
13585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013586 kind = PyUnicode_KIND(key);
13587 data = PyUnicode_DATA(key);
13588 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013589 if (!newkey)
13590 goto err;
13591 res = PyDict_SetItem(new, newkey, value);
13592 Py_DECREF(newkey);
13593 if (res < 0)
13594 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013595 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013596 /* just keep integer keys */
13597 if (PyDict_SetItem(new, key, value) < 0)
13598 goto err;
13599 } else {
13600 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13601 "be strings or integers");
13602 goto err;
13603 }
13604 }
13605 }
13606 return new;
13607 err:
13608 Py_DECREF(new);
13609 return NULL;
13610}
13611
INADA Naoki3ae20562017-01-16 20:41:20 +090013612/*[clinic input]
13613str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013614
INADA Naoki3ae20562017-01-16 20:41:20 +090013615 table: object
13616 Translation table, which must be a mapping of Unicode ordinals to
13617 Unicode ordinals, strings, or None.
13618 /
13619
13620Replace each character in the string using the given translation table.
13621
13622The table must implement lookup/indexing via __getitem__, for instance a
13623dictionary or list. If this operation raises LookupError, the character is
13624left untouched. Characters mapped to None are deleted.
13625[clinic start generated code]*/
13626
13627static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013628unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013629/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013630{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013632}
13633
INADA Naoki3ae20562017-01-16 20:41:20 +090013634/*[clinic input]
13635str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013636
INADA Naoki3ae20562017-01-16 20:41:20 +090013637Return a copy of the string converted to uppercase.
13638[clinic start generated code]*/
13639
13640static PyObject *
13641unicode_upper_impl(PyObject *self)
13642/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013643{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013644 if (PyUnicode_READY(self) == -1)
13645 return NULL;
13646 if (PyUnicode_IS_ASCII(self))
13647 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013648 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013649}
13650
INADA Naoki3ae20562017-01-16 20:41:20 +090013651/*[clinic input]
13652str.zfill as unicode_zfill
13653
13654 width: Py_ssize_t
13655 /
13656
13657Pad a numeric string with zeros on the left, to fill a field of the given width.
13658
13659The string is never truncated.
13660[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013661
13662static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013663unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013664/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013665{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013666 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013667 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013668 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013669 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013670 Py_UCS4 chr;
13671
Benjamin Petersonbac79492012-01-14 13:34:47 -050013672 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013673 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013674
Victor Stinnerc4b49542011-12-11 22:44:26 +010013675 if (PyUnicode_GET_LENGTH(self) >= width)
13676 return unicode_result_unchanged(self);
13677
13678 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013679
13680 u = pad(self, fill, 0, '0');
13681
Walter Dörwald068325e2002-04-15 13:36:47 +000013682 if (u == NULL)
13683 return NULL;
13684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013685 kind = PyUnicode_KIND(u);
13686 data = PyUnicode_DATA(u);
13687 chr = PyUnicode_READ(kind, data, fill);
13688
13689 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013690 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013691 PyUnicode_WRITE(kind, data, 0, chr);
13692 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013693 }
13694
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013695 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013696 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013697}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013698
#if 0
/* Compiled out.  If enabled, this would forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013707PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013708 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013709\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013710Return True if S starts with the specified prefix, False otherwise.\n\
13711With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013712With optional end, stop comparing S at that position.\n\
13713prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013714
13715static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013716unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013717 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013718{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013719 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013720 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013721 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013722 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013723 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013724
Jesus Ceaac451502011-04-20 17:09:23 +020013725 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013726 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013727 if (PyTuple_Check(subobj)) {
13728 Py_ssize_t i;
13729 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013730 substring = PyTuple_GET_ITEM(subobj, i);
13731 if (!PyUnicode_Check(substring)) {
13732 PyErr_Format(PyExc_TypeError,
13733 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013734 "not %.100s",
13735 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013736 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013737 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013738 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013739 if (result == -1)
13740 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013741 if (result) {
13742 Py_RETURN_TRUE;
13743 }
13744 }
13745 /* nothing matched */
13746 Py_RETURN_FALSE;
13747 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013748 if (!PyUnicode_Check(subobj)) {
13749 PyErr_Format(PyExc_TypeError,
13750 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013751 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013752 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013753 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013754 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013755 if (result == -1)
13756 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013757 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013758}
13759
13760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013761PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013762 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013763\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013764Return True if S ends with the specified suffix, False otherwise.\n\
13765With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013766With optional end, stop comparing S at that position.\n\
13767suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013768
13769static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013770unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013771 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013772{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013773 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013774 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013775 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013776 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013777 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013778
Jesus Ceaac451502011-04-20 17:09:23 +020013779 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013780 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013781 if (PyTuple_Check(subobj)) {
13782 Py_ssize_t i;
13783 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013784 substring = PyTuple_GET_ITEM(subobj, i);
13785 if (!PyUnicode_Check(substring)) {
13786 PyErr_Format(PyExc_TypeError,
13787 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013788 "not %.100s",
13789 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013790 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013791 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013792 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013793 if (result == -1)
13794 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013795 if (result) {
13796 Py_RETURN_TRUE;
13797 }
13798 }
13799 Py_RETURN_FALSE;
13800 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013801 if (!PyUnicode_Check(subobj)) {
13802 PyErr_Format(PyExc_TypeError,
13803 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013804 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013805 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013806 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013807 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013808 if (result == -1)
13809 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013810 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013811}
13812
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013813static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013814_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013815{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013816 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13817 writer->data = PyUnicode_DATA(writer->buffer);
13818
13819 if (!writer->readonly) {
13820 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013821 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013822 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013823 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013824 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13825 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13826 writer->kind = PyUnicode_WCHAR_KIND;
13827 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13828
Victor Stinner8f674cc2013-04-17 23:02:17 +020013829 /* Copy-on-write mode: set buffer size to 0 so
13830 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13831 * next write. */
13832 writer->size = 0;
13833 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013834}
13835
Victor Stinnerd3f08822012-05-29 12:57:52 +020013836void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013837_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013838{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013839 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013840
13841 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013842 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013843
13844 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13845 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13846 writer->kind = PyUnicode_WCHAR_KIND;
13847 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013848}
13849
Inada Naoki770847a2019-06-24 12:30:24 +090013850// Initialize _PyUnicodeWriter with initial buffer
13851static inline void
13852_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13853{
13854 memset(writer, 0, sizeof(*writer));
13855 writer->buffer = buffer;
13856 _PyUnicodeWriter_Update(writer);
13857 writer->min_length = writer->size;
13858}
13859
Victor Stinnerd3f08822012-05-29 12:57:52 +020013860int
13861_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13862 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013863{
13864 Py_ssize_t newlen;
13865 PyObject *newbuffer;
13866
Victor Stinner2740e462016-09-06 16:58:36 -070013867 assert(maxchar <= MAX_UNICODE);
13868
Victor Stinnerca9381e2015-09-22 00:58:32 +020013869 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013870 assert((maxchar > writer->maxchar && length >= 0)
13871 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013872
Victor Stinner202fdca2012-05-07 12:47:02 +020013873 if (length > PY_SSIZE_T_MAX - writer->pos) {
13874 PyErr_NoMemory();
13875 return -1;
13876 }
13877 newlen = writer->pos + length;
13878
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013879 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013880
Victor Stinnerd3f08822012-05-29 12:57:52 +020013881 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013882 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013883 if (writer->overallocate
13884 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13885 /* overallocate to limit the number of realloc() */
13886 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013887 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013888 if (newlen < writer->min_length)
13889 newlen = writer->min_length;
13890
Victor Stinnerd3f08822012-05-29 12:57:52 +020013891 writer->buffer = PyUnicode_New(newlen, maxchar);
13892 if (writer->buffer == NULL)
13893 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013894 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013895 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013896 if (writer->overallocate
13897 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13898 /* overallocate to limit the number of realloc() */
13899 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013900 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013901 if (newlen < writer->min_length)
13902 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013903
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013904 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013905 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013906 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013907 newbuffer = PyUnicode_New(newlen, maxchar);
13908 if (newbuffer == NULL)
13909 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013910 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13911 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013912 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013913 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013914 }
13915 else {
13916 newbuffer = resize_compact(writer->buffer, newlen);
13917 if (newbuffer == NULL)
13918 return -1;
13919 }
13920 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013921 }
13922 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013923 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013924 newbuffer = PyUnicode_New(writer->size, maxchar);
13925 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013926 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013927 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13928 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013929 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013930 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013931 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013932 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013933
13934#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013935}
13936
Victor Stinnerca9381e2015-09-22 00:58:32 +020013937int
13938_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13939 enum PyUnicode_Kind kind)
13940{
13941 Py_UCS4 maxchar;
13942
13943 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13944 assert(writer->kind < kind);
13945
13946 switch (kind)
13947 {
13948 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13949 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13950 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13951 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013952 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013953 }
13954
13955 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13956}
13957
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013958static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013959_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013960{
Victor Stinner2740e462016-09-06 16:58:36 -070013961 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013962 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13963 return -1;
13964 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13965 writer->pos++;
13966 return 0;
13967}
13968
13969int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013970_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13971{
13972 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13973}
13974
13975int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013976_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13977{
13978 Py_UCS4 maxchar;
13979 Py_ssize_t len;
13980
13981 if (PyUnicode_READY(str) == -1)
13982 return -1;
13983 len = PyUnicode_GET_LENGTH(str);
13984 if (len == 0)
13985 return 0;
13986 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13987 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013988 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013989 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013990 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013991 Py_INCREF(str);
13992 writer->buffer = str;
13993 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013994 writer->pos += len;
13995 return 0;
13996 }
13997 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13998 return -1;
13999 }
14000 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14001 str, 0, len);
14002 writer->pos += len;
14003 return 0;
14004}
14005
Victor Stinnere215d962012-10-06 23:03:36 +020014006int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014007_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14008 Py_ssize_t start, Py_ssize_t end)
14009{
14010 Py_UCS4 maxchar;
14011 Py_ssize_t len;
14012
14013 if (PyUnicode_READY(str) == -1)
14014 return -1;
14015
14016 assert(0 <= start);
14017 assert(end <= PyUnicode_GET_LENGTH(str));
14018 assert(start <= end);
14019
14020 if (end == 0)
14021 return 0;
14022
14023 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14024 return _PyUnicodeWriter_WriteStr(writer, str);
14025
14026 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14027 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14028 else
14029 maxchar = writer->maxchar;
14030 len = end - start;
14031
14032 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14033 return -1;
14034
14035 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14036 str, start, len);
14037 writer->pos += len;
14038 return 0;
14039}
14040
14041int
Victor Stinner4a587072013-11-19 12:54:53 +010014042_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14043 const char *ascii, Py_ssize_t len)
14044{
14045 if (len == -1)
14046 len = strlen(ascii);
14047
Andy Lestere6be9b52020-02-11 20:28:35 -060014048 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014049
14050 if (writer->buffer == NULL && !writer->overallocate) {
14051 PyObject *str;
14052
14053 str = _PyUnicode_FromASCII(ascii, len);
14054 if (str == NULL)
14055 return -1;
14056
14057 writer->readonly = 1;
14058 writer->buffer = str;
14059 _PyUnicodeWriter_Update(writer);
14060 writer->pos += len;
14061 return 0;
14062 }
14063
14064 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14065 return -1;
14066
14067 switch (writer->kind)
14068 {
14069 case PyUnicode_1BYTE_KIND:
14070 {
14071 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14072 Py_UCS1 *data = writer->data;
14073
Christian Heimesf051e432016-09-13 20:22:02 +020014074 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014075 break;
14076 }
14077 case PyUnicode_2BYTE_KIND:
14078 {
14079 _PyUnicode_CONVERT_BYTES(
14080 Py_UCS1, Py_UCS2,
14081 ascii, ascii + len,
14082 (Py_UCS2 *)writer->data + writer->pos);
14083 break;
14084 }
14085 case PyUnicode_4BYTE_KIND:
14086 {
14087 _PyUnicode_CONVERT_BYTES(
14088 Py_UCS1, Py_UCS4,
14089 ascii, ascii + len,
14090 (Py_UCS4 *)writer->data + writer->pos);
14091 break;
14092 }
14093 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014094 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014095 }
14096
14097 writer->pos += len;
14098 return 0;
14099}
14100
14101int
14102_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14103 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014104{
14105 Py_UCS4 maxchar;
14106
Andy Lestere6be9b52020-02-11 20:28:35 -060014107 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014108 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14109 return -1;
14110 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14111 writer->pos += len;
14112 return 0;
14113}
14114
Victor Stinnerd3f08822012-05-29 12:57:52 +020014115PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014116_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014117{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014118 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014119
Victor Stinnerd3f08822012-05-29 12:57:52 +020014120 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014121 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014122 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014123 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014124
14125 str = writer->buffer;
14126 writer->buffer = NULL;
14127
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014128 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014129 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14130 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014131 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014132
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014133 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14134 PyObject *str2;
14135 str2 = resize_compact(str, writer->pos);
14136 if (str2 == NULL) {
14137 Py_DECREF(str);
14138 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014139 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014140 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014141 }
14142
Victor Stinner15a0bd32013-07-08 22:29:55 +020014143 assert(_PyUnicode_CheckConsistency(str, 1));
14144 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014145}
14146
Victor Stinnerd3f08822012-05-29 12:57:52 +020014147void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014148_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014149{
14150 Py_CLEAR(writer->buffer);
14151}
14152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014153#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014154
14155PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014156 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014157\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014158Return a formatted version of S, using substitutions from args and kwargs.\n\
14159The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014160
Eric Smith27bbca62010-11-04 17:06:58 +000014161PyDoc_STRVAR(format_map__doc__,
14162 "S.format_map(mapping) -> str\n\
14163\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014164Return a formatted version of S, using substitutions from mapping.\n\
14165The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014166
INADA Naoki3ae20562017-01-16 20:41:20 +090014167/*[clinic input]
14168str.__format__ as unicode___format__
14169
14170 format_spec: unicode
14171 /
14172
14173Return a formatted version of the string as described by format_spec.
14174[clinic start generated code]*/
14175
Eric Smith4a7d76d2008-05-30 18:10:19 +000014176static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014177unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014178/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014179{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014180 _PyUnicodeWriter writer;
14181 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014182
Victor Stinnerd3f08822012-05-29 12:57:52 +020014183 if (PyUnicode_READY(self) == -1)
14184 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014185 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014186 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14187 self, format_spec, 0,
14188 PyUnicode_GET_LENGTH(format_spec));
14189 if (ret == -1) {
14190 _PyUnicodeWriter_Dealloc(&writer);
14191 return NULL;
14192 }
14193 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014194}
14195
INADA Naoki3ae20562017-01-16 20:41:20 +090014196/*[clinic input]
14197str.__sizeof__ as unicode_sizeof
14198
14199Return the size of the string in memory, in bytes.
14200[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014201
14202static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014203unicode_sizeof_impl(PyObject *self)
14204/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014205{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014206 Py_ssize_t size;
14207
14208 /* If it's a compact object, account for base structure +
14209 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014210 if (PyUnicode_IS_COMPACT_ASCII(self))
14211 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14212 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014213 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014214 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014215 else {
14216 /* If it is a two-block object, account for base object, and
14217 for character block if present. */
14218 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014219 if (_PyUnicode_DATA_ANY(self))
14220 size += (PyUnicode_GET_LENGTH(self) + 1) *
14221 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014222 }
14223 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014224 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014225 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14226 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14227 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14228 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014229
14230 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014231}
14232
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014233static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014234unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014235{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014236 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014237 if (!copy)
14238 return NULL;
14239 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014240}
14241
Guido van Rossumd57fd912000-03-10 22:53:23 +000014242static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014243 UNICODE_ENCODE_METHODDEF
14244 UNICODE_REPLACE_METHODDEF
14245 UNICODE_SPLIT_METHODDEF
14246 UNICODE_RSPLIT_METHODDEF
14247 UNICODE_JOIN_METHODDEF
14248 UNICODE_CAPITALIZE_METHODDEF
14249 UNICODE_CASEFOLD_METHODDEF
14250 UNICODE_TITLE_METHODDEF
14251 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014252 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014253 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014254 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014255 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014256 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014257 UNICODE_LJUST_METHODDEF
14258 UNICODE_LOWER_METHODDEF
14259 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014260 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14261 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014262 UNICODE_RJUST_METHODDEF
14263 UNICODE_RSTRIP_METHODDEF
14264 UNICODE_RPARTITION_METHODDEF
14265 UNICODE_SPLITLINES_METHODDEF
14266 UNICODE_STRIP_METHODDEF
14267 UNICODE_SWAPCASE_METHODDEF
14268 UNICODE_TRANSLATE_METHODDEF
14269 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014270 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14271 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014272 UNICODE_REMOVEPREFIX_METHODDEF
14273 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014274 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014275 UNICODE_ISLOWER_METHODDEF
14276 UNICODE_ISUPPER_METHODDEF
14277 UNICODE_ISTITLE_METHODDEF
14278 UNICODE_ISSPACE_METHODDEF
14279 UNICODE_ISDECIMAL_METHODDEF
14280 UNICODE_ISDIGIT_METHODDEF
14281 UNICODE_ISNUMERIC_METHODDEF
14282 UNICODE_ISALPHA_METHODDEF
14283 UNICODE_ISALNUM_METHODDEF
14284 UNICODE_ISIDENTIFIER_METHODDEF
14285 UNICODE_ISPRINTABLE_METHODDEF
14286 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014287 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014288 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014289 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014290 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014291 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014292#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014293 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014294 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014295#endif
14296
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014297 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014298 {NULL, NULL}
14299};
14300
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014301static PyObject *
14302unicode_mod(PyObject *v, PyObject *w)
14303{
Brian Curtindfc80e32011-08-10 20:28:54 -050014304 if (!PyUnicode_Check(v))
14305 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014306 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014307}
14308
14309static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014310 0, /*nb_add*/
14311 0, /*nb_subtract*/
14312 0, /*nb_multiply*/
14313 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014314};
14315
Guido van Rossumd57fd912000-03-10 22:53:23 +000014316static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014317 (lenfunc) unicode_length, /* sq_length */
14318 PyUnicode_Concat, /* sq_concat */
14319 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14320 (ssizeargfunc) unicode_getitem, /* sq_item */
14321 0, /* sq_slice */
14322 0, /* sq_ass_item */
14323 0, /* sq_ass_slice */
14324 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014325};
14326
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014327static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014328unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014329{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014330 if (PyUnicode_READY(self) == -1)
14331 return NULL;
14332
Victor Stinnera15e2602020-04-08 02:01:56 +020014333 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014334 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014335 if (i == -1 && PyErr_Occurred())
14336 return NULL;
14337 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014338 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014339 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014340 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014341 Py_ssize_t start, stop, step, slicelength, i;
14342 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014343 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014344 const void *src_data;
14345 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014346 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014347 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014348
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014349 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014350 return NULL;
14351 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014352 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14353 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014354
14355 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014356 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014357 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014358 slicelength == PyUnicode_GET_LENGTH(self)) {
14359 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014360 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014361 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014362 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014363 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014364 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014365 src_kind = PyUnicode_KIND(self);
14366 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014367 if (!PyUnicode_IS_ASCII(self)) {
14368 kind_limit = kind_maxchar_limit(src_kind);
14369 max_char = 0;
14370 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14371 ch = PyUnicode_READ(src_kind, src_data, cur);
14372 if (ch > max_char) {
14373 max_char = ch;
14374 if (max_char >= kind_limit)
14375 break;
14376 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014377 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014378 }
Victor Stinner55c99112011-10-13 01:17:06 +020014379 else
14380 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014381 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014382 if (result == NULL)
14383 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014384 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014385 dest_data = PyUnicode_DATA(result);
14386
14387 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014388 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14389 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014390 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014391 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014392 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014393 } else {
14394 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14395 return NULL;
14396 }
14397}
14398
14399static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014400 (lenfunc)unicode_length, /* mp_length */
14401 (binaryfunc)unicode_subscript, /* mp_subscript */
14402 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014403};
14404
Guido van Rossumd57fd912000-03-10 22:53:23 +000014405
Guido van Rossumd57fd912000-03-10 22:53:23 +000014406/* Helpers for PyUnicode_Format() */
14407
Victor Stinnera47082312012-10-04 02:19:54 +020014408struct unicode_formatter_t {
14409 PyObject *args;
14410 int args_owned;
14411 Py_ssize_t arglen, argidx;
14412 PyObject *dict;
14413
14414 enum PyUnicode_Kind fmtkind;
14415 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014416 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014417 PyObject *fmtstr;
14418
14419 _PyUnicodeWriter writer;
14420};
14421
14422struct unicode_format_arg_t {
14423 Py_UCS4 ch;
14424 int flags;
14425 Py_ssize_t width;
14426 int prec;
14427 int sign;
14428};
14429
Guido van Rossumd57fd912000-03-10 22:53:23 +000014430static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014431unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014432{
Victor Stinnera47082312012-10-04 02:19:54 +020014433 Py_ssize_t argidx = ctx->argidx;
14434
14435 if (argidx < ctx->arglen) {
14436 ctx->argidx++;
14437 if (ctx->arglen < 0)
14438 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014439 else
Victor Stinnera47082312012-10-04 02:19:54 +020014440 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014441 }
14442 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014443 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014444 return NULL;
14445}
14446
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014447/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014448
Victor Stinnera47082312012-10-04 02:19:54 +020014449/* Format a float into the writer if the writer is not NULL, or into *p_output
14450 otherwise.
14451
14452 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014453static int
Victor Stinnera47082312012-10-04 02:19:54 +020014454formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14455 PyObject **p_output,
14456 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014457{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014458 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014459 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014460 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014461 int prec;
14462 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014463
Guido van Rossumd57fd912000-03-10 22:53:23 +000014464 x = PyFloat_AsDouble(v);
14465 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014466 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014467
Victor Stinnera47082312012-10-04 02:19:54 +020014468 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014469 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014470 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014471
Victor Stinnera47082312012-10-04 02:19:54 +020014472 if (arg->flags & F_ALT)
14473 dtoa_flags = Py_DTSF_ALT;
14474 else
14475 dtoa_flags = 0;
14476 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014477 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014478 return -1;
14479 len = strlen(p);
14480 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014481 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014482 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014483 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014484 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014485 }
14486 else
14487 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014488 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014489 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014490}
14491
Victor Stinnerd0880d52012-04-27 23:40:13 +020014492/* formatlong() emulates the format codes d, u, o, x and X, and
14493 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14494 * Python's regular ints.
14495 * Return value: a new PyUnicodeObject*, or NULL if error.
14496 * The output string is of the form
14497 * "-"? ("0x" | "0X")? digit+
14498 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14499 * set in flags. The case of hex digits will be correct,
14500 * There will be at least prec digits, zero-filled on the left if
14501 * necessary to get that many.
14502 * val object to be converted
14503 * flags bitmask of format flags; only F_ALT is looked at
14504 * prec minimum number of digits; 0-fill on left if needed
14505 * type a character in [duoxX]; u acts the same as d
14506 *
14507 * CAUTION: o, x and X conversions on regular ints can never
14508 * produce a '-' sign, but can for Python's unbounded ints.
14509 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014510PyObject *
14511_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014512{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014513 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014514 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014515 Py_ssize_t i;
14516 int sign; /* 1 if '-', else 0 */
14517 int len; /* number of characters */
14518 Py_ssize_t llen;
14519 int numdigits; /* len == numnondigits + numdigits */
14520 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014521
Victor Stinnerd0880d52012-04-27 23:40:13 +020014522 /* Avoid exceeding SSIZE_T_MAX */
14523 if (prec > INT_MAX-3) {
14524 PyErr_SetString(PyExc_OverflowError,
14525 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014526 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014527 }
14528
14529 assert(PyLong_Check(val));
14530
14531 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014532 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014533 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014534 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014535 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014536 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014537 /* int and int subclasses should print numerically when a numeric */
14538 /* format code is used (see issue18780) */
14539 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014540 break;
14541 case 'o':
14542 numnondigits = 2;
14543 result = PyNumber_ToBase(val, 8);
14544 break;
14545 case 'x':
14546 case 'X':
14547 numnondigits = 2;
14548 result = PyNumber_ToBase(val, 16);
14549 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014550 }
14551 if (!result)
14552 return NULL;
14553
14554 assert(unicode_modifiable(result));
14555 assert(PyUnicode_IS_READY(result));
14556 assert(PyUnicode_IS_ASCII(result));
14557
14558 /* To modify the string in-place, there can only be one reference. */
14559 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014560 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014561 PyErr_BadInternalCall();
14562 return NULL;
14563 }
14564 buf = PyUnicode_DATA(result);
14565 llen = PyUnicode_GET_LENGTH(result);
14566 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014567 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014568 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014569 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014570 return NULL;
14571 }
14572 len = (int)llen;
14573 sign = buf[0] == '-';
14574 numnondigits += sign;
14575 numdigits = len - numnondigits;
14576 assert(numdigits > 0);
14577
14578 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014579 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014580 (type == 'o' || type == 'x' || type == 'X'))) {
14581 assert(buf[sign] == '0');
14582 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14583 buf[sign+1] == 'o');
14584 numnondigits -= 2;
14585 buf += 2;
14586 len -= 2;
14587 if (sign)
14588 buf[0] = '-';
14589 assert(len == numnondigits + numdigits);
14590 assert(numdigits > 0);
14591 }
14592
14593 /* Fill with leading zeroes to meet minimum width. */
14594 if (prec > numdigits) {
14595 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14596 numnondigits + prec);
14597 char *b1;
14598 if (!r1) {
14599 Py_DECREF(result);
14600 return NULL;
14601 }
14602 b1 = PyBytes_AS_STRING(r1);
14603 for (i = 0; i < numnondigits; ++i)
14604 *b1++ = *buf++;
14605 for (i = 0; i < prec - numdigits; i++)
14606 *b1++ = '0';
14607 for (i = 0; i < numdigits; i++)
14608 *b1++ = *buf++;
14609 *b1 = '\0';
14610 Py_DECREF(result);
14611 result = r1;
14612 buf = PyBytes_AS_STRING(result);
14613 len = numnondigits + prec;
14614 }
14615
14616 /* Fix up case for hex conversions. */
14617 if (type == 'X') {
14618 /* Need to convert all lower case letters to upper case.
14619 and need to convert 0x to 0X (and -0x to -0X). */
14620 for (i = 0; i < len; i++)
14621 if (buf[i] >= 'a' && buf[i] <= 'x')
14622 buf[i] -= 'a'-'A';
14623 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014624 if (!PyUnicode_Check(result)
14625 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014626 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014627 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014628 Py_DECREF(result);
14629 result = unicode;
14630 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014631 else if (len != PyUnicode_GET_LENGTH(result)) {
14632 if (PyUnicode_Resize(&result, len) < 0)
14633 Py_CLEAR(result);
14634 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014635 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014636}
14637
Ethan Furmandf3ed242014-01-05 06:50:30 -080014638/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014639 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014640 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014641 * -1 and raise an exception on error */
14642static int
Victor Stinnera47082312012-10-04 02:19:54 +020014643mainformatlong(PyObject *v,
14644 struct unicode_format_arg_t *arg,
14645 PyObject **p_output,
14646 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014647{
14648 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014649 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014650
14651 if (!PyNumber_Check(v))
14652 goto wrongtype;
14653
Ethan Furman9ab74802014-03-21 06:38:46 -070014654 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014655 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014656 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014657 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014658 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014659 if (PyErr_ExceptionMatches(PyExc_TypeError))
14660 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014661 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014662 }
14663 }
14664 else {
14665 iobj = PyNumber_Long(v);
14666 if (iobj == NULL ) {
14667 if (PyErr_ExceptionMatches(PyExc_TypeError))
14668 goto wrongtype;
14669 return -1;
14670 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014671 }
14672 assert(PyLong_Check(iobj));
14673 }
14674 else {
14675 iobj = v;
14676 Py_INCREF(iobj);
14677 }
14678
14679 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014680 && arg->width == -1 && arg->prec == -1
14681 && !(arg->flags & (F_SIGN | F_BLANK))
14682 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014683 {
14684 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014685 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014686 int base;
14687
Victor Stinnera47082312012-10-04 02:19:54 +020014688 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014689 {
14690 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014691 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014692 case 'd':
14693 case 'i':
14694 case 'u':
14695 base = 10;
14696 break;
14697 case 'o':
14698 base = 8;
14699 break;
14700 case 'x':
14701 case 'X':
14702 base = 16;
14703 break;
14704 }
14705
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014706 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14707 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014708 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014709 }
14710 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014711 return 1;
14712 }
14713
Ethan Furmanb95b5612015-01-23 20:05:18 -080014714 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014715 Py_DECREF(iobj);
14716 if (res == NULL)
14717 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014718 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014719 return 0;
14720
14721wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014722 switch(type)
14723 {
14724 case 'o':
14725 case 'x':
14726 case 'X':
14727 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014728 "%%%c format: an integer is required, "
14729 "not %.200s",
14730 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014731 break;
14732 default:
14733 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014734 "%%%c format: a number is required, "
14735 "not %.200s",
14736 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014737 break;
14738 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014739 return -1;
14740}
14741
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014742static Py_UCS4
14743formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014744{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014745 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014746 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014747 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014748 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014749 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014750 goto onError;
14751 }
14752 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014753 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014754 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014755 /* make sure number is a type of integer */
14756 if (!PyLong_Check(v)) {
14757 iobj = PyNumber_Index(v);
14758 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014759 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014760 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014761 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014762 Py_DECREF(iobj);
14763 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014764 else {
14765 x = PyLong_AsLong(v);
14766 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014767 if (x == -1 && PyErr_Occurred())
14768 goto onError;
14769
Victor Stinner8faf8212011-12-08 22:14:11 +010014770 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014771 PyErr_SetString(PyExc_OverflowError,
14772 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014773 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014774 }
14775
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014776 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014777 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014778
Benjamin Peterson29060642009-01-31 22:14:21 +000014779 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014780 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014781 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014782 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014783}
14784
Victor Stinnera47082312012-10-04 02:19:54 +020014785/* Parse options of an argument: flags, width, precision.
14786 Handle also "%(name)" syntax.
14787
14788 Return 0 if the argument has been formatted into arg->str.
14789 Return 1 if the argument has been written into ctx->writer,
14790 Raise an exception and return -1 on error. */
14791static int
14792unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14793 struct unicode_format_arg_t *arg)
14794{
14795#define FORMAT_READ(ctx) \
14796 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14797
14798 PyObject *v;
14799
Victor Stinnera47082312012-10-04 02:19:54 +020014800 if (arg->ch == '(') {
14801 /* Get argument value from a dictionary. Example: "%(name)s". */
14802 Py_ssize_t keystart;
14803 Py_ssize_t keylen;
14804 PyObject *key;
14805 int pcount = 1;
14806
14807 if (ctx->dict == NULL) {
14808 PyErr_SetString(PyExc_TypeError,
14809 "format requires a mapping");
14810 return -1;
14811 }
14812 ++ctx->fmtpos;
14813 --ctx->fmtcnt;
14814 keystart = ctx->fmtpos;
14815 /* Skip over balanced parentheses */
14816 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14817 arg->ch = FORMAT_READ(ctx);
14818 if (arg->ch == ')')
14819 --pcount;
14820 else if (arg->ch == '(')
14821 ++pcount;
14822 ctx->fmtpos++;
14823 }
14824 keylen = ctx->fmtpos - keystart - 1;
14825 if (ctx->fmtcnt < 0 || pcount > 0) {
14826 PyErr_SetString(PyExc_ValueError,
14827 "incomplete format key");
14828 return -1;
14829 }
14830 key = PyUnicode_Substring(ctx->fmtstr,
14831 keystart, keystart + keylen);
14832 if (key == NULL)
14833 return -1;
14834 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014835 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014836 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014837 }
14838 ctx->args = PyObject_GetItem(ctx->dict, key);
14839 Py_DECREF(key);
14840 if (ctx->args == NULL)
14841 return -1;
14842 ctx->args_owned = 1;
14843 ctx->arglen = -1;
14844 ctx->argidx = -2;
14845 }
14846
14847 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014848 while (--ctx->fmtcnt >= 0) {
14849 arg->ch = FORMAT_READ(ctx);
14850 ctx->fmtpos++;
14851 switch (arg->ch) {
14852 case '-': arg->flags |= F_LJUST; continue;
14853 case '+': arg->flags |= F_SIGN; continue;
14854 case ' ': arg->flags |= F_BLANK; continue;
14855 case '#': arg->flags |= F_ALT; continue;
14856 case '0': arg->flags |= F_ZERO; continue;
14857 }
14858 break;
14859 }
14860
14861 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014862 if (arg->ch == '*') {
14863 v = unicode_format_getnextarg(ctx);
14864 if (v == NULL)
14865 return -1;
14866 if (!PyLong_Check(v)) {
14867 PyErr_SetString(PyExc_TypeError,
14868 "* wants int");
14869 return -1;
14870 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014871 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014872 if (arg->width == -1 && PyErr_Occurred())
14873 return -1;
14874 if (arg->width < 0) {
14875 arg->flags |= F_LJUST;
14876 arg->width = -arg->width;
14877 }
14878 if (--ctx->fmtcnt >= 0) {
14879 arg->ch = FORMAT_READ(ctx);
14880 ctx->fmtpos++;
14881 }
14882 }
14883 else if (arg->ch >= '0' && arg->ch <= '9') {
14884 arg->width = arg->ch - '0';
14885 while (--ctx->fmtcnt >= 0) {
14886 arg->ch = FORMAT_READ(ctx);
14887 ctx->fmtpos++;
14888 if (arg->ch < '0' || arg->ch > '9')
14889 break;
14890 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14891 mixing signed and unsigned comparison. Since arg->ch is between
14892 '0' and '9', casting to int is safe. */
14893 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14894 PyErr_SetString(PyExc_ValueError,
14895 "width too big");
14896 return -1;
14897 }
14898 arg->width = arg->width*10 + (arg->ch - '0');
14899 }
14900 }
14901
14902 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014903 if (arg->ch == '.') {
14904 arg->prec = 0;
14905 if (--ctx->fmtcnt >= 0) {
14906 arg->ch = FORMAT_READ(ctx);
14907 ctx->fmtpos++;
14908 }
14909 if (arg->ch == '*') {
14910 v = unicode_format_getnextarg(ctx);
14911 if (v == NULL)
14912 return -1;
14913 if (!PyLong_Check(v)) {
14914 PyErr_SetString(PyExc_TypeError,
14915 "* wants int");
14916 return -1;
14917 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014918 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014919 if (arg->prec == -1 && PyErr_Occurred())
14920 return -1;
14921 if (arg->prec < 0)
14922 arg->prec = 0;
14923 if (--ctx->fmtcnt >= 0) {
14924 arg->ch = FORMAT_READ(ctx);
14925 ctx->fmtpos++;
14926 }
14927 }
14928 else if (arg->ch >= '0' && arg->ch <= '9') {
14929 arg->prec = arg->ch - '0';
14930 while (--ctx->fmtcnt >= 0) {
14931 arg->ch = FORMAT_READ(ctx);
14932 ctx->fmtpos++;
14933 if (arg->ch < '0' || arg->ch > '9')
14934 break;
14935 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14936 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014937 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014938 return -1;
14939 }
14940 arg->prec = arg->prec*10 + (arg->ch - '0');
14941 }
14942 }
14943 }
14944
14945 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14946 if (ctx->fmtcnt >= 0) {
14947 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14948 if (--ctx->fmtcnt >= 0) {
14949 arg->ch = FORMAT_READ(ctx);
14950 ctx->fmtpos++;
14951 }
14952 }
14953 }
14954 if (ctx->fmtcnt < 0) {
14955 PyErr_SetString(PyExc_ValueError,
14956 "incomplete format");
14957 return -1;
14958 }
14959 return 0;
14960
14961#undef FORMAT_READ
14962}
14963
14964/* Format one argument. Supported conversion specifiers:
14965
14966 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014967 - "i", "d", "u": int or float
14968 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014969 - "e", "E", "f", "F", "g", "G": float
14970 - "c": int or str (1 character)
14971
Victor Stinner8dbd4212012-12-04 09:30:24 +010014972 When possible, the output is written directly into the Unicode writer
14973 (ctx->writer). A string is created when padding is required.
14974
Victor Stinnera47082312012-10-04 02:19:54 +020014975 Return 0 if the argument has been formatted into *p_str,
14976 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014977 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014978static int
14979unicode_format_arg_format(struct unicode_formatter_t *ctx,
14980 struct unicode_format_arg_t *arg,
14981 PyObject **p_str)
14982{
14983 PyObject *v;
14984 _PyUnicodeWriter *writer = &ctx->writer;
14985
14986 if (ctx->fmtcnt == 0)
14987 ctx->writer.overallocate = 0;
14988
Victor Stinnera47082312012-10-04 02:19:54 +020014989 v = unicode_format_getnextarg(ctx);
14990 if (v == NULL)
14991 return -1;
14992
Victor Stinnera47082312012-10-04 02:19:54 +020014993
14994 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014995 case 's':
14996 case 'r':
14997 case 'a':
14998 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14999 /* Fast path */
15000 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15001 return -1;
15002 return 1;
15003 }
15004
15005 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15006 *p_str = v;
15007 Py_INCREF(*p_str);
15008 }
15009 else {
15010 if (arg->ch == 's')
15011 *p_str = PyObject_Str(v);
15012 else if (arg->ch == 'r')
15013 *p_str = PyObject_Repr(v);
15014 else
15015 *p_str = PyObject_ASCII(v);
15016 }
15017 break;
15018
15019 case 'i':
15020 case 'd':
15021 case 'u':
15022 case 'o':
15023 case 'x':
15024 case 'X':
15025 {
15026 int ret = mainformatlong(v, arg, p_str, writer);
15027 if (ret != 0)
15028 return ret;
15029 arg->sign = 1;
15030 break;
15031 }
15032
15033 case 'e':
15034 case 'E':
15035 case 'f':
15036 case 'F':
15037 case 'g':
15038 case 'G':
15039 if (arg->width == -1 && arg->prec == -1
15040 && !(arg->flags & (F_SIGN | F_BLANK)))
15041 {
15042 /* Fast path */
15043 if (formatfloat(v, arg, NULL, writer) == -1)
15044 return -1;
15045 return 1;
15046 }
15047
15048 arg->sign = 1;
15049 if (formatfloat(v, arg, p_str, NULL) == -1)
15050 return -1;
15051 break;
15052
15053 case 'c':
15054 {
15055 Py_UCS4 ch = formatchar(v);
15056 if (ch == (Py_UCS4) -1)
15057 return -1;
15058 if (arg->width == -1 && arg->prec == -1) {
15059 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015060 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015061 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015062 return 1;
15063 }
15064 *p_str = PyUnicode_FromOrdinal(ch);
15065 break;
15066 }
15067
15068 default:
15069 PyErr_Format(PyExc_ValueError,
15070 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015071 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015072 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15073 (int)arg->ch,
15074 ctx->fmtpos - 1);
15075 return -1;
15076 }
15077 if (*p_str == NULL)
15078 return -1;
15079 assert (PyUnicode_Check(*p_str));
15080 return 0;
15081}
15082
15083static int
15084unicode_format_arg_output(struct unicode_formatter_t *ctx,
15085 struct unicode_format_arg_t *arg,
15086 PyObject *str)
15087{
15088 Py_ssize_t len;
15089 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015090 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015091 Py_ssize_t pindex;
15092 Py_UCS4 signchar;
15093 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015094 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015095 Py_ssize_t sublen;
15096 _PyUnicodeWriter *writer = &ctx->writer;
15097 Py_UCS4 fill;
15098
15099 fill = ' ';
15100 if (arg->sign && arg->flags & F_ZERO)
15101 fill = '0';
15102
15103 if (PyUnicode_READY(str) == -1)
15104 return -1;
15105
15106 len = PyUnicode_GET_LENGTH(str);
15107 if ((arg->width == -1 || arg->width <= len)
15108 && (arg->prec == -1 || arg->prec >= len)
15109 && !(arg->flags & (F_SIGN | F_BLANK)))
15110 {
15111 /* Fast path */
15112 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15113 return -1;
15114 return 0;
15115 }
15116
15117 /* Truncate the string for "s", "r" and "a" formats
15118 if the precision is set */
15119 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15120 if (arg->prec >= 0 && len > arg->prec)
15121 len = arg->prec;
15122 }
15123
15124 /* Adjust sign and width */
15125 kind = PyUnicode_KIND(str);
15126 pbuf = PyUnicode_DATA(str);
15127 pindex = 0;
15128 signchar = '\0';
15129 if (arg->sign) {
15130 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15131 if (ch == '-' || ch == '+') {
15132 signchar = ch;
15133 len--;
15134 pindex++;
15135 }
15136 else if (arg->flags & F_SIGN)
15137 signchar = '+';
15138 else if (arg->flags & F_BLANK)
15139 signchar = ' ';
15140 else
15141 arg->sign = 0;
15142 }
15143 if (arg->width < len)
15144 arg->width = len;
15145
15146 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015147 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015148 if (!(arg->flags & F_LJUST)) {
15149 if (arg->sign) {
15150 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015151 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015152 }
15153 else {
15154 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015155 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015156 }
15157 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015158 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15159 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015160 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015161 }
15162
Victor Stinnera47082312012-10-04 02:19:54 +020015163 buflen = arg->width;
15164 if (arg->sign && len == arg->width)
15165 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015166 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015167 return -1;
15168
15169 /* Write the sign if needed */
15170 if (arg->sign) {
15171 if (fill != ' ') {
15172 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15173 writer->pos += 1;
15174 }
15175 if (arg->width > len)
15176 arg->width--;
15177 }
15178
15179 /* Write the numeric prefix for "x", "X" and "o" formats
15180 if the alternate form is used.
15181 For example, write "0x" for the "%#x" format. */
15182 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15183 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15184 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15185 if (fill != ' ') {
15186 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15187 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15188 writer->pos += 2;
15189 pindex += 2;
15190 }
15191 arg->width -= 2;
15192 if (arg->width < 0)
15193 arg->width = 0;
15194 len -= 2;
15195 }
15196
15197 /* Pad left with the fill character if needed */
15198 if (arg->width > len && !(arg->flags & F_LJUST)) {
15199 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015200 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015201 writer->pos += sublen;
15202 arg->width = len;
15203 }
15204
15205 /* If padding with spaces: write sign if needed and/or numeric prefix if
15206 the alternate form is used */
15207 if (fill == ' ') {
15208 if (arg->sign) {
15209 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15210 writer->pos += 1;
15211 }
15212 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15213 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15214 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15215 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15216 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15217 writer->pos += 2;
15218 pindex += 2;
15219 }
15220 }
15221
15222 /* Write characters */
15223 if (len) {
15224 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15225 str, pindex, len);
15226 writer->pos += len;
15227 }
15228
15229 /* Pad right with the fill character if needed */
15230 if (arg->width > len) {
15231 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015232 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015233 writer->pos += sublen;
15234 }
15235 return 0;
15236}
15237
15238/* Helper of PyUnicode_Format(): format one arg.
15239 Return 0 on success, raise an exception and return -1 on error. */
15240static int
15241unicode_format_arg(struct unicode_formatter_t *ctx)
15242{
15243 struct unicode_format_arg_t arg;
15244 PyObject *str;
15245 int ret;
15246
Victor Stinner8dbd4212012-12-04 09:30:24 +010015247 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015248 if (arg.ch == '%') {
15249 ctx->fmtpos++;
15250 ctx->fmtcnt--;
15251 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15252 return -1;
15253 return 0;
15254 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015255 arg.flags = 0;
15256 arg.width = -1;
15257 arg.prec = -1;
15258 arg.sign = 0;
15259 str = NULL;
15260
Victor Stinnera47082312012-10-04 02:19:54 +020015261 ret = unicode_format_arg_parse(ctx, &arg);
15262 if (ret == -1)
15263 return -1;
15264
15265 ret = unicode_format_arg_format(ctx, &arg, &str);
15266 if (ret == -1)
15267 return -1;
15268
15269 if (ret != 1) {
15270 ret = unicode_format_arg_output(ctx, &arg, str);
15271 Py_DECREF(str);
15272 if (ret == -1)
15273 return -1;
15274 }
15275
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015276 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015277 PyErr_SetString(PyExc_TypeError,
15278 "not all arguments converted during string formatting");
15279 return -1;
15280 }
15281 return 0;
15282}
15283
Alexander Belopolsky40018472011-02-26 01:02:56 +000015284PyObject *
15285PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015286{
Victor Stinnera47082312012-10-04 02:19:54 +020015287 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015288
Guido van Rossumd57fd912000-03-10 22:53:23 +000015289 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015290 PyErr_BadInternalCall();
15291 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015292 }
Victor Stinnera47082312012-10-04 02:19:54 +020015293
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015294 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015295 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015296
15297 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015298 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15299 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15300 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15301 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015302
Victor Stinner8f674cc2013-04-17 23:02:17 +020015303 _PyUnicodeWriter_Init(&ctx.writer);
15304 ctx.writer.min_length = ctx.fmtcnt + 100;
15305 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015306
Guido van Rossumd57fd912000-03-10 22:53:23 +000015307 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015308 ctx.arglen = PyTuple_Size(args);
15309 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015310 }
15311 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015312 ctx.arglen = -1;
15313 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015314 }
Victor Stinnera47082312012-10-04 02:19:54 +020015315 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015316 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015317 ctx.dict = args;
15318 else
15319 ctx.dict = NULL;
15320 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015321
Victor Stinnera47082312012-10-04 02:19:54 +020015322 while (--ctx.fmtcnt >= 0) {
15323 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015324 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015325
15326 nonfmtpos = ctx.fmtpos++;
15327 while (ctx.fmtcnt >= 0 &&
15328 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15329 ctx.fmtpos++;
15330 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015331 }
Victor Stinnera47082312012-10-04 02:19:54 +020015332 if (ctx.fmtcnt < 0) {
15333 ctx.fmtpos--;
15334 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015335 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015336
Victor Stinnercfc4c132013-04-03 01:48:39 +020015337 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15338 nonfmtpos, ctx.fmtpos) < 0)
15339 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015340 }
15341 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015342 ctx.fmtpos++;
15343 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015344 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015345 }
15346 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015347
Victor Stinnera47082312012-10-04 02:19:54 +020015348 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015349 PyErr_SetString(PyExc_TypeError,
15350 "not all arguments converted during string formatting");
15351 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015352 }
15353
Victor Stinnera47082312012-10-04 02:19:54 +020015354 if (ctx.args_owned) {
15355 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015356 }
Victor Stinnera47082312012-10-04 02:19:54 +020015357 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015358
Benjamin Peterson29060642009-01-31 22:14:21 +000015359 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015360 _PyUnicodeWriter_Dealloc(&ctx.writer);
15361 if (ctx.args_owned) {
15362 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015363 }
15364 return NULL;
15365}
15366
Jeremy Hylton938ace62002-07-17 16:30:39 +000015367static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015368unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15369
Tim Peters6d6c1a32001-08-02 04:15:00 +000015370static PyObject *
15371unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15372{
Benjamin Peterson29060642009-01-31 22:14:21 +000015373 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015374 static char *kwlist[] = {"object", "encoding", "errors", 0};
15375 char *encoding = NULL;
15376 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015377
Benjamin Peterson14339b62009-01-31 16:36:08 +000015378 if (type != &PyUnicode_Type)
15379 return unicode_subtype_new(type, args, kwds);
15380 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015381 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015382 return NULL;
15383 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015384 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015385 if (encoding == NULL && errors == NULL)
15386 return PyObject_Str(x);
15387 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015388 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015389}
15390
Guido van Rossume023fe02001-08-30 03:12:59 +000015391static PyObject *
15392unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15393{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015394 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015395 Py_ssize_t length, char_size;
15396 int share_wstr, share_utf8;
15397 unsigned int kind;
15398 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015399
Benjamin Peterson14339b62009-01-31 16:36:08 +000015400 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015401
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015402 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015403 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015404 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015405 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015406 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015407 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015408 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015409 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015410
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015411 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015412 if (self == NULL) {
15413 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015414 return NULL;
15415 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015416 kind = PyUnicode_KIND(unicode);
15417 length = PyUnicode_GET_LENGTH(unicode);
15418
15419 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015420#ifdef Py_DEBUG
15421 _PyUnicode_HASH(self) = -1;
15422#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015423 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015424#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015425 _PyUnicode_STATE(self).interned = 0;
15426 _PyUnicode_STATE(self).kind = kind;
15427 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015428 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015429 _PyUnicode_STATE(self).ready = 1;
15430 _PyUnicode_WSTR(self) = NULL;
15431 _PyUnicode_UTF8_LENGTH(self) = 0;
15432 _PyUnicode_UTF8(self) = NULL;
15433 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015434 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015435
15436 share_utf8 = 0;
15437 share_wstr = 0;
15438 if (kind == PyUnicode_1BYTE_KIND) {
15439 char_size = 1;
15440 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15441 share_utf8 = 1;
15442 }
15443 else if (kind == PyUnicode_2BYTE_KIND) {
15444 char_size = 2;
15445 if (sizeof(wchar_t) == 2)
15446 share_wstr = 1;
15447 }
15448 else {
15449 assert(kind == PyUnicode_4BYTE_KIND);
15450 char_size = 4;
15451 if (sizeof(wchar_t) == 4)
15452 share_wstr = 1;
15453 }
15454
15455 /* Ensure we won't overflow the length. */
15456 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15457 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015458 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015459 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015460 data = PyObject_MALLOC((length + 1) * char_size);
15461 if (data == NULL) {
15462 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015463 goto onError;
15464 }
15465
Victor Stinnerc3c74152011-10-02 20:39:55 +020015466 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015467 if (share_utf8) {
15468 _PyUnicode_UTF8_LENGTH(self) = length;
15469 _PyUnicode_UTF8(self) = data;
15470 }
15471 if (share_wstr) {
15472 _PyUnicode_WSTR_LENGTH(self) = length;
15473 _PyUnicode_WSTR(self) = (wchar_t *)data;
15474 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015475
Christian Heimesf051e432016-09-13 20:22:02 +020015476 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015477 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015478 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015479#ifdef Py_DEBUG
15480 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15481#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015482 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015483 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015484
15485onError:
15486 Py_DECREF(unicode);
15487 Py_DECREF(self);
15488 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015489}
15490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015491PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015492"str(object='') -> str\n\
15493str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015494\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015495Create a new string object from the given object. If encoding or\n\
15496errors is specified, then the object must expose a data buffer\n\
15497that will be decoded using the given encoding and error handler.\n\
15498Otherwise, returns the result of object.__str__() (if defined)\n\
15499or repr(object).\n\
15500encoding defaults to sys.getdefaultencoding().\n\
15501errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015502
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015503static PyObject *unicode_iter(PyObject *seq);
15504
Guido van Rossumd57fd912000-03-10 22:53:23 +000015505PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015506 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015507 "str", /* tp_name */
15508 sizeof(PyUnicodeObject), /* tp_basicsize */
15509 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015510 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015511 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015512 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015513 0, /* tp_getattr */
15514 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015515 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015516 unicode_repr, /* tp_repr */
15517 &unicode_as_number, /* tp_as_number */
15518 &unicode_as_sequence, /* tp_as_sequence */
15519 &unicode_as_mapping, /* tp_as_mapping */
15520 (hashfunc) unicode_hash, /* tp_hash*/
15521 0, /* tp_call*/
15522 (reprfunc) unicode_str, /* tp_str */
15523 PyObject_GenericGetAttr, /* tp_getattro */
15524 0, /* tp_setattro */
15525 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015526 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015527 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15528 unicode_doc, /* tp_doc */
15529 0, /* tp_traverse */
15530 0, /* tp_clear */
15531 PyUnicode_RichCompare, /* tp_richcompare */
15532 0, /* tp_weaklistoffset */
15533 unicode_iter, /* tp_iter */
15534 0, /* tp_iternext */
15535 unicode_methods, /* tp_methods */
15536 0, /* tp_members */
15537 0, /* tp_getset */
15538 &PyBaseObject_Type, /* tp_base */
15539 0, /* tp_dict */
15540 0, /* tp_descr_get */
15541 0, /* tp_descr_set */
15542 0, /* tp_dictoffset */
15543 0, /* tp_init */
15544 0, /* tp_alloc */
15545 unicode_new, /* tp_new */
15546 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015547};
15548
15549/* Initialize the Unicode implementation */
15550
Victor Stinner331a6a52019-05-27 16:39:22 +020015551PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015552_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015553{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015554 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015555 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015556 0x000A, /* LINE FEED */
15557 0x000D, /* CARRIAGE RETURN */
15558 0x001C, /* FILE SEPARATOR */
15559 0x001D, /* GROUP SEPARATOR */
15560 0x001E, /* RECORD SEPARATOR */
15561 0x0085, /* NEXT LINE */
15562 0x2028, /* LINE SEPARATOR */
15563 0x2029, /* PARAGRAPH SEPARATOR */
15564 };
15565
Victor Stinner91698d82020-06-25 14:07:40 +020015566 struct _Py_unicode_state *state = &tstate->interp->unicode;
15567 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015568 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015569 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015570
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015571 if (_Py_IsMainInterpreter(tstate)) {
15572 /* initialize the linebreak bloom filter */
15573 bloom_linebreak = make_bloom_mask(
15574 PyUnicode_2BYTE_KIND, linebreak,
15575 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015576
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015577 if (PyType_Ready(&PyUnicode_Type) < 0) {
15578 return _PyStatus_ERR("Can't initialize unicode type");
15579 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015580
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015581 if (PyType_Ready(&EncodingMapType) < 0) {
15582 return _PyStatus_ERR("Can't initialize encoding map type");
15583 }
15584 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15585 return _PyStatus_ERR("Can't initialize field name iterator type");
15586 }
15587 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15588 return _PyStatus_ERR("Can't initialize formatter iter type");
15589 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015590 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015591 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015592}
15593
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015594
Walter Dörwald16807132007-05-25 13:52:07 +000015595void
15596PyUnicode_InternInPlace(PyObject **p)
15597{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015598 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015599#ifdef Py_DEBUG
15600 assert(s != NULL);
15601 assert(_PyUnicode_CHECK(s));
15602#else
Victor Stinner607b1022020-05-05 18:50:30 +020015603 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015604 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015605 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015606#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015607
Benjamin Peterson14339b62009-01-31 16:36:08 +000015608 /* If it's a subclass, we don't really know what putting
15609 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015610 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015611 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015612 }
15613
15614 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015615 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015616 }
15617
15618#ifdef INTERNED_STRINGS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015619 if (interned == NULL) {
15620 interned = PyDict_New();
15621 if (interned == NULL) {
15622 PyErr_Clear(); /* Don't leave an exception */
15623 return;
15624 }
15625 }
Victor Stinner607b1022020-05-05 18:50:30 +020015626
15627 PyObject *t;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015628 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015629 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015630 Py_END_ALLOW_RECURSION
Victor Stinner607b1022020-05-05 18:50:30 +020015631
Berker Peksagced8d4c2016-07-25 04:40:39 +030015632 if (t == NULL) {
15633 PyErr_Clear();
15634 return;
15635 }
Victor Stinner607b1022020-05-05 18:50:30 +020015636
Berker Peksagced8d4c2016-07-25 04:40:39 +030015637 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015638 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015639 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015640 return;
15641 }
Victor Stinner607b1022020-05-05 18:50:30 +020015642
Benjamin Peterson14339b62009-01-31 16:36:08 +000015643 /* The two references in interned are not counted by refcnt.
15644 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015645 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015646 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Victor Stinner607b1022020-05-05 18:50:30 +020015647#endif
Walter Dörwald16807132007-05-25 13:52:07 +000015648}
15649
15650void
15651PyUnicode_InternImmortal(PyObject **p)
15652{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015653 PyUnicode_InternInPlace(p);
15654 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015655 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015656 Py_INCREF(*p);
15657 }
Walter Dörwald16807132007-05-25 13:52:07 +000015658}
15659
15660PyObject *
15661PyUnicode_InternFromString(const char *cp)
15662{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015663 PyObject *s = PyUnicode_FromString(cp);
15664 if (s == NULL)
15665 return NULL;
15666 PyUnicode_InternInPlace(&s);
15667 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015668}
15669
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015670
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo interning for every string in the `interned` dict and drop the dict.

   Since unicode_release_interned() is intended to help a leak detector,
   interned unicode strings are not forcibly deallocated; rather, we give
   them their stolen references back, and then clear and DECREF the
   interned dict. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }

    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    Py_ssize_t count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %zd interned strings\n", count);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif

    Py_ssize_t pos = 0;
    while (pos < count) {
        PyObject *str = PyList_GET_ITEM(keys, pos);
        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }

        int kind_of_interning = PyUnicode_CHECK_INTERNED(str);
        if (kind_of_interning == SSTATE_INTERNED_IMMORTAL) {
            /* Immortal strings get one reference back. */
            Py_SET_REFCNT(str, Py_REFCNT(str) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else if (kind_of_interning == SSTATE_INTERNED_MORTAL) {
            /* Mortal strings get two references back. */
            Py_SET_REFCNT(str, Py_REFCNT(str) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else {
            /* SSTATE_NOT_INTERNED or an unknown state: a key of the
               interned dict must be interned. */
            Py_UNREACHABLE();
        }

        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
        pos++;
    }

#ifdef INTERNED_STATS
    fprintf(stderr,
            "total size of all interned strings: %zd/%zd mortal/immortal\n",
            mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015730
15731
15732/********************* Unicode Iterator **************************/
15733
15734typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015735 PyObject_HEAD
15736 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015737 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015738} unicodeiterobject;
15739
15740static void
15741unicodeiter_dealloc(unicodeiterobject *it)
15742{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015743 _PyObject_GC_UNTRACK(it);
15744 Py_XDECREF(it->it_seq);
15745 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015746}
15747
15748static int
15749unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15750{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015751 Py_VISIT(it->it_seq);
15752 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015753}
15754
15755static PyObject *
15756unicodeiter_next(unicodeiterobject *it)
15757{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015758 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015759
Benjamin Peterson14339b62009-01-31 16:36:08 +000015760 assert(it != NULL);
15761 seq = it->it_seq;
15762 if (seq == NULL)
15763 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015764 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015766 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15767 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015768 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015769 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15770 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015771 if (item != NULL)
15772 ++it->it_index;
15773 return item;
15774 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015775
Benjamin Peterson14339b62009-01-31 16:36:08 +000015776 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015777 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015778 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015779}
15780
15781static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015782unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015783{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015784 Py_ssize_t len = 0;
15785 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015786 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015787 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015788}
15789
15790PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15791
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015792static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015793unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015794{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015795 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015796 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015797 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015798 it->it_seq, it->it_index);
15799 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015800 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015801 if (u == NULL)
15802 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015803 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015804 }
15805}
15806
15807PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15808
15809static PyObject *
15810unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15811{
15812 Py_ssize_t index = PyLong_AsSsize_t(state);
15813 if (index == -1 && PyErr_Occurred())
15814 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015815 if (it->it_seq != NULL) {
15816 if (index < 0)
15817 index = 0;
15818 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15819 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15820 it->it_index = index;
15821 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015822 Py_RETURN_NONE;
15823}
15824
15825PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15826
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015827static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015828 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015829 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015830 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15831 reduce_doc},
15832 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15833 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015834 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015835};
15836
15837PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015838 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15839 "str_iterator", /* tp_name */
15840 sizeof(unicodeiterobject), /* tp_basicsize */
15841 0, /* tp_itemsize */
15842 /* methods */
15843 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015844 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015845 0, /* tp_getattr */
15846 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015847 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015848 0, /* tp_repr */
15849 0, /* tp_as_number */
15850 0, /* tp_as_sequence */
15851 0, /* tp_as_mapping */
15852 0, /* tp_hash */
15853 0, /* tp_call */
15854 0, /* tp_str */
15855 PyObject_GenericGetAttr, /* tp_getattro */
15856 0, /* tp_setattro */
15857 0, /* tp_as_buffer */
15858 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15859 0, /* tp_doc */
15860 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15861 0, /* tp_clear */
15862 0, /* tp_richcompare */
15863 0, /* tp_weaklistoffset */
15864 PyObject_SelfIter, /* tp_iter */
15865 (iternextfunc)unicodeiter_next, /* tp_iternext */
15866 unicodeiter_methods, /* tp_methods */
15867 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015868};
15869
15870static PyObject *
15871unicode_iter(PyObject *seq)
15872{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015873 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015874
Benjamin Peterson14339b62009-01-31 16:36:08 +000015875 if (!PyUnicode_Check(seq)) {
15876 PyErr_BadInternalCall();
15877 return NULL;
15878 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015879 if (PyUnicode_READY(seq) == -1)
15880 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015881 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15882 if (it == NULL)
15883 return NULL;
15884 it->it_index = 0;
15885 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015886 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015887 _PyObject_GC_TRACK(it);
15888 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015889}
15890
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015891
15892size_t
15893Py_UNICODE_strlen(const Py_UNICODE *u)
15894{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015895 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015896}
15897
15898Py_UNICODE*
15899Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15900{
15901 Py_UNICODE *u = s1;
15902 while ((*u++ = *s2++));
15903 return s1;
15904}
15905
15906Py_UNICODE*
15907Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15908{
15909 Py_UNICODE *u = s1;
15910 while ((*u++ = *s2++))
15911 if (n-- == 0)
15912 break;
15913 return s1;
15914}
15915
15916Py_UNICODE*
15917Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15918{
15919 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015920 u1 += wcslen(u1);
15921 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015922 return s1;
15923}
15924
15925int
15926Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15927{
15928 while (*s1 && *s2 && *s1 == *s2)
15929 s1++, s2++;
15930 if (*s1 && *s2)
15931 return (*s1 < *s2) ? -1 : +1;
15932 if (*s1)
15933 return 1;
15934 if (*s2)
15935 return -1;
15936 return 0;
15937}
15938
15939int
15940Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15941{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015942 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015943 for (; n != 0; n--) {
15944 u1 = *s1;
15945 u2 = *s2;
15946 if (u1 != u2)
15947 return (u1 < u2) ? -1 : +1;
15948 if (u1 == '\0')
15949 return 0;
15950 s1++;
15951 s2++;
15952 }
15953 return 0;
15954}
15955
15956Py_UNICODE*
15957Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15958{
15959 const Py_UNICODE *p;
15960 for (p = s; *p; p++)
15961 if (*p == c)
15962 return (Py_UNICODE*)p;
15963 return NULL;
15964}
15965
15966Py_UNICODE*
15967Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15968{
15969 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015970 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015971 while (p != s) {
15972 p--;
15973 if (*p == c)
15974 return (Py_UNICODE*)p;
15975 }
15976 return NULL;
15977}
Victor Stinner331ea922010-08-10 16:37:20 +000015978
Victor Stinner71133ff2010-09-01 23:43:53 +000015979Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015980PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015981{
Victor Stinner577db2c2011-10-11 22:12:48 +020015982 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015983 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015985 if (!PyUnicode_Check(unicode)) {
15986 PyErr_BadArgument();
15987 return NULL;
15988 }
Inada Naoki2c4928d2020-06-17 20:09:44 +090015989_Py_COMP_DIAG_PUSH
15990_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015991 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Inada Naoki2c4928d2020-06-17 20:09:44 +090015992_Py_COMP_DIAG_POP
Victor Stinner577db2c2011-10-11 22:12:48 +020015993 if (u == NULL)
15994 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015995 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015996 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015997 PyErr_NoMemory();
15998 return NULL;
15999 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020016000 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000016001 size *= sizeof(Py_UNICODE);
16002 copy = PyMem_Malloc(size);
16003 if (copy == NULL) {
16004 PyErr_NoMemory();
16005 return NULL;
16006 }
Victor Stinner577db2c2011-10-11 22:12:48 +020016007 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000016008 return copy;
16009}
Martin v. Löwis5b222132007-06-10 09:51:05 +000016010
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016011
Victor Stinner709d23d2019-05-02 14:56:30 -040016012static int
16013encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016014{
Victor Stinner709d23d2019-05-02 14:56:30 -040016015 int res;
16016 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16017 if (res == -2) {
16018 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16019 return -1;
16020 }
16021 if (res < 0) {
16022 PyErr_NoMemory();
16023 return -1;
16024 }
16025 return 0;
16026}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016027
Victor Stinner709d23d2019-05-02 14:56:30 -040016028
16029static int
16030config_get_codec_name(wchar_t **config_encoding)
16031{
16032 char *encoding;
16033 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16034 return -1;
16035 }
16036
16037 PyObject *name_obj = NULL;
16038 PyObject *codec = _PyCodec_Lookup(encoding);
16039 PyMem_RawFree(encoding);
16040
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016041 if (!codec)
16042 goto error;
16043
16044 name_obj = PyObject_GetAttrString(codec, "name");
16045 Py_CLEAR(codec);
16046 if (!name_obj) {
16047 goto error;
16048 }
16049
Victor Stinner709d23d2019-05-02 14:56:30 -040016050 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16051 Py_DECREF(name_obj);
16052 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016053 goto error;
16054 }
16055
Victor Stinner709d23d2019-05-02 14:56:30 -040016056 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16057 if (raw_wname == NULL) {
16058 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016059 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016060 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016061 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016062
16063 PyMem_RawFree(*config_encoding);
16064 *config_encoding = raw_wname;
16065
16066 PyMem_Free(wname);
16067 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016068
16069error:
16070 Py_XDECREF(codec);
16071 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016072 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016073}
16074
16075
Victor Stinner331a6a52019-05-27 16:39:22 +020016076static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016077init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016078{
Victor Stinner709d23d2019-05-02 14:56:30 -040016079 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016080 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016081 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016082 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016083 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016084 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016085 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016086}
16087
16088
Victor Stinner709d23d2019-05-02 14:56:30 -040016089static int
16090init_fs_codec(PyInterpreterState *interp)
16091{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016092 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016093
16094 _Py_error_handler error_handler;
16095 error_handler = get_error_handler_wide(config->filesystem_errors);
16096 if (error_handler == _Py_ERROR_UNKNOWN) {
16097 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16098 return -1;
16099 }
16100
16101 char *encoding, *errors;
16102 if (encode_wstr_utf8(config->filesystem_encoding,
16103 &encoding,
16104 "filesystem_encoding") < 0) {
16105 return -1;
16106 }
16107
16108 if (encode_wstr_utf8(config->filesystem_errors,
16109 &errors,
16110 "filesystem_errors") < 0) {
16111 PyMem_RawFree(encoding);
16112 return -1;
16113 }
16114
Victor Stinner3d17c042020-05-14 01:48:38 +020016115 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16116 PyMem_RawFree(fs_codec->encoding);
16117 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016118 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016119 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16120 PyMem_RawFree(fs_codec->errors);
16121 fs_codec->errors = errors;
16122 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016123
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016124#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016125 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016126#endif
16127
Victor Stinner709d23d2019-05-02 14:56:30 -040016128 /* At this point, PyUnicode_EncodeFSDefault() and
16129 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16130 the C implementation of the filesystem encoding. */
16131
16132 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16133 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016134 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16135 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016136 PyErr_NoMemory();
16137 return -1;
16138 }
16139 return 0;
16140}
16141
16142
Victor Stinner331a6a52019-05-27 16:39:22 +020016143static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016144init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016145{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016146 PyInterpreterState *interp = tstate->interp;
16147
Victor Stinner709d23d2019-05-02 14:56:30 -040016148 /* Update the filesystem encoding to the normalized Python codec name.
16149 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16150 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016151 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016152 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016153 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016154 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016155 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016156 }
16157
Victor Stinner709d23d2019-05-02 14:56:30 -040016158 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016159 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016160 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016161 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016162}
16163
16164
Victor Stinner331a6a52019-05-27 16:39:22 +020016165PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016166_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016167{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016168 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016169 if (_PyStatus_EXCEPTION(status)) {
16170 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016171 }
16172
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016173 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016174}
16175
16176
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016177static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016178_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016179{
Victor Stinner3d17c042020-05-14 01:48:38 +020016180 PyMem_RawFree(fs_codec->encoding);
16181 fs_codec->encoding = NULL;
16182 fs_codec->utf8 = 0;
16183 PyMem_RawFree(fs_codec->errors);
16184 fs_codec->errors = NULL;
16185 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016186}
16187
16188
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to "mbcs" with the
   "replace" error handler, and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set when
   the new configuration strings cannot be allocated (the configuration is
   left unchanged in that case). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one duplication failed: release whichever succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16214
16215
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016216void
Victor Stinner3d483342019-11-22 12:27:50 +010016217_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016218{
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016219 struct _Py_unicode_state *state = &tstate->interp->unicode;
16220
16221 int is_main_interp = _Py_IsMainInterpreter(tstate);
16222 if (is_main_interp) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016223#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016224 /* Insure++ is a memory analysis tool that aids in discovering
16225 * memory leaks and other memory problems. On Python exit, the
16226 * interned string dictionaries are flagged as being in use at exit
16227 * (which it is). Under normal circumstances, this is fine because
16228 * the memory will be automatically reclaimed by the system. Under
16229 * memory debugging, it's a huge source of useless noise, so we
16230 * trade off slower shutdown for less distraction in the memory
16231 * reports. -baw
16232 */
16233 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016234#endif /* __INSURE__ */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016235 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016236
Victor Stinner91698d82020-06-25 14:07:40 +020016237 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016238
Victor Stinner2f9ada92020-06-24 02:22:21 +020016239 for (Py_ssize_t i = 0; i < 256; i++) {
16240 Py_CLEAR(state->latin1[i]);
16241 }
16242
Victor Stinnerf363d0a2020-06-24 00:10:40 +020016243 if (is_main_interp) {
Victor Stinnerd6fb53f2020-05-14 01:11:54 +020016244 unicode_clear_static_strings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016245 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016246
Victor Stinner3d17c042020-05-14 01:48:38 +020016247 _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016248}
16249
16250
Georg Brandl66c221e2010-10-14 07:04:07 +000016251/* A _string module, to export formatter_parser and formatter_field_name_split
16252 to the string.Formatter class implemented in Python. */
16253
16254static PyMethodDef _string_methods[] = {
16255 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16256 METH_O, PyDoc_STR("split the argument as a field name")},
16257 {"formatter_parser", (PyCFunction) formatter_parser,
16258 METH_O, PyDoc_STR("parse the argument as a format string")},
16259 {NULL, NULL}
16260};
16261
16262static struct PyModuleDef _string_module = {
16263 PyModuleDef_HEAD_INIT,
16264 "_string",
16265 PyDoc_STR("string helper module"),
16266 0,
16267 _string_methods,
16268 NULL,
16269 NULL,
16270 NULL,
16271 NULL
16272};
16273
16274PyMODINIT_FUNC
16275PyInit__string(void)
16276{
16277 return PyModule_Create(&_string_module);
16278}
16279
16280
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016281#ifdef __cplusplus
16282}
16283#endif