blob: 3c79febea7788000abd53e9800c1c4652a0e3499 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnera15e2602020-04-08 02:01:56 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner45876a92020-02-12 22:32:34 +010044#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010045#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_initconfig.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010047#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020048#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040049#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010050#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000051#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070052#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000054#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000055#include <windows.h>
56#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000057
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
60/* #define INTERNED_STATS 1 */
61
62
Larry Hastings61272b72014-01-07 12:41:53 -080063/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090064class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080065[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090066/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
67
68/*[python input]
69class Py_UCS4_converter(CConverter):
70 type = 'Py_UCS4'
71 converter = 'convert_uc'
72
73 def converter_init(self):
74 if self.default is not unspecified:
75 self.c_default = ascii(self.default)
76 if len(self.c_default) > 4 or self.c_default[0] != "'":
77 self.c_default = hex(ord(self.default))
78
79[python start generated code]*/
80/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080081
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000082/* --- Globals ------------------------------------------------------------
83
Serhiy Storchaka05997252013-01-26 12:14:02 +020084NOTE: In the interpreter's initialization phase, some globals are currently
85 initialized dynamically as needed. In the process Unicode objects may
86 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* Largest code point accepted by this implementation: 0x10ffff
   (1,114,111), the maximum code point of Unicode 6.0. */
#define MAX_UNICODE 0x10ffff

/* Type check used by the helper macros of this file.  Debug builds run
   the consistency checker (without the content scan); other builds only
   perform the plain PyUnicode_Check() type test. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200103
/* Raw field accessors.  Each expands to the struct member itself, with no
   type or readiness check, so the result can also be assigned to. */
#define _PyUnicode_UTF8(op)        (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)        (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)      (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)       (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)        (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)    (((PyUnicodeObject*)(op))->data.any)

/* UTF-8 pointer of a ready string.  For a compact ASCII string this is the
   address just past the PyASCIIObject header; otherwise it is the utf8
   member. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     (PyUnicode_IS_COMPACT_ASCII(op)                        \
          ? ((char*)((PyASCIIObject*)(op) + 1))             \
          : _PyUnicode_UTF8(op)))

/* UTF-8 length of a ready string: the length member for a compact ASCII
   string, the utf8_length member otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     (PyUnicode_IS_COMPACT_ASCII(op)                        \
          ? ((PyASCIIObject*)(op))->length                  \
          : _PyUnicode_UTF8_LENGTH(op)))

/* Checked read accessors: assert _PyUnicode_CHECK(op) before reading. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200138
/* File-local replacement for PyUnicode_READY(): asserts
   _PyUnicode_CHECK(op), then evaluates to 0 when the string is already
   ready and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_READY(op) ? _PyUnicode_Ready(op) : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200145
/* True if the utf8 pointer of op is the same address as its character
   data (PyUnicode_DATA).  Must not be used on compact ASCII strings
   (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the same address as its character
   data (PyUnicode_DATA).
   The expansion refers to the macro parameter `op` only; it previously
   read `_PyUnicode_WSTR(unicode)`, which silently depended on the caller
   having a variable named `unicode` and ignored the actual argument. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
153
/* True if the Unicode object owns a separately allocated UTF-8 buffer:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the character data itself (i.e. not shared). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op) &&                 \
     _PyUnicode_UTF8(op) &&                             \
     _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a separately allocated wstr buffer:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not the character data itself (i.e. not shared).
   PyUnicode_DATA() is only evaluated for ready strings. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) &&                             \
     (!PyUnicode_IS_READY(op) ||                        \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
167
/* Generic element-wise conversion between character types.

   from_type, to_type: valid type names of the source and destination
                       characters.
   begin, end:         pointers delimiting the source characters; they are
                       converted to "const from_type *".
   to:                 destination buffer, converted to "to_type *".

   Each source character is cast to to_type and stored in order.  The bulk
   of the input is processed four characters per iteration; the remaining
   zero to three characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                    \
        to_type *_to = (to_type *)(to);                     \
        const from_type *_iter = (const from_type *)(begin);\
        const from_type *_end = (const from_type *)(end);   \
        Py_ssize_t n = (_end) - (_iter);                    \
        const from_type *_unrolled_end =                    \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);              \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                    \
            _to[1] = (to_type) _iter[1];                    \
            _to[2] = (to_type) _iter[2];                    \
            _to[3] = (to_type) _iter[3];                    \
        }                                                   \
        for (; _iter < (_end); ++_iter, ++_to) {            \
            *_to = (to_type) *_iter;                        \
        }                                                   \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200191
/* Platform-tuned over-allocation factor: 2 on Windows (noted as the best
   setting there, overallocating by 50%), 4 elsewhere (noted as the best
   setting on Linux, overallocating by 25%). */
#if defined(MS_WINDOWS)
#  define OVERALLOCATE_FACTOR 2
#else
#  define OVERALLOCATE_FACTOR 4
#endif
199
Walter Dörwald16807132007-05-25 13:52:07 +0000200/* This dictionary holds all interned unicode strings. Note that references
201 to strings in this dictionary are *not* counted in the string's ob_refcnt.
202 When the interned string reaches a refcnt of 0 the string deallocation
203 function will delete the reference from this dictionary.
204
205 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000206 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000207*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200208static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212
Serhiy Storchaka678db842013-01-26 12:16:36 +0200213#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200214 do { \
215 if (unicode_empty != NULL) \
216 Py_INCREF(unicode_empty); \
217 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200218 unicode_empty = PyUnicode_New(0, 0); \
219 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200220 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200221 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
222 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200223 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200224 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000225
Serhiy Storchaka678db842013-01-26 12:16:36 +0200226#define _Py_RETURN_UNICODE_EMPTY() \
227 do { \
228 _Py_INCREF_UNICODE_EMPTY(); \
229 return unicode_empty; \
230 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
Victor Stinner59423e32018-11-26 13:40:01 +0100232static inline void
233unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
234 Py_ssize_t start, Py_ssize_t length)
235{
236 assert(0 <= start);
237 assert(kind != PyUnicode_WCHAR_KIND);
238 switch (kind) {
239 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100240 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100241 Py_UCS1 ch = (unsigned char)value;
242 Py_UCS1 *to = (Py_UCS1 *)data + start;
243 memset(to, ch, length);
244 break;
245 }
246 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100247 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100248 Py_UCS2 ch = (Py_UCS2)value;
249 Py_UCS2 *to = (Py_UCS2 *)data + start;
250 const Py_UCS2 *end = to + length;
251 for (; to < end; ++to) *to = ch;
252 break;
253 }
254 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100255 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100256 Py_UCS4 ch = value;
257 Py_UCS4 * to = (Py_UCS4 *)data + start;
258 const Py_UCS4 *end = to + length;
259 for (; to < end; ++to) *to = ch;
260 break;
261 }
262 default: Py_UNREACHABLE();
263 }
264}
265
266
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700268static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200269_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900270static inline void
271_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400272static PyObject *
273unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
274 const char *errors);
275static PyObject *
276unicode_decode_utf8(const char *s, Py_ssize_t size,
277 _Py_error_handler error_handler, const char *errors,
278 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200279
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200280/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200281static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200282
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000283/* Single character Unicode strings in the Latin-1 range are being
284 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200285static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000286
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code (0..127); an entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
   and 0 for everything else.  Sixteen entries per row below. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
317
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200318/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200319static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200320static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100321static int unicode_modifiable(PyObject *unicode);
322
Victor Stinnerfe226c02011-10-03 03:52:20 +0200323
Alexander Belopolsky40018472011-02-26 01:02:56 +0000324static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100325_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200326static PyObject *
327_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
328static PyObject *
329_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
330
331static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000332unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000333 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100334 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000335 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
336
Alexander Belopolsky40018472011-02-26 01:02:56 +0000337static void
338raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300339 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100340 PyObject *unicode,
341 Py_ssize_t startpos, Py_ssize_t endpos,
342 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000343
/* Same kind of lookup table, for line breaks.

   Indexed by an ASCII code (0..127); an entry is 1 for:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
   and 0 for everything else.  Sixteen entries per row below. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
371
INADA Naoki3ae20562017-01-16 20:41:20 +0900372static int convert_uc(PyObject *obj, void *addr);
373
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300374#include "clinic/unicodeobject.c.h"
375
Victor Stinner3d4226a2018-08-29 22:21:32 +0200376_Py_error_handler
377_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200378{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200379 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200380 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200381 }
382 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200383 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200384 }
385 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200386 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200387 }
388 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200389 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200390 }
391 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200392 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200393 }
394 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200395 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200396 }
397 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 }
Victor Stinner50149202015-09-22 00:26:54 +0200400 return _Py_ERROR_OTHER;
401}
402
Victor Stinner709d23d2019-05-02 14:56:30 -0400403
404static _Py_error_handler
405get_error_handler_wide(const wchar_t *errors)
406{
407 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
408 return _Py_ERROR_STRICT;
409 }
410 if (wcscmp(errors, L"surrogateescape") == 0) {
411 return _Py_ERROR_SURROGATEESCAPE;
412 }
413 if (wcscmp(errors, L"replace") == 0) {
414 return _Py_ERROR_REPLACE;
415 }
416 if (wcscmp(errors, L"ignore") == 0) {
417 return _Py_ERROR_IGNORE;
418 }
419 if (wcscmp(errors, L"backslashreplace") == 0) {
420 return _Py_ERROR_BACKSLASHREPLACE;
421 }
422 if (wcscmp(errors, L"surrogatepass") == 0) {
423 return _Py_ERROR_SURROGATEPASS;
424 }
425 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
426 return _Py_ERROR_XMLCHARREFREPLACE;
427 }
428 return _Py_ERROR_OTHER;
429}
430
431
Victor Stinner22eb6892019-06-26 00:51:05 +0200432static inline int
433unicode_check_encoding_errors(const char *encoding, const char *errors)
434{
435 if (encoding == NULL && errors == NULL) {
436 return 0;
437 }
438
439 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
440#ifndef Py_DEBUG
441 /* In release mode, only check in development mode (-X dev) */
442 if (!interp->config.dev_mode) {
443 return 0;
444 }
445#else
446 /* Always check in debug mode */
447#endif
448
449 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
450 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
451 if (!interp->fs_codec.encoding) {
452 return 0;
453 }
454
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200455 /* Disable checks during Python finalization. For example, it allows to
456 call _PyObject_Dump() during finalization for debugging purpose. */
457 if (interp->finalizing) {
458 return 0;
459 }
460
Victor Stinner22eb6892019-06-26 00:51:05 +0200461 if (encoding != NULL) {
462 PyObject *handler = _PyCodec_Lookup(encoding);
463 if (handler == NULL) {
464 return -1;
465 }
466 Py_DECREF(handler);
467 }
468
469 if (errors != NULL) {
470 PyObject *handler = PyCodec_LookupError(errors);
471 if (handler == NULL) {
472 return -1;
473 }
474 Py_DECREF(handler);
475 }
476 return 0;
477}
478
479
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300480/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
481 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000482Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000483PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000484{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000485#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000486 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000487#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000488 /* This is actually an illegal character, so it should
489 not be passed to unichr. */
490 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000491#endif
492}
493
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200494int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100495_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200496{
Victor Stinner68762572019-10-07 18:42:01 +0200497#define CHECK(expr) \
498 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
499
Victor Stinner910337b2011-10-03 03:20:16 +0200500 PyASCIIObject *ascii;
501 unsigned int kind;
502
Victor Stinner68762572019-10-07 18:42:01 +0200503 assert(op != NULL);
504 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200505
506 ascii = (PyASCIIObject *)op;
507 kind = ascii->state.kind;
508
Victor Stinnera3b334d2011-10-03 13:53:37 +0200509 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200510 CHECK(kind == PyUnicode_1BYTE_KIND);
511 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200512 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200513 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200514 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200515 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200516
Victor Stinnera41463c2011-10-04 01:05:08 +0200517 if (ascii->state.compact == 1) {
518 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200519 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200520 || kind == PyUnicode_2BYTE_KIND
521 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200522 CHECK(ascii->state.ascii == 0);
523 CHECK(ascii->state.ready == 1);
524 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100525 }
526 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200527 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
528
529 data = unicode->data.any;
530 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200531 CHECK(ascii->length == 0);
532 CHECK(ascii->hash == -1);
533 CHECK(ascii->state.compact == 0);
534 CHECK(ascii->state.ascii == 0);
535 CHECK(ascii->state.ready == 0);
536 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
537 CHECK(ascii->wstr != NULL);
538 CHECK(data == NULL);
539 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200540 }
541 else {
Victor Stinner68762572019-10-07 18:42:01 +0200542 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200543 || kind == PyUnicode_2BYTE_KIND
544 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200545 CHECK(ascii->state.compact == 0);
546 CHECK(ascii->state.ready == 1);
547 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200548 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200549 CHECK(compact->utf8 == data);
550 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200551 }
552 else
Victor Stinner68762572019-10-07 18:42:01 +0200553 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200554 }
555 }
556 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200557 if (
558#if SIZEOF_WCHAR_T == 2
559 kind == PyUnicode_2BYTE_KIND
560#else
561 kind == PyUnicode_4BYTE_KIND
562#endif
563 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200564 {
Victor Stinner68762572019-10-07 18:42:01 +0200565 CHECK(ascii->wstr == data);
566 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200567 } else
Victor Stinner68762572019-10-07 18:42:01 +0200568 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200569 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200570
571 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200572 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200573 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200574 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200575 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200576
577 /* check that the best kind is used: O(n) operation */
578 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200579 Py_ssize_t i;
580 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300581 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200582 Py_UCS4 ch;
583
584 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200585 for (i=0; i < ascii->length; i++)
586 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200587 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200588 if (ch > maxchar)
589 maxchar = ch;
590 }
591 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100592 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200593 CHECK(maxchar >= 128);
594 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100595 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200596 else
Victor Stinner68762572019-10-07 18:42:01 +0200597 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200598 }
Victor Stinner77faf692011-11-20 18:56:05 +0100599 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200600 CHECK(maxchar >= 0x100);
601 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100602 }
603 else {
Victor Stinner68762572019-10-07 18:42:01 +0200604 CHECK(maxchar >= 0x10000);
605 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100606 }
Victor Stinner68762572019-10-07 18:42:01 +0200607 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200608 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400609 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200610
611#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400612}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200613
Victor Stinner910337b2011-10-03 03:20:16 +0200614
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100615static PyObject*
616unicode_result_wchar(PyObject *unicode)
617{
618#ifndef Py_DEBUG
619 Py_ssize_t len;
620
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100621 len = _PyUnicode_WSTR_LENGTH(unicode);
622 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100623 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200624 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100625 }
626
627 if (len == 1) {
628 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100629 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100630 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
631 Py_DECREF(unicode);
632 return latin1_char;
633 }
634 }
635
636 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200637 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 return NULL;
639 }
640#else
Victor Stinneraa771272012-10-04 02:32:58 +0200641 assert(Py_REFCNT(unicode) == 1);
642
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100643 /* don't make the result ready in debug mode to ensure that the caller
644 makes the string ready before using it */
645 assert(_PyUnicode_CheckConsistency(unicode, 1));
646#endif
647 return unicode;
648}
649
650static PyObject*
651unicode_result_ready(PyObject *unicode)
652{
653 Py_ssize_t length;
654
655 length = PyUnicode_GET_LENGTH(unicode);
656 if (length == 0) {
657 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100658 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200659 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100660 }
661 return unicode_empty;
662 }
663
664 if (length == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300665 const void *data = PyUnicode_DATA(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +0200666 int kind = PyUnicode_KIND(unicode);
667 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100668 if (ch < 256) {
669 PyObject *latin1_char = unicode_latin1[ch];
670 if (latin1_char != NULL) {
671 if (unicode != latin1_char) {
672 Py_INCREF(latin1_char);
673 Py_DECREF(unicode);
674 }
675 return latin1_char;
676 }
677 else {
678 assert(_PyUnicode_CheckConsistency(unicode, 1));
679 Py_INCREF(unicode);
680 unicode_latin1[ch] = unicode;
681 return unicode;
682 }
683 }
684 }
685
686 assert(_PyUnicode_CheckConsistency(unicode, 1));
687 return unicode;
688}
689
690static PyObject*
691unicode_result(PyObject *unicode)
692{
693 assert(_PyUnicode_CHECK(unicode));
694 if (PyUnicode_IS_READY(unicode))
695 return unicode_result_ready(unicode);
696 else
697 return unicode_result_wchar(unicode);
698}
699
Victor Stinnerc4b49542011-12-11 22:44:26 +0100700static PyObject*
701unicode_result_unchanged(PyObject *unicode)
702{
703 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500704 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100705 return NULL;
706 Py_INCREF(unicode);
707 return unicode;
708 }
709 else
710 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100711 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100712}
713
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200714/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
715 ASCII, Latin1, UTF-8, etc. */
716static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200717backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200718 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
719{
Victor Stinnerad771582015-10-09 12:38:53 +0200720 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200721 Py_UCS4 ch;
722 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300723 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200724
725 assert(PyUnicode_IS_READY(unicode));
726 kind = PyUnicode_KIND(unicode);
727 data = PyUnicode_DATA(unicode);
728
729 size = 0;
730 /* determine replacement size */
731 for (i = collstart; i < collend; ++i) {
732 Py_ssize_t incr;
733
734 ch = PyUnicode_READ(kind, data, i);
735 if (ch < 0x100)
736 incr = 2+2;
737 else if (ch < 0x10000)
738 incr = 2+4;
739 else {
740 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200741 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200742 }
743 if (size > PY_SSIZE_T_MAX - incr) {
744 PyErr_SetString(PyExc_OverflowError,
745 "encoded result is too long for a Python string");
746 return NULL;
747 }
748 size += incr;
749 }
750
Victor Stinnerad771582015-10-09 12:38:53 +0200751 str = _PyBytesWriter_Prepare(writer, str, size);
752 if (str == NULL)
753 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200754
755 /* generate replacement */
756 for (i = collstart; i < collend; ++i) {
757 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200758 *str++ = '\\';
759 if (ch >= 0x00010000) {
760 *str++ = 'U';
761 *str++ = Py_hexdigits[(ch>>28)&0xf];
762 *str++ = Py_hexdigits[(ch>>24)&0xf];
763 *str++ = Py_hexdigits[(ch>>20)&0xf];
764 *str++ = Py_hexdigits[(ch>>16)&0xf];
765 *str++ = Py_hexdigits[(ch>>12)&0xf];
766 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200767 }
Victor Stinner797485e2015-10-09 03:17:30 +0200768 else if (ch >= 0x100) {
769 *str++ = 'u';
770 *str++ = Py_hexdigits[(ch>>12)&0xf];
771 *str++ = Py_hexdigits[(ch>>8)&0xf];
772 }
773 else
774 *str++ = 'x';
775 *str++ = Py_hexdigits[(ch>>4)&0xf];
776 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777 }
778 return str;
779}
780
781/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
782 ASCII, Latin1, UTF-8, etc. */
783static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200784xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200785 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
786{
Victor Stinnerad771582015-10-09 12:38:53 +0200787 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200788 Py_UCS4 ch;
789 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300790 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200791
792 assert(PyUnicode_IS_READY(unicode));
793 kind = PyUnicode_KIND(unicode);
794 data = PyUnicode_DATA(unicode);
795
796 size = 0;
797 /* determine replacement size */
798 for (i = collstart; i < collend; ++i) {
799 Py_ssize_t incr;
800
801 ch = PyUnicode_READ(kind, data, i);
802 if (ch < 10)
803 incr = 2+1+1;
804 else if (ch < 100)
805 incr = 2+2+1;
806 else if (ch < 1000)
807 incr = 2+3+1;
808 else if (ch < 10000)
809 incr = 2+4+1;
810 else if (ch < 100000)
811 incr = 2+5+1;
812 else if (ch < 1000000)
813 incr = 2+6+1;
814 else {
815 assert(ch <= MAX_UNICODE);
816 incr = 2+7+1;
817 }
818 if (size > PY_SSIZE_T_MAX - incr) {
819 PyErr_SetString(PyExc_OverflowError,
820 "encoded result is too long for a Python string");
821 return NULL;
822 }
823 size += incr;
824 }
825
Victor Stinnerad771582015-10-09 12:38:53 +0200826 str = _PyBytesWriter_Prepare(writer, str, size);
827 if (str == NULL)
828 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200829
830 /* generate replacement */
831 for (i = collstart; i < collend; ++i) {
832 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
833 }
834 return str;
835}
836
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask of BLOOM_WIDTH bits; the
   low bits of each unicode character (ch & (BLOOM_WIDTH - 1)) select the
   bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by `ch` is set in `mask`.  Both arguments are
   parenthesized so that any expression can be passed safely. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up directly in
   ascii_linebreak[]; for the others the bloom mask is checked first and
   Py_UNICODE_ISLINEBREAK() is only consulted when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000864
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700865static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300866make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000867{
Victor Stinnera85af502013-04-09 21:53:54 +0200868#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
869 do { \
870 TYPE *data = (TYPE *)PTR; \
871 TYPE *end = data + LEN; \
872 Py_UCS4 ch; \
873 for (; data != end; data++) { \
874 ch = *data; \
875 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
876 } \
877 break; \
878 } while (0)
879
Thomas Wouters477c8d52006-05-27 19:21:47 +0000880 /* calculate simple bloom-style bitmask for a given unicode string */
881
Antoine Pitrouf068f942010-01-13 14:19:12 +0000882 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000883
884 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200885 switch (kind) {
886 case PyUnicode_1BYTE_KIND:
887 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
888 break;
889 case PyUnicode_2BYTE_KIND:
890 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
891 break;
892 case PyUnicode_4BYTE_KIND:
893 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
894 break;
895 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700896 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200897 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000898 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200899
900#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000901}
902
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300903static int
904ensure_unicode(PyObject *obj)
905{
906 if (!PyUnicode_Check(obj)) {
907 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200908 "must be str, not %.100s",
909 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300910 return -1;
911 }
912 return PyUnicode_READY(obj);
913}
914
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200915/* Compilation of templated routines */
916
917#include "stringlib/asciilib.h"
918#include "stringlib/fastsearch.h"
919#include "stringlib/partition.h"
920#include "stringlib/split.h"
921#include "stringlib/count.h"
922#include "stringlib/find.h"
923#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200924#include "stringlib/undef.h"
925
926#include "stringlib/ucs1lib.h"
927#include "stringlib/fastsearch.h"
928#include "stringlib/partition.h"
929#include "stringlib/split.h"
930#include "stringlib/count.h"
931#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300932#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200933#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200934#include "stringlib/undef.h"
935
936#include "stringlib/ucs2lib.h"
937#include "stringlib/fastsearch.h"
938#include "stringlib/partition.h"
939#include "stringlib/split.h"
940#include "stringlib/count.h"
941#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300942#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200943#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200944#include "stringlib/undef.h"
945
946#include "stringlib/ucs4lib.h"
947#include "stringlib/fastsearch.h"
948#include "stringlib/partition.h"
949#include "stringlib/split.h"
950#include "stringlib/count.h"
951#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300952#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200953#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200954#include "stringlib/undef.h"
955
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200956#include "stringlib/unicodedefs.h"
957#include "stringlib/fastsearch.h"
958#include "stringlib/count.h"
959#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100960#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200961
Guido van Rossumd57fd912000-03-10 22:53:23 +0000962/* --- Unicode Object ----------------------------------------------------- */
963
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700964static inline Py_ssize_t
965findchar(const void *s, int kind,
966 Py_ssize_t size, Py_UCS4 ch,
967 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200969 switch (kind) {
970 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200971 if ((Py_UCS1) ch != ch)
972 return -1;
973 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600974 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200975 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600976 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200977 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200978 if ((Py_UCS2) ch != ch)
979 return -1;
980 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600981 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200982 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600983 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200984 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200985 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600986 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200987 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600988 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200989 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700990 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992}
993
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten, every byte being set to 0xff; nothing happens when the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind doubles as the size of one character in bytes */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1012
Victor Stinnerfe226c02011-10-03 03:52:20 +02001013static PyObject*
1014resize_compact(PyObject *unicode, Py_ssize_t length)
1015{
1016 Py_ssize_t char_size;
1017 Py_ssize_t struct_size;
1018 Py_ssize_t new_size;
1019 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001020 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001021#ifdef Py_DEBUG
1022 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1023#endif
1024
Victor Stinner79891572012-05-03 13:43:07 +02001025 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001026 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001027 assert(PyUnicode_IS_COMPACT(unicode));
1028
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001029 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001030 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001031 struct_size = sizeof(PyASCIIObject);
1032 else
1033 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001034 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001035
Victor Stinnerfe226c02011-10-03 03:52:20 +02001036 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1037 PyErr_NoMemory();
1038 return NULL;
1039 }
1040 new_size = (struct_size + (length + 1) * char_size);
1041
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001042 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1043 PyObject_DEL(_PyUnicode_UTF8(unicode));
1044 _PyUnicode_UTF8(unicode) = NULL;
1045 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1046 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001047#ifdef Py_REF_DEBUG
1048 _Py_RefTotal--;
1049#endif
1050#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001051 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001052#endif
Victor Stinner84def372011-12-11 20:04:56 +01001053
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001054 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001055 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001056 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001057 PyErr_NoMemory();
1058 return NULL;
1059 }
Victor Stinner84def372011-12-11 20:04:56 +01001060 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001061 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001062
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001064 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001065 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001066 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001067 _PyUnicode_WSTR_LENGTH(unicode) = length;
1068 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001069 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1070 PyObject_DEL(_PyUnicode_WSTR(unicode));
1071 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001072 if (!PyUnicode_IS_ASCII(unicode))
1073 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001074 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001075#ifdef Py_DEBUG
1076 unicode_fill_invalid(unicode, old_length);
1077#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1079 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001080 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001081 return unicode;
1082}
1083
Alexander Belopolsky40018472011-02-26 01:02:56 +00001084static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001085resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086{
Victor Stinner95663112011-10-04 01:03:50 +02001087 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001088 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001089 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001090 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001091
Victor Stinnerfe226c02011-10-03 03:52:20 +02001092 if (PyUnicode_IS_READY(unicode)) {
1093 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001094 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001096#ifdef Py_DEBUG
1097 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1098#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001099
1100 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001101 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001102 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1103 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001104
1105 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1106 PyErr_NoMemory();
1107 return -1;
1108 }
1109 new_size = (length + 1) * char_size;
1110
Victor Stinner7a9105a2011-12-12 00:13:42 +01001111 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1112 {
1113 PyObject_DEL(_PyUnicode_UTF8(unicode));
1114 _PyUnicode_UTF8(unicode) = NULL;
1115 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1116 }
1117
Victor Stinnerfe226c02011-10-03 03:52:20 +02001118 data = (PyObject *)PyObject_REALLOC(data, new_size);
1119 if (data == NULL) {
1120 PyErr_NoMemory();
1121 return -1;
1122 }
1123 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001124 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001125 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001126 _PyUnicode_WSTR_LENGTH(unicode) = length;
1127 }
1128 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001129 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001130 _PyUnicode_UTF8_LENGTH(unicode) = length;
1131 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001132 _PyUnicode_LENGTH(unicode) = length;
1133 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001134#ifdef Py_DEBUG
1135 unicode_fill_invalid(unicode, old_length);
1136#endif
Victor Stinner95663112011-10-04 01:03:50 +02001137 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001138 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001139 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001140 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001141 }
Victor Stinner95663112011-10-04 01:03:50 +02001142 assert(_PyUnicode_WSTR(unicode) != NULL);
1143
1144 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001145 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001146 PyErr_NoMemory();
1147 return -1;
1148 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001149 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001150 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001151 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001152 if (!wstr) {
1153 PyErr_NoMemory();
1154 return -1;
1155 }
1156 _PyUnicode_WSTR(unicode) = wstr;
1157 _PyUnicode_WSTR(unicode)[length] = 0;
1158 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001159 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160 return 0;
1161}
1162
Victor Stinnerfe226c02011-10-03 03:52:20 +02001163static PyObject*
1164resize_copy(PyObject *unicode, Py_ssize_t length)
1165{
1166 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001167 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001168 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001169
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001170 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001171
1172 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1173 if (copy == NULL)
1174 return NULL;
1175
1176 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001177 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001178 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001179 }
1180 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001181 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001182
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001183 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001184 if (w == NULL)
1185 return NULL;
1186 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1187 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001188 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001189 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001190 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001191 }
1192}
1193
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001195 Ux0000 terminated; some code (e.g. new_identifier)
1196 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197
1198 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001199 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200
1201*/
1202
Alexander Belopolsky40018472011-02-26 01:02:56 +00001203static PyUnicodeObject *
1204_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001206 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208
Thomas Wouters477c8d52006-05-27 19:21:47 +00001209 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210 if (length == 0 && unicode_empty != NULL) {
1211 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001212 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 }
1214
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001215 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001216 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001217 return (PyUnicodeObject *)PyErr_NoMemory();
1218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 if (length < 0) {
1220 PyErr_SetString(PyExc_SystemError,
1221 "Negative size passed to _PyUnicode_New");
1222 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 }
1224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001225 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1226 if (unicode == NULL)
1227 return NULL;
1228 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001229
1230 _PyUnicode_WSTR_LENGTH(unicode) = length;
1231 _PyUnicode_HASH(unicode) = -1;
1232 _PyUnicode_STATE(unicode).interned = 0;
1233 _PyUnicode_STATE(unicode).kind = 0;
1234 _PyUnicode_STATE(unicode).compact = 0;
1235 _PyUnicode_STATE(unicode).ready = 0;
1236 _PyUnicode_STATE(unicode).ascii = 0;
1237 _PyUnicode_DATA_ANY(unicode) = NULL;
1238 _PyUnicode_LENGTH(unicode) = 0;
1239 _PyUnicode_UTF8(unicode) = NULL;
1240 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1243 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001244 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001245 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001246 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248
Jeremy Hyltond8082792003-09-16 19:41:39 +00001249 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001250 * the caller fails before initializing str -- unicode_resize()
1251 * reads str[0], and the Keep-Alive optimization can keep memory
1252 * allocated for str alive across a call to unicode_dealloc(unicode).
1253 * We don't want unicode_resize to read uninitialized memory in
1254 * that case.
1255 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001256 _PyUnicode_WSTR(unicode)[0] = 0;
1257 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001258
Victor Stinner7931d9a2011-11-04 00:22:48 +01001259 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260 return unicode;
1261}
1262
Victor Stinnerf42dc442011-10-02 23:33:16 +02001263static const char*
1264unicode_kind_name(PyObject *unicode)
1265{
Victor Stinner42dfd712011-10-03 14:41:45 +02001266 /* don't check consistency: unicode_kind_name() is called from
1267 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001268 if (!PyUnicode_IS_COMPACT(unicode))
1269 {
1270 if (!PyUnicode_IS_READY(unicode))
1271 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001272 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001273 {
1274 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001275 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001276 return "legacy ascii";
1277 else
1278 return "legacy latin1";
1279 case PyUnicode_2BYTE_KIND:
1280 return "legacy UCS2";
1281 case PyUnicode_4BYTE_KIND:
1282 return "legacy UCS4";
1283 default:
1284 return "<legacy invalid kind>";
1285 }
1286 }
1287 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001288 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001290 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001291 return "ascii";
1292 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001293 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001294 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001295 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001296 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001297 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001298 default:
1299 return "<invalid compact kind>";
1300 }
1301}
1302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001305const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001306 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001307 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308}
1309
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001310const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001311 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 return _PyUnicode_COMPACT_DATA(unicode);
1313}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001314const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001315 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001316 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1318 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1319 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1320 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1321 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1322 return PyUnicode_DATA(unicode);
1323}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001324
1325void
1326_PyUnicode_Dump(PyObject *op)
1327{
1328 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001329 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1330 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001331 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001332
Victor Stinnera849a4b2011-10-03 12:12:11 +02001333 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001334 {
1335 if (ascii->state.ascii)
1336 data = (ascii + 1);
1337 else
1338 data = (compact + 1);
1339 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001340 else
1341 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001342 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1343 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001344
Victor Stinnera849a4b2011-10-03 12:12:11 +02001345 if (ascii->wstr == data)
1346 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001347 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001348
Victor Stinnera3b334d2011-10-03 13:53:37 +02001349 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001350 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001351 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1352 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001353 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001354 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001355 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001356 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001357}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358#endif
1359
1360PyObject *
1361PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1362{
1363 PyObject *obj;
1364 PyCompactUnicodeObject *unicode;
1365 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001366 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001367 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 Py_ssize_t char_size;
1369 Py_ssize_t struct_size;
1370
1371 /* Optimization for empty strings */
1372 if (size == 0 && unicode_empty != NULL) {
1373 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001374 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 }
1376
Victor Stinner9e9d6892011-10-04 01:02:02 +02001377 is_ascii = 0;
1378 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001379 struct_size = sizeof(PyCompactUnicodeObject);
1380 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001381 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 char_size = 1;
1383 is_ascii = 1;
1384 struct_size = sizeof(PyASCIIObject);
1385 }
1386 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001387 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388 char_size = 1;
1389 }
1390 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001391 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 char_size = 2;
1393 if (sizeof(wchar_t) == 2)
1394 is_sharing = 1;
1395 }
1396 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001397 if (maxchar > MAX_UNICODE) {
1398 PyErr_SetString(PyExc_SystemError,
1399 "invalid maximum character passed to PyUnicode_New");
1400 return NULL;
1401 }
Victor Stinner8f825062012-04-27 13:55:39 +02001402 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 char_size = 4;
1404 if (sizeof(wchar_t) == 4)
1405 is_sharing = 1;
1406 }
1407
1408 /* Ensure we won't overflow the size. */
1409 if (size < 0) {
1410 PyErr_SetString(PyExc_SystemError,
1411 "Negative size passed to PyUnicode_New");
1412 return NULL;
1413 }
1414 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1415 return PyErr_NoMemory();
1416
1417 /* Duplicated allocation code from _PyObject_New() instead of a call to
1418 * PyObject_New() so we are able to allocate space for the object and
1419 * it's data buffer.
1420 */
1421 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1422 if (obj == NULL)
1423 return PyErr_NoMemory();
1424 obj = PyObject_INIT(obj, &PyUnicode_Type);
1425 if (obj == NULL)
1426 return NULL;
1427
1428 unicode = (PyCompactUnicodeObject *)obj;
1429 if (is_ascii)
1430 data = ((PyASCIIObject*)obj) + 1;
1431 else
1432 data = unicode + 1;
1433 _PyUnicode_LENGTH(unicode) = size;
1434 _PyUnicode_HASH(unicode) = -1;
1435 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001436 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 _PyUnicode_STATE(unicode).compact = 1;
1438 _PyUnicode_STATE(unicode).ready = 1;
1439 _PyUnicode_STATE(unicode).ascii = is_ascii;
1440 if (is_ascii) {
1441 ((char*)data)[size] = 0;
1442 _PyUnicode_WSTR(unicode) = NULL;
1443 }
Victor Stinner8f825062012-04-27 13:55:39 +02001444 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 ((char*)data)[size] = 0;
1446 _PyUnicode_WSTR(unicode) = NULL;
1447 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001449 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451 else {
1452 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001453 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001454 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001456 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 ((Py_UCS4*)data)[size] = 0;
1458 if (is_sharing) {
1459 _PyUnicode_WSTR_LENGTH(unicode) = size;
1460 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1461 }
1462 else {
1463 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1464 _PyUnicode_WSTR(unicode) = NULL;
1465 }
1466 }
Victor Stinner8f825062012-04-27 13:55:39 +02001467#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001468 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001469#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001470 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 return obj;
1472}
1473
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the
   result in the 4-byte data buffer of `unicode`.  A high surrogate that
   is immediately followed by a low surrogate is combined into a single
   code point; every other unit (including a lone surrogate) is copied
   unchanged.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        /* src[1] is only examined when it lies inside the input. */
        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src + 1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (!is_pair) {
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1513
Victor Stinnercd9950f2011-10-02 00:34:53 +02001514static int
Victor Stinner488fa492011-12-12 00:01:39 +01001515unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001516{
Victor Stinner488fa492011-12-12 00:01:39 +01001517 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001518 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001519 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001520 return -1;
1521 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001522 return 0;
1523}
1524
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001525static int
1526_copy_characters(PyObject *to, Py_ssize_t to_start,
1527 PyObject *from, Py_ssize_t from_start,
1528 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001530 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001531 const void *from_data;
1532 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533
Victor Stinneree4544c2012-05-09 22:24:08 +02001534 assert(0 <= how_many);
1535 assert(0 <= from_start);
1536 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001537 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001539 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540
Victor Stinnerd3f08822012-05-29 12:57:52 +02001541 assert(PyUnicode_Check(to));
1542 assert(PyUnicode_IS_READY(to));
1543 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1544
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001545 if (how_many == 0)
1546 return 0;
1547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001549 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001551 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552
Victor Stinnerf1852262012-06-16 16:38:26 +02001553#ifdef Py_DEBUG
1554 if (!check_maxchar
1555 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1556 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001557 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001558 Py_UCS4 ch;
1559 Py_ssize_t i;
1560 for (i=0; i < how_many; i++) {
1561 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1562 assert(ch <= to_maxchar);
1563 }
1564 }
1565#endif
1566
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001567 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001568 if (check_maxchar
1569 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1570 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001571 /* Writing Latin-1 characters into an ASCII string requires to
1572 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001573 Py_UCS4 max_char;
1574 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001575 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001576 if (max_char >= 128)
1577 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001578 }
Christian Heimesf051e432016-09-13 20:22:02 +02001579 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001580 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001581 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001582 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001583 else if (from_kind == PyUnicode_1BYTE_KIND
1584 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001585 {
1586 _PyUnicode_CONVERT_BYTES(
1587 Py_UCS1, Py_UCS2,
1588 PyUnicode_1BYTE_DATA(from) + from_start,
1589 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1590 PyUnicode_2BYTE_DATA(to) + to_start
1591 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001592 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001593 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001594 && to_kind == PyUnicode_4BYTE_KIND)
1595 {
1596 _PyUnicode_CONVERT_BYTES(
1597 Py_UCS1, Py_UCS4,
1598 PyUnicode_1BYTE_DATA(from) + from_start,
1599 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1600 PyUnicode_4BYTE_DATA(to) + to_start
1601 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001602 }
1603 else if (from_kind == PyUnicode_2BYTE_KIND
1604 && to_kind == PyUnicode_4BYTE_KIND)
1605 {
1606 _PyUnicode_CONVERT_BYTES(
1607 Py_UCS2, Py_UCS4,
1608 PyUnicode_2BYTE_DATA(from) + from_start,
1609 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1610 PyUnicode_4BYTE_DATA(to) + to_start
1611 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001612 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001613 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001614 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1615
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001616 if (!check_maxchar) {
1617 if (from_kind == PyUnicode_2BYTE_KIND
1618 && to_kind == PyUnicode_1BYTE_KIND)
1619 {
1620 _PyUnicode_CONVERT_BYTES(
1621 Py_UCS2, Py_UCS1,
1622 PyUnicode_2BYTE_DATA(from) + from_start,
1623 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1624 PyUnicode_1BYTE_DATA(to) + to_start
1625 );
1626 }
1627 else if (from_kind == PyUnicode_4BYTE_KIND
1628 && to_kind == PyUnicode_1BYTE_KIND)
1629 {
1630 _PyUnicode_CONVERT_BYTES(
1631 Py_UCS4, Py_UCS1,
1632 PyUnicode_4BYTE_DATA(from) + from_start,
1633 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1634 PyUnicode_1BYTE_DATA(to) + to_start
1635 );
1636 }
1637 else if (from_kind == PyUnicode_4BYTE_KIND
1638 && to_kind == PyUnicode_2BYTE_KIND)
1639 {
1640 _PyUnicode_CONVERT_BYTES(
1641 Py_UCS4, Py_UCS2,
1642 PyUnicode_4BYTE_DATA(from) + from_start,
1643 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1644 PyUnicode_2BYTE_DATA(to) + to_start
1645 );
1646 }
1647 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001648 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001649 }
1650 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001651 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001652 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001653 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001654 Py_ssize_t i;
1655
Victor Stinnera0702ab2011-09-29 14:14:38 +02001656 for (i=0; i < how_many; i++) {
1657 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001658 if (ch > to_maxchar)
1659 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001660 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1661 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001662 }
1663 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001664 return 0;
1665}
1666
Victor Stinnerd3f08822012-05-29 12:57:52 +02001667void
1668_PyUnicode_FastCopyCharacters(
1669 PyObject *to, Py_ssize_t to_start,
1670 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001671{
1672 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1673}
1674
1675Py_ssize_t
1676PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1677 PyObject *from, Py_ssize_t from_start,
1678 Py_ssize_t how_many)
1679{
1680 int err;
1681
1682 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1683 PyErr_BadInternalCall();
1684 return -1;
1685 }
1686
Benjamin Petersonbac79492012-01-14 13:34:47 -05001687 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001688 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001689 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001690 return -1;
1691
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001692 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001693 PyErr_SetString(PyExc_IndexError, "string index out of range");
1694 return -1;
1695 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001696 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001697 PyErr_SetString(PyExc_IndexError, "string index out of range");
1698 return -1;
1699 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001700 if (how_many < 0) {
1701 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1702 return -1;
1703 }
1704 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001705 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1706 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001707 "Cannot write %zi characters at %zi "
1708 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001709 how_many, to_start, PyUnicode_GET_LENGTH(to));
1710 return -1;
1711 }
1712
1713 if (how_many == 0)
1714 return 0;
1715
Victor Stinner488fa492011-12-12 00:01:39 +01001716 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001717 return -1;
1718
1719 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1720 if (err) {
1721 PyErr_Format(PyExc_SystemError,
1722 "Cannot copy %s characters "
1723 "into a string of %s characters",
1724 unicode_kind_name(from),
1725 unicode_kind_name(to));
1726 return -1;
1727 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001728 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729}
1730
Victor Stinner17222162011-09-28 22:15:37 +02001731/* Find the maximum code point and count the number of surrogate pairs so a
1732 correct string length can be computed before converting a string to UCS4.
1733 This function counts single surrogates as a character and not as a pair.
1734
1735 Return 0 on success, or -1 on error. */
1736static int
1737find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1738 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739{
1740 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001741 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742
Victor Stinnerc53be962011-10-02 21:33:54 +02001743 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 *num_surrogates = 0;
1745 *maxchar = 0;
1746
1747 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001749 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1750 && (iter+1) < end
1751 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1752 {
1753 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1754 ++(*num_surrogates);
1755 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 }
1757 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001759 {
1760 ch = *iter;
1761 iter++;
1762 }
1763 if (ch > *maxchar) {
1764 *maxchar = ch;
1765 if (*maxchar > MAX_UNICODE) {
1766 PyErr_Format(PyExc_ValueError,
1767 "character U+%x is not in range [U+0000; U+10ffff]",
1768 ch);
1769 return -1;
1770 }
1771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772 }
1773 return 0;
1774}
1775
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001776int
1777_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778{
1779 wchar_t *end;
1780 Py_UCS4 maxchar = 0;
1781 Py_ssize_t num_surrogates;
1782#if SIZEOF_WCHAR_T == 2
1783 Py_ssize_t length_wo_surrogates;
1784#endif
1785
Georg Brandl7597add2011-10-05 16:36:47 +02001786 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001787 strings were created using _PyObject_New() and where no canonical
1788 representation (the str field) has been set yet aka strings
1789 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001790 assert(_PyUnicode_CHECK(unicode));
1791 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001793 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001794 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001795 /* Actually, it should neither be interned nor be anything else: */
1796 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001799 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001800 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802
1803 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001804 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1805 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001806 PyErr_NoMemory();
1807 return -1;
1808 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001809 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 _PyUnicode_WSTR(unicode), end,
1811 PyUnicode_1BYTE_DATA(unicode));
1812 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1813 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1814 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1815 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001816 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001817 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001818 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 }
1820 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001821 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001822 _PyUnicode_UTF8(unicode) = NULL;
1823 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 }
1825 PyObject_FREE(_PyUnicode_WSTR(unicode));
1826 _PyUnicode_WSTR(unicode) = NULL;
1827 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1828 }
1829 /* In this case we might have to convert down from 4-byte native
1830 wchar_t to 2-byte unicode. */
1831 else if (maxchar < 65536) {
1832 assert(num_surrogates == 0 &&
1833 "FindMaxCharAndNumSurrogatePairs() messed up");
1834
Victor Stinner506f5922011-09-28 22:34:18 +02001835#if SIZEOF_WCHAR_T == 2
1836 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001838 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1839 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1840 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001841 _PyUnicode_UTF8(unicode) = NULL;
1842 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001843#else
1844 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001845 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001846 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001847 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001848 PyErr_NoMemory();
1849 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850 }
Victor Stinner506f5922011-09-28 22:34:18 +02001851 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1852 _PyUnicode_WSTR(unicode), end,
1853 PyUnicode_2BYTE_DATA(unicode));
1854 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1855 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1856 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001857 _PyUnicode_UTF8(unicode) = NULL;
1858 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001859 PyObject_FREE(_PyUnicode_WSTR(unicode));
1860 _PyUnicode_WSTR(unicode) = NULL;
1861 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1862#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001863 }
1864 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1865 else {
1866#if SIZEOF_WCHAR_T == 2
1867 /* in case the native representation is 2-bytes, we need to allocate a
1868 new normalized 4-byte version. */
1869 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001870 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1871 PyErr_NoMemory();
1872 return -1;
1873 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001874 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1875 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001876 PyErr_NoMemory();
1877 return -1;
1878 }
1879 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1880 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001881 _PyUnicode_UTF8(unicode) = NULL;
1882 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001883 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1884 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001885 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 PyObject_FREE(_PyUnicode_WSTR(unicode));
1887 _PyUnicode_WSTR(unicode) = NULL;
1888 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1889#else
1890 assert(num_surrogates == 0);
1891
Victor Stinnerc3c74152011-10-02 20:39:55 +02001892 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001893 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001894 _PyUnicode_UTF8(unicode) = NULL;
1895 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1897#endif
1898 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1899 }
1900 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001901 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 return 0;
1903}
1904
Alexander Belopolsky40018472011-02-26 01:02:56 +00001905static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001906unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907{
Walter Dörwald16807132007-05-25 13:52:07 +00001908 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001909 case SSTATE_NOT_INTERNED:
1910 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001911
Benjamin Peterson29060642009-01-31 22:14:21 +00001912 case SSTATE_INTERNED_MORTAL:
1913 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001914 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001915 if (PyDict_DelItem(interned, unicode) != 0) {
1916 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1917 NULL);
1918 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001919 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001920
Benjamin Peterson29060642009-01-31 22:14:21 +00001921 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001922 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1923 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001924
Benjamin Peterson29060642009-01-31 22:14:21 +00001925 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001926 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001927 }
1928
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001929 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001931 }
1932 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001933 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001934 }
1935 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001936 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001937 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001938
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001939 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940}
1941
#ifdef Py_DEBUG
/* Return 1 if `unicode` is the shared empty string or the entry of the
   unicode_latin1 table for its single character, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* Only a one-character string that is not of the wchar kind can be
       a Latin-1 singleton. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }
    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode) {
        return 1;
    }
    return 0;
}
#endif
1958
Alexander Belopolsky40018472011-02-26 01:02:56 +00001959static int
Victor Stinner488fa492011-12-12 00:01:39 +01001960unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001961{
Victor Stinner488fa492011-12-12 00:01:39 +01001962 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001963 if (Py_REFCNT(unicode) != 1)
1964 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001965 if (_PyUnicode_HASH(unicode) != -1)
1966 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001967 if (PyUnicode_CHECK_INTERNED(unicode))
1968 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001969 if (!PyUnicode_CheckExact(unicode))
1970 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001971#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001972 /* singleton refcount is greater than 1 */
1973 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001974#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001975 return 1;
1976}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001977
Victor Stinnerfe226c02011-10-03 03:52:20 +02001978static int
1979unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1980{
1981 PyObject *unicode;
1982 Py_ssize_t old_length;
1983
1984 assert(p_unicode != NULL);
1985 unicode = *p_unicode;
1986
1987 assert(unicode != NULL);
1988 assert(PyUnicode_Check(unicode));
1989 assert(0 <= length);
1990
Victor Stinner910337b2011-10-03 03:20:16 +02001991 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001992 old_length = PyUnicode_WSTR_LENGTH(unicode);
1993 else
1994 old_length = PyUnicode_GET_LENGTH(unicode);
1995 if (old_length == length)
1996 return 0;
1997
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001998 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001999 _Py_INCREF_UNICODE_EMPTY();
2000 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00002001 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002002 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002003 return 0;
2004 }
2005
Victor Stinner488fa492011-12-12 00:01:39 +01002006 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002007 PyObject *copy = resize_copy(unicode, length);
2008 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002009 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002010 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002011 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002012 }
2013
Victor Stinnerfe226c02011-10-03 03:52:20 +02002014 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002015 PyObject *new_unicode = resize_compact(unicode, length);
2016 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002017 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002018 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002019 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002020 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002021 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002022}
2023
Alexander Belopolsky40018472011-02-26 01:02:56 +00002024int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002025PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002026{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002027 PyObject *unicode;
2028 if (p_unicode == NULL) {
2029 PyErr_BadInternalCall();
2030 return -1;
2031 }
2032 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002033 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002034 {
2035 PyErr_BadInternalCall();
2036 return -1;
2037 }
2038 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002039}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002040
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002041/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002042
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002043 WARNING: The function doesn't copy the terminating null character and
2044 doesn't check the maximum character (may write a latin1 character in an
2045 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002046static void
2047unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2048 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002049{
2050 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002051 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002052 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002053
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002054 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002055 switch (kind) {
2056 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002057#ifdef Py_DEBUG
2058 if (PyUnicode_IS_ASCII(unicode)) {
2059 Py_UCS4 maxchar = ucs1lib_find_max_char(
2060 (const Py_UCS1*)str,
2061 (const Py_UCS1*)str + len);
2062 assert(maxchar < 128);
2063 }
2064#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002065 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002066 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002067 }
2068 case PyUnicode_2BYTE_KIND: {
2069 Py_UCS2 *start = (Py_UCS2 *)data + index;
2070 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002071
Victor Stinner184252a2012-06-16 02:57:41 +02002072 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002073 *ucs2 = (Py_UCS2)*str;
2074
2075 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002076 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002077 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002078 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002079 Py_UCS4 *start = (Py_UCS4 *)data + index;
2080 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002081
Victor Stinner184252a2012-06-16 02:57:41 +02002082 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002083 *ucs4 = (Py_UCS4)*str;
2084
2085 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002086 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002087 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002088 default:
2089 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002090 }
2091}
2092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093static PyObject*
2094get_latin1_char(unsigned char ch)
2095{
Victor Stinnera464fc12011-10-02 20:39:30 +02002096 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002097 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002098 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099 if (!unicode)
2100 return NULL;
2101 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002102 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002103 unicode_latin1[ch] = unicode;
2104 }
2105 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002106 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002107}
2108
Victor Stinner985a82a2014-01-03 12:53:47 +01002109static PyObject*
2110unicode_char(Py_UCS4 ch)
2111{
2112 PyObject *unicode;
2113
2114 assert(ch <= MAX_UNICODE);
2115
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002116 if (ch < 256)
2117 return get_latin1_char(ch);
2118
Victor Stinner985a82a2014-01-03 12:53:47 +01002119 unicode = PyUnicode_New(1, ch);
2120 if (unicode == NULL)
2121 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002122
2123 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2124 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002125 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002126 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002127 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2128 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2129 }
2130 assert(_PyUnicode_CheckConsistency(unicode, 1));
2131 return unicode;
2132}
2133
Alexander Belopolsky40018472011-02-26 01:02:56 +00002134PyObject *
2135PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002137 if (u == NULL)
2138 return (PyObject*)_PyUnicode_New(size);
2139
2140 if (size < 0) {
2141 PyErr_BadInternalCall();
2142 return NULL;
2143 }
2144
2145 return PyUnicode_FromWideChar(u, size);
2146}
2147
2148PyObject *
2149PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2150{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002151 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002152 Py_UCS4 maxchar = 0;
2153 Py_ssize_t num_surrogates;
2154
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002155 if (u == NULL && size != 0) {
2156 PyErr_BadInternalCall();
2157 return NULL;
2158 }
2159
2160 if (size == -1) {
2161 size = wcslen(u);
2162 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002164 /* If the Unicode data is known at construction time, we can apply
2165 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002168 if (size == 0)
2169 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 /* Single character Unicode objects in the Latin-1 range are
2172 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002173 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 return get_latin1_char((unsigned char)*u);
2175
2176 /* If not empty and not single character, copy the Unicode data
2177 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002178 if (find_maxchar_surrogates(u, u + size,
2179 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 return NULL;
2181
Victor Stinner8faf8212011-12-08 22:14:11 +01002182 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 if (!unicode)
2184 return NULL;
2185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 switch (PyUnicode_KIND(unicode)) {
2187 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002188 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2190 break;
2191 case PyUnicode_2BYTE_KIND:
2192#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002193 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002195 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2197#endif
2198 break;
2199 case PyUnicode_4BYTE_KIND:
2200#if SIZEOF_WCHAR_T == 2
2201 /* This is the only case which has to process surrogates, thus
2202 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002203 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204#else
2205 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002206 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207#endif
2208 break;
2209 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002210 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002213 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214}
2215
Alexander Belopolsky40018472011-02-26 01:02:56 +00002216PyObject *
2217PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002218{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002219 if (size < 0) {
2220 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002221 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002222 return NULL;
2223 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002224 if (u != NULL)
2225 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2226 else
2227 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002228}
2229
Alexander Belopolsky40018472011-02-26 01:02:56 +00002230PyObject *
2231PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002232{
2233 size_t size = strlen(u);
2234 if (size > PY_SSIZE_T_MAX) {
2235 PyErr_SetString(PyExc_OverflowError, "input too long");
2236 return NULL;
2237 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002238 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002239}
2240
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002241PyObject *
2242_PyUnicode_FromId(_Py_Identifier *id)
2243{
2244 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002245 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2246 strlen(id->string),
2247 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002248 if (!id->object)
2249 return NULL;
2250 PyUnicode_InternInPlace(&id->object);
2251 assert(!id->next);
2252 id->next = static_strings;
2253 static_strings = id;
2254 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002255 return id->object;
2256}
2257
2258void
2259_PyUnicode_ClearStaticStrings()
2260{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002261 _Py_Identifier *tmp, *s = static_strings;
2262 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002263 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002264 tmp = s->next;
2265 s->next = NULL;
2266 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002267 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002268 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002269}
2270
Benjamin Peterson0df54292012-03-26 14:50:32 -04002271/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002272
Victor Stinnerd3f08822012-05-29 12:57:52 +02002273PyObject*
2274_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002275{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002276 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002277 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002278 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002279#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002280 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002281#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002282 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002283 }
Victor Stinner785938e2011-12-11 20:09:03 +01002284 unicode = PyUnicode_New(size, 127);
2285 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002286 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002287 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2288 assert(_PyUnicode_CheckConsistency(unicode, 1));
2289 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002290}
2291
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002292static Py_UCS4
2293kind_maxchar_limit(unsigned int kind)
2294{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002295 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002296 case PyUnicode_1BYTE_KIND:
2297 return 0x80;
2298 case PyUnicode_2BYTE_KIND:
2299 return 0x100;
2300 case PyUnicode_4BYTE_KIND:
2301 return 0x10000;
2302 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002303 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002304 }
2305}
2306
Victor Stinner702c7342011-10-05 13:50:52 +02002307static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002308_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002311 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002312
Serhiy Storchaka678db842013-01-26 12:16:36 +02002313 if (size == 0)
2314 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002315 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002316 if (size == 1)
2317 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002318
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002319 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002320 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002321 if (!res)
2322 return NULL;
2323 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002324 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002325 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002326}
2327
Victor Stinnere57b1c02011-09-28 22:20:48 +02002328static PyObject*
2329_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002330{
2331 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002332 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002333
Serhiy Storchaka678db842013-01-26 12:16:36 +02002334 if (size == 0)
2335 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002336 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002337 if (size == 1)
2338 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002339
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002340 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002341 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 if (!res)
2343 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002344 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002346 else {
2347 _PyUnicode_CONVERT_BYTES(
2348 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2349 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002350 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 return res;
2352}
2353
Victor Stinnere57b1c02011-09-28 22:20:48 +02002354static PyObject*
2355_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356{
2357 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002358 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002359
Serhiy Storchaka678db842013-01-26 12:16:36 +02002360 if (size == 0)
2361 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002362 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002363 if (size == 1)
2364 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002365
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002366 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002367 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002368 if (!res)
2369 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002370 if (max_char < 256)
2371 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2372 PyUnicode_1BYTE_DATA(res));
2373 else if (max_char < 0x10000)
2374 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2375 PyUnicode_2BYTE_DATA(res));
2376 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002377 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002378 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 return res;
2380}
2381
2382PyObject*
2383PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2384{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002385 if (size < 0) {
2386 PyErr_SetString(PyExc_ValueError, "size must be positive");
2387 return NULL;
2388 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002389 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002391 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002393 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002395 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002396 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002397 PyErr_SetString(PyExc_SystemError, "invalid kind");
2398 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400}
2401
Victor Stinnerece58de2012-04-23 23:36:38 +02002402Py_UCS4
2403_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2404{
2405 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002406 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002407
2408 assert(PyUnicode_IS_READY(unicode));
2409 assert(0 <= start);
2410 assert(end <= PyUnicode_GET_LENGTH(unicode));
2411 assert(start <= end);
2412
2413 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2414 return PyUnicode_MAX_CHAR_VALUE(unicode);
2415
2416 if (start == end)
2417 return 127;
2418
Victor Stinner94d558b2012-04-27 22:26:58 +02002419 if (PyUnicode_IS_ASCII(unicode))
2420 return 127;
2421
Victor Stinnerece58de2012-04-23 23:36:38 +02002422 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002423 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002424 endptr = (char *)startptr + end * kind;
2425 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002426 switch(kind) {
2427 case PyUnicode_1BYTE_KIND:
2428 return ucs1lib_find_max_char(startptr, endptr);
2429 case PyUnicode_2BYTE_KIND:
2430 return ucs2lib_find_max_char(startptr, endptr);
2431 case PyUnicode_4BYTE_KIND:
2432 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002433 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002434 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002435 }
2436}
2437
Victor Stinner25a4b292011-10-06 12:31:55 +02002438/* Ensure that a string uses the most efficient storage, if it is not the
2439 case: create a new string with of the right kind. Write NULL into *p_unicode
2440 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002441static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002442unicode_adjust_maxchar(PyObject **p_unicode)
2443{
2444 PyObject *unicode, *copy;
2445 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002446 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002447 unsigned int kind;
2448
2449 assert(p_unicode != NULL);
2450 unicode = *p_unicode;
2451 assert(PyUnicode_IS_READY(unicode));
2452 if (PyUnicode_IS_ASCII(unicode))
2453 return;
2454
2455 len = PyUnicode_GET_LENGTH(unicode);
2456 kind = PyUnicode_KIND(unicode);
2457 if (kind == PyUnicode_1BYTE_KIND) {
2458 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002459 max_char = ucs1lib_find_max_char(u, u + len);
2460 if (max_char >= 128)
2461 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002462 }
2463 else if (kind == PyUnicode_2BYTE_KIND) {
2464 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002465 max_char = ucs2lib_find_max_char(u, u + len);
2466 if (max_char >= 256)
2467 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002468 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002469 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002470 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002471 max_char = ucs4lib_find_max_char(u, u + len);
2472 if (max_char >= 0x10000)
2473 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002474 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002475 else
2476 Py_UNREACHABLE();
2477
Victor Stinner25a4b292011-10-06 12:31:55 +02002478 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002479 if (copy != NULL)
2480 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002481 Py_DECREF(unicode);
2482 *p_unicode = copy;
2483}
2484
Victor Stinner034f6cf2011-09-30 02:26:44 +02002485PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002486_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002487{
Victor Stinner87af4f22011-11-21 23:03:47 +01002488 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002489 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002490
Victor Stinner034f6cf2011-09-30 02:26:44 +02002491 if (!PyUnicode_Check(unicode)) {
2492 PyErr_BadInternalCall();
2493 return NULL;
2494 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002495 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002496 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002497
Victor Stinner87af4f22011-11-21 23:03:47 +01002498 length = PyUnicode_GET_LENGTH(unicode);
2499 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002500 if (!copy)
2501 return NULL;
2502 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2503
Christian Heimesf051e432016-09-13 20:22:02 +02002504 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002505 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002506 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002507 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002508}
2509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002510
Victor Stinnerbc603d12011-10-02 01:00:40 +02002511/* Widen Unicode objects to larger buffers. Don't write terminating null
2512 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002513
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002514static void*
2515unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002517 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002518
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002519 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002520 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002521 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002522 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002523 if (!result)
2524 return PyErr_NoMemory();
2525 assert(skind == PyUnicode_1BYTE_KIND);
2526 _PyUnicode_CONVERT_BYTES(
2527 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002528 (const Py_UCS1 *)data,
2529 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002530 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002532 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002533 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002534 if (!result)
2535 return PyErr_NoMemory();
2536 if (skind == PyUnicode_2BYTE_KIND) {
2537 _PyUnicode_CONVERT_BYTES(
2538 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002539 (const Py_UCS2 *)data,
2540 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002541 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002542 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002543 else {
2544 assert(skind == PyUnicode_1BYTE_KIND);
2545 _PyUnicode_CONVERT_BYTES(
2546 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002547 (const Py_UCS1 *)data,
2548 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002549 result);
2550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002552 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002553 Py_UNREACHABLE();
2554 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002555 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556}
2557
2558static Py_UCS4*
2559as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2560 int copy_null)
2561{
2562 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002563 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002564 Py_ssize_t len, targetlen;
2565 if (PyUnicode_READY(string) == -1)
2566 return NULL;
2567 kind = PyUnicode_KIND(string);
2568 data = PyUnicode_DATA(string);
2569 len = PyUnicode_GET_LENGTH(string);
2570 targetlen = len;
2571 if (copy_null)
2572 targetlen++;
2573 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002574 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 if (!target) {
2576 PyErr_NoMemory();
2577 return NULL;
2578 }
2579 }
2580 else {
2581 if (targetsize < targetlen) {
2582 PyErr_Format(PyExc_SystemError,
2583 "string is longer than the buffer");
2584 if (copy_null && 0 < targetsize)
2585 target[0] = 0;
2586 return NULL;
2587 }
2588 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002589 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002590 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002591 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002593 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002594 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002595 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2596 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002597 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002598 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002599 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002600 else {
2601 Py_UNREACHABLE();
2602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 if (copy_null)
2604 target[len] = 0;
2605 return target;
2606}
2607
2608Py_UCS4*
2609PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2610 int copy_null)
2611{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002612 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 PyErr_BadInternalCall();
2614 return NULL;
2615 }
2616 return as_ucs4(string, target, targetsize, copy_null);
2617}
2618
2619Py_UCS4*
2620PyUnicode_AsUCS4Copy(PyObject *string)
2621{
2622 return as_ucs4(string, NULL, 0, 1);
2623}
2624
Victor Stinner15a11362012-10-06 23:48:20 +02002625/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002626 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2627 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2628#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002629
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002630static int
2631unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2632 Py_ssize_t width, Py_ssize_t precision)
2633{
2634 Py_ssize_t length, fill, arglen;
2635 Py_UCS4 maxchar;
2636
2637 if (PyUnicode_READY(str) == -1)
2638 return -1;
2639
2640 length = PyUnicode_GET_LENGTH(str);
2641 if ((precision == -1 || precision >= length)
2642 && width <= length)
2643 return _PyUnicodeWriter_WriteStr(writer, str);
2644
2645 if (precision != -1)
2646 length = Py_MIN(precision, length);
2647
2648 arglen = Py_MAX(length, width);
2649 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2650 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2651 else
2652 maxchar = writer->maxchar;
2653
2654 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2655 return -1;
2656
2657 if (width > length) {
2658 fill = width - length;
2659 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2660 return -1;
2661 writer->pos += fill;
2662 }
2663
2664 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2665 str, 0, length);
2666 writer->pos += length;
2667 return 0;
2668}
2669
2670static int
Victor Stinner998b8062018-09-12 00:23:25 +02002671unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002672 Py_ssize_t width, Py_ssize_t precision)
2673{
2674 /* UTF-8 */
2675 Py_ssize_t length;
2676 PyObject *unicode;
2677 int res;
2678
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002679 if (precision == -1) {
2680 length = strlen(str);
2681 }
2682 else {
2683 length = 0;
2684 while (length < precision && str[length]) {
2685 length++;
2686 }
2687 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002688 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2689 if (unicode == NULL)
2690 return -1;
2691
2692 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2693 Py_DECREF(unicode);
2694 return res;
2695}
2696
Victor Stinner96865452011-03-01 23:44:09 +00002697static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002698unicode_fromformat_arg(_PyUnicodeWriter *writer,
2699 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002700{
Victor Stinnere215d962012-10-06 23:03:36 +02002701 const char *p;
2702 Py_ssize_t len;
2703 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002704 Py_ssize_t width;
2705 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002706 int longflag;
2707 int longlongflag;
2708 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002709 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002710
2711 p = f;
2712 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002713 zeropad = 0;
2714 if (*f == '0') {
2715 zeropad = 1;
2716 f++;
2717 }
Victor Stinner96865452011-03-01 23:44:09 +00002718
2719 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002720 width = -1;
2721 if (Py_ISDIGIT((unsigned)*f)) {
2722 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002723 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002724 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002725 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002726 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002727 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002728 return NULL;
2729 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002731 f++;
2732 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002733 }
2734 precision = -1;
2735 if (*f == '.') {
2736 f++;
2737 if (Py_ISDIGIT((unsigned)*f)) {
2738 precision = (*f - '0');
2739 f++;
2740 while (Py_ISDIGIT((unsigned)*f)) {
2741 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2742 PyErr_SetString(PyExc_ValueError,
2743 "precision too big");
2744 return NULL;
2745 }
2746 precision = (precision * 10) + (*f - '0');
2747 f++;
2748 }
2749 }
Victor Stinner96865452011-03-01 23:44:09 +00002750 if (*f == '%') {
2751 /* "%.3%s" => f points to "3" */
2752 f--;
2753 }
2754 }
2755 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002756 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002757 f--;
2758 }
Victor Stinner96865452011-03-01 23:44:09 +00002759
2760 /* Handle %ld, %lu, %lld and %llu. */
2761 longflag = 0;
2762 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002763 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002764 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002765 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002766 longflag = 1;
2767 ++f;
2768 }
Victor Stinner96865452011-03-01 23:44:09 +00002769 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002770 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002771 longlongflag = 1;
2772 f += 2;
2773 }
Victor Stinner96865452011-03-01 23:44:09 +00002774 }
2775 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002776 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002777 size_tflag = 1;
2778 ++f;
2779 }
Victor Stinnere215d962012-10-06 23:03:36 +02002780
2781 if (f[1] == '\0')
2782 writer->overallocate = 0;
2783
2784 switch (*f) {
2785 case 'c':
2786 {
2787 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002788 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002789 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002790 "character argument not in range(0x110000)");
2791 return NULL;
2792 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002793 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002794 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002795 break;
2796 }
2797
2798 case 'i':
2799 case 'd':
2800 case 'u':
2801 case 'x':
2802 {
2803 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002804 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002805 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002806
2807 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002808 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002809 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002810 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002811 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002812 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002813 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002814 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002815 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002816 va_arg(*vargs, size_t));
2817 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002818 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002819 va_arg(*vargs, unsigned int));
2820 }
2821 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002822 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002823 }
2824 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002825 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002826 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002827 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002828 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002829 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002830 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002831 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002832 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002833 va_arg(*vargs, Py_ssize_t));
2834 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002835 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002836 va_arg(*vargs, int));
2837 }
2838 assert(len >= 0);
2839
Victor Stinnere215d962012-10-06 23:03:36 +02002840 if (precision < len)
2841 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002842
2843 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002844 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2845 return NULL;
2846
Victor Stinnere215d962012-10-06 23:03:36 +02002847 if (width > precision) {
2848 Py_UCS4 fillchar;
2849 fill = width - precision;
2850 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002851 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2852 return NULL;
2853 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002854 }
Victor Stinner15a11362012-10-06 23:48:20 +02002855 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002856 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002857 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2858 return NULL;
2859 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002860 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002861
Victor Stinner4a587072013-11-19 12:54:53 +01002862 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2863 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002864 break;
2865 }
2866
2867 case 'p':
2868 {
2869 char number[MAX_LONG_LONG_CHARS];
2870
2871 len = sprintf(number, "%p", va_arg(*vargs, void*));
2872 assert(len >= 0);
2873
2874 /* %p is ill-defined: ensure leading 0x. */
2875 if (number[1] == 'X')
2876 number[1] = 'x';
2877 else if (number[1] != 'x') {
2878 memmove(number + 2, number,
2879 strlen(number) + 1);
2880 number[0] = '0';
2881 number[1] = 'x';
2882 len += 2;
2883 }
2884
Victor Stinner4a587072013-11-19 12:54:53 +01002885 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002886 return NULL;
2887 break;
2888 }
2889
2890 case 's':
2891 {
2892 /* UTF-8 */
2893 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002894 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002895 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002896 break;
2897 }
2898
2899 case 'U':
2900 {
2901 PyObject *obj = va_arg(*vargs, PyObject *);
2902 assert(obj && _PyUnicode_CHECK(obj));
2903
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002904 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002905 return NULL;
2906 break;
2907 }
2908
2909 case 'V':
2910 {
2911 PyObject *obj = va_arg(*vargs, PyObject *);
2912 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002913 if (obj) {
2914 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002915 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002916 return NULL;
2917 }
2918 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002919 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002920 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002921 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002922 }
2923 break;
2924 }
2925
2926 case 'S':
2927 {
2928 PyObject *obj = va_arg(*vargs, PyObject *);
2929 PyObject *str;
2930 assert(obj);
2931 str = PyObject_Str(obj);
2932 if (!str)
2933 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002934 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002935 Py_DECREF(str);
2936 return NULL;
2937 }
2938 Py_DECREF(str);
2939 break;
2940 }
2941
2942 case 'R':
2943 {
2944 PyObject *obj = va_arg(*vargs, PyObject *);
2945 PyObject *repr;
2946 assert(obj);
2947 repr = PyObject_Repr(obj);
2948 if (!repr)
2949 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002950 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002951 Py_DECREF(repr);
2952 return NULL;
2953 }
2954 Py_DECREF(repr);
2955 break;
2956 }
2957
2958 case 'A':
2959 {
2960 PyObject *obj = va_arg(*vargs, PyObject *);
2961 PyObject *ascii;
2962 assert(obj);
2963 ascii = PyObject_ASCII(obj);
2964 if (!ascii)
2965 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002966 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002967 Py_DECREF(ascii);
2968 return NULL;
2969 }
2970 Py_DECREF(ascii);
2971 break;
2972 }
2973
2974 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002975 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002976 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002977 break;
2978
2979 default:
2980 /* if we stumble upon an unknown formatting code, copy the rest
2981 of the format string to the output string. (we cannot just
2982 skip the code, since there's no way to know what's in the
2983 argument list) */
2984 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002985 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002986 return NULL;
2987 f = p+len;
2988 return f;
2989 }
2990
2991 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002992 return f;
2993}
2994
Walter Dörwaldd2034312007-05-18 16:29:38 +00002995PyObject *
2996PyUnicode_FromFormatV(const char *format, va_list vargs)
2997{
Victor Stinnere215d962012-10-06 23:03:36 +02002998 va_list vargs2;
2999 const char *f;
3000 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003001
Victor Stinner8f674cc2013-04-17 23:02:17 +02003002 _PyUnicodeWriter_Init(&writer);
3003 writer.min_length = strlen(format) + 100;
3004 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003005
Benjamin Peterson0c212142016-09-20 20:39:33 -07003006 // Copy varags to be able to pass a reference to a subfunction.
3007 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003008
3009 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003010 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003011 f = unicode_fromformat_arg(&writer, f, &vargs2);
3012 if (f == NULL)
3013 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003015 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003016 const char *p;
3017 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003018
Victor Stinnere215d962012-10-06 23:03:36 +02003019 p = f;
3020 do
3021 {
3022 if ((unsigned char)*p > 127) {
3023 PyErr_Format(PyExc_ValueError,
3024 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3025 "string, got a non-ASCII byte: 0x%02x",
3026 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003027 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003028 }
3029 p++;
3030 }
3031 while (*p != '\0' && *p != '%');
3032 len = p - f;
3033
3034 if (*p == '\0')
3035 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003036
3037 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003038 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003039
3040 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003041 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003042 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003043 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003044 return _PyUnicodeWriter_Finish(&writer);
3045
3046 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003047 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003048 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003049 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003050}
3051
Walter Dörwaldd2034312007-05-18 16:29:38 +00003052PyObject *
3053PyUnicode_FromFormat(const char *format, ...)
3054{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003055 PyObject* ret;
3056 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003057
3058#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003059 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003060#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003061 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003062#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003063 ret = PyUnicode_FromFormatV(format, vargs);
3064 va_end(vargs);
3065 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003066}
3067
Serhiy Storchakac46db922018-10-23 22:58:24 +03003068static Py_ssize_t
3069unicode_get_widechar_size(PyObject *unicode)
3070{
3071 Py_ssize_t res;
3072
3073 assert(unicode != NULL);
3074 assert(_PyUnicode_CHECK(unicode));
3075
3076 if (_PyUnicode_WSTR(unicode) != NULL) {
3077 return PyUnicode_WSTR_LENGTH(unicode);
3078 }
3079 assert(PyUnicode_IS_READY(unicode));
3080
3081 res = _PyUnicode_LENGTH(unicode);
3082#if SIZEOF_WCHAR_T == 2
3083 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3084 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3085 const Py_UCS4 *end = s + res;
3086 for (; s < end; ++s) {
3087 if (*s > 0xFFFF) {
3088 ++res;
3089 }
3090 }
3091 }
3092#endif
3093 return res;
3094}
3095
3096static void
3097unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3098{
3099 const wchar_t *wstr;
3100
3101 assert(unicode != NULL);
3102 assert(_PyUnicode_CHECK(unicode));
3103
3104 wstr = _PyUnicode_WSTR(unicode);
3105 if (wstr != NULL) {
3106 memcpy(w, wstr, size * sizeof(wchar_t));
3107 return;
3108 }
3109 assert(PyUnicode_IS_READY(unicode));
3110
3111 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3112 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3113 for (; size--; ++s, ++w) {
3114 *w = *s;
3115 }
3116 }
3117 else {
3118#if SIZEOF_WCHAR_T == 4
3119 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3120 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3121 for (; size--; ++s, ++w) {
3122 *w = *s;
3123 }
3124#else
3125 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3126 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3127 for (; size--; ++s, ++w) {
3128 Py_UCS4 ch = *s;
3129 if (ch > 0xFFFF) {
3130 assert(ch <= MAX_UNICODE);
3131 /* encode surrogate pair in this case */
3132 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3133 if (!size--)
3134 break;
3135 *w = Py_UNICODE_LOW_SURROGATE(ch);
3136 }
3137 else {
3138 *w = ch;
3139 }
3140 }
3141#endif
3142 }
3143}
3144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003145#ifdef HAVE_WCHAR_H
3146
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003147/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003148
Victor Stinnerd88d9832011-09-06 02:00:05 +02003149 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003150 character) required to convert the unicode object. Ignore size argument.
3151
Victor Stinnerd88d9832011-09-06 02:00:05 +02003152 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003153 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003154 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003155Py_ssize_t
3156PyUnicode_AsWideChar(PyObject *unicode,
3157 wchar_t *w,
3158 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003159{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003160 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003161
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003162 if (unicode == NULL) {
3163 PyErr_BadInternalCall();
3164 return -1;
3165 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003166 if (!PyUnicode_Check(unicode)) {
3167 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003168 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003169 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003170
3171 res = unicode_get_widechar_size(unicode);
3172 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003173 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003174 }
3175
3176 if (size > res) {
3177 size = res + 1;
3178 }
3179 else {
3180 res = size;
3181 }
3182 unicode_copy_as_widechar(unicode, w, size);
3183 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003184}
3185
Victor Stinner137c34c2010-09-29 10:25:54 +00003186wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003187PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003188 Py_ssize_t *size)
3189{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003190 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003191 Py_ssize_t buflen;
3192
3193 if (unicode == NULL) {
3194 PyErr_BadInternalCall();
3195 return NULL;
3196 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003197 if (!PyUnicode_Check(unicode)) {
3198 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003199 return NULL;
3200 }
3201
Serhiy Storchakac46db922018-10-23 22:58:24 +03003202 buflen = unicode_get_widechar_size(unicode);
3203 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003204 if (buffer == NULL) {
3205 PyErr_NoMemory();
3206 return NULL;
3207 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003208 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3209 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003210 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003211 }
3212 else if (wcslen(buffer) != (size_t)buflen) {
3213 PyMem_FREE(buffer);
3214 PyErr_SetString(PyExc_ValueError,
3215 "embedded null character");
3216 return NULL;
3217 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003218 return buffer;
3219}
3220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003221#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003222
Alexander Belopolsky40018472011-02-26 01:02:56 +00003223PyObject *
3224PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003225{
Victor Stinner8faf8212011-12-08 22:14:11 +01003226 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 PyErr_SetString(PyExc_ValueError,
3228 "chr() arg not in range(0x110000)");
3229 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003230 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003231
Victor Stinner985a82a2014-01-03 12:53:47 +01003232 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003233}
3234
Alexander Belopolsky40018472011-02-26 01:02:56 +00003235PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003236PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003238 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003240 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003241 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003242 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 Py_INCREF(obj);
3244 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003245 }
3246 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 /* For a Unicode subtype that's not a Unicode object,
3248 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003249 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003250 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003251 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003252 "Can't convert '%.100s' object to str implicitly",
3253 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003254 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003255}
3256
Alexander Belopolsky40018472011-02-26 01:02:56 +00003257PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003258PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003259 const char *encoding,
3260 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003261{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003262 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003263 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003264
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003266 PyErr_BadInternalCall();
3267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003269
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003270 /* Decoding bytes objects is the most common case and should be fast */
3271 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003272 if (PyBytes_GET_SIZE(obj) == 0) {
3273 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3274 return NULL;
3275 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003276 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003277 }
3278 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003279 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3280 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003281 }
3282
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003283 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003284 PyErr_SetString(PyExc_TypeError,
3285 "decoding str is not supported");
3286 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003287 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003288
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003289 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3290 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3291 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003292 "decoding to str: need a bytes-like object, %.80s found",
3293 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003294 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003295 }
Tim Petersced69f82003-09-16 20:30:58 +00003296
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003297 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003298 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003299 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3300 return NULL;
3301 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003302 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003304
Serhiy Storchaka05997252013-01-26 12:14:02 +02003305 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003306 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003307 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308}
3309
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every other run
   of characters is collapsed into a single '_', except at the very
   beginning and at the end of the name where it is dropped.

   `lower` receives the null-terminated result and `lower_len` is its size
   in bytes.  Return 1 on success, or 0 on error (the normalized name is
   longer than lower_len-1, or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* Not even room for the terminating null byte; computing
           &lower[lower_len - 1] would also wrap around. */
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last usable position: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit the pending separator, unless the output is still
               empty (leading punctuation is dropped). */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3358
Alexander Belopolsky40018472011-02-26 01:02:56 +00003359PyObject *
3360PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003361 Py_ssize_t size,
3362 const char *encoding,
3363 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003364{
3365 PyObject *buffer = NULL, *unicode;
3366 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003367 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3368
Victor Stinner22eb6892019-06-26 00:51:05 +02003369 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3370 return NULL;
3371 }
3372
Victor Stinnered076ed2019-06-26 01:49:32 +02003373 if (size == 0) {
3374 _Py_RETURN_UNICODE_EMPTY();
3375 }
3376
Victor Stinner942889a2016-09-05 15:40:10 -07003377 if (encoding == NULL) {
3378 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3379 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003380
Fred Drakee4315f52000-05-09 19:53:39 +00003381 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003382 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3383 char *lower = buflower;
3384
3385 /* Fast paths */
3386 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3387 lower += 3;
3388 if (*lower == '_') {
3389 /* Match "utf8" and "utf_8" */
3390 lower++;
3391 }
3392
3393 if (lower[0] == '8' && lower[1] == 0) {
3394 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3395 }
3396 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3397 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3398 }
3399 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3400 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3401 }
3402 }
3403 else {
3404 if (strcmp(lower, "ascii") == 0
3405 || strcmp(lower, "us_ascii") == 0) {
3406 return PyUnicode_DecodeASCII(s, size, errors);
3407 }
Steve Dowercc16be82016-09-08 10:35:16 -07003408 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003409 else if (strcmp(lower, "mbcs") == 0) {
3410 return PyUnicode_DecodeMBCS(s, size, errors);
3411 }
3412 #endif
3413 else if (strcmp(lower, "latin1") == 0
3414 || strcmp(lower, "latin_1") == 0
3415 || strcmp(lower, "iso_8859_1") == 0
3416 || strcmp(lower, "iso8859_1") == 0) {
3417 return PyUnicode_DecodeLatin1(s, size, errors);
3418 }
3419 }
Victor Stinner37296e82010-06-10 13:36:23 +00003420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421
3422 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003423 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003424 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003425 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003426 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003427 if (buffer == NULL)
3428 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003429 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430 if (unicode == NULL)
3431 goto onError;
3432 if (!PyUnicode_Check(unicode)) {
3433 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003434 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003435 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003436 encoding,
3437 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438 Py_DECREF(unicode);
3439 goto onError;
3440 }
3441 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003442 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003443
Benjamin Peterson29060642009-01-31 22:14:21 +00003444 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 Py_XDECREF(buffer);
3446 return NULL;
3447}
3448
Alexander Belopolsky40018472011-02-26 01:02:56 +00003449PyObject *
3450PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003451 const char *encoding,
3452 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003453{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003454 if (!PyUnicode_Check(unicode)) {
3455 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003456 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003457 }
3458
Serhiy Storchaka00939072016-10-27 21:05:49 +03003459 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3460 "PyUnicode_AsDecodedObject() is deprecated; "
3461 "use PyCodec_Decode() to decode from str", 1) < 0)
3462 return NULL;
3463
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003464 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003465 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003466
3467 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003468 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003469}
3470
Alexander Belopolsky40018472011-02-26 01:02:56 +00003471PyObject *
3472PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003473 const char *encoding,
3474 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003475{
3476 PyObject *v;
3477
3478 if (!PyUnicode_Check(unicode)) {
3479 PyErr_BadArgument();
3480 goto onError;
3481 }
3482
Serhiy Storchaka00939072016-10-27 21:05:49 +03003483 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3484 "PyUnicode_AsDecodedUnicode() is deprecated; "
3485 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3486 return NULL;
3487
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003488 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003489 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003490
3491 /* Decode via the codec registry */
3492 v = PyCodec_Decode(unicode, encoding, errors);
3493 if (v == NULL)
3494 goto onError;
3495 if (!PyUnicode_Check(v)) {
3496 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003497 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003498 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003499 encoding,
3500 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003501 Py_DECREF(v);
3502 goto onError;
3503 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003504 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003505
Benjamin Peterson29060642009-01-31 22:14:21 +00003506 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003507 return NULL;
3508}
3509
Alexander Belopolsky40018472011-02-26 01:02:56 +00003510PyObject *
3511PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003512 Py_ssize_t size,
3513 const char *encoding,
3514 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515{
3516 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003517
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003518 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3522 Py_DECREF(unicode);
3523 return v;
3524}
3525
Alexander Belopolsky40018472011-02-26 01:02:56 +00003526PyObject *
3527PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003528 const char *encoding,
3529 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003530{
3531 PyObject *v;
3532
3533 if (!PyUnicode_Check(unicode)) {
3534 PyErr_BadArgument();
3535 goto onError;
3536 }
3537
Serhiy Storchaka00939072016-10-27 21:05:49 +03003538 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3539 "PyUnicode_AsEncodedObject() is deprecated; "
3540 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3541 "or PyCodec_Encode() for generic encoding", 1) < 0)
3542 return NULL;
3543
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003544 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003545 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003546
3547 /* Encode via the codec registry */
3548 v = PyCodec_Encode(unicode, encoding, errors);
3549 if (v == NULL)
3550 goto onError;
3551 return v;
3552
Benjamin Peterson29060642009-01-31 22:14:21 +00003553 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003554 return NULL;
3555}
3556
Victor Stinner1b579672011-12-17 05:47:23 +01003557
Victor Stinner2cba6b82018-01-10 22:46:15 +01003558static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003559unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003560 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003561{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003562 Py_ssize_t wlen;
3563 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3564 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003565 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003566 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003567
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003568 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003569 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003570 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003571 return NULL;
3572 }
3573
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003574 char *str;
3575 size_t error_pos;
3576 const char *reason;
3577 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003578 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003579 PyMem_Free(wstr);
3580
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003581 if (res != 0) {
3582 if (res == -2) {
3583 PyObject *exc;
3584 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3585 "locale", unicode,
3586 (Py_ssize_t)error_pos,
3587 (Py_ssize_t)(error_pos+1),
3588 reason);
3589 if (exc != NULL) {
3590 PyCodec_StrictErrors(exc);
3591 Py_DECREF(exc);
3592 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003593 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003594 else if (res == -3) {
3595 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3596 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003597 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003598 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003599 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003600 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003601 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003602
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003603 PyObject *bytes = PyBytes_FromString(str);
3604 PyMem_RawFree(str);
3605 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003606}
3607
Victor Stinnerad158722010-10-27 00:25:46 +00003608PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003609PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3610{
Victor Stinner709d23d2019-05-02 14:56:30 -04003611 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3612 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003613}
3614
3615PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003616PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003617{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003618 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003619 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003620 return unicode_encode_utf8(unicode,
3621 interp->fs_codec.error_handler,
3622 interp->fs_codec.errors);
3623 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003624#ifndef _Py_FORCE_UTF8_FS_ENCODING
3625 else if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003626 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003627 interp->fs_codec.encoding,
3628 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003629 }
Victor Stinnerad158722010-10-27 00:25:46 +00003630#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003631 else {
3632 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3633 machinery is not ready and so cannot be used:
3634 use wcstombs() in this case. */
3635 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3636 assert(filesystem_errors != NULL);
3637 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3638 assert(errors != _Py_ERROR_UNKNOWN);
3639#ifdef _Py_FORCE_UTF8_FS_ENCODING
3640 return unicode_encode_utf8(unicode, errors, NULL);
3641#else
3642 return unicode_encode_locale(unicode, errors, 0);
3643#endif
3644 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003645}
3646
Alexander Belopolsky40018472011-02-26 01:02:56 +00003647PyObject *
3648PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003649 const char *encoding,
3650 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651{
3652 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003653 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003654
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 if (!PyUnicode_Check(unicode)) {
3656 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 }
Fred Drakee4315f52000-05-09 19:53:39 +00003659
Victor Stinner22eb6892019-06-26 00:51:05 +02003660 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3661 return NULL;
3662 }
3663
Victor Stinner942889a2016-09-05 15:40:10 -07003664 if (encoding == NULL) {
3665 return _PyUnicode_AsUTF8String(unicode, errors);
3666 }
3667
Fred Drakee4315f52000-05-09 19:53:39 +00003668 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003669 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3670 char *lower = buflower;
3671
3672 /* Fast paths */
3673 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3674 lower += 3;
3675 if (*lower == '_') {
3676 /* Match "utf8" and "utf_8" */
3677 lower++;
3678 }
3679
3680 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003681 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003682 }
3683 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3684 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3685 }
3686 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3687 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3688 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003689 }
Victor Stinner942889a2016-09-05 15:40:10 -07003690 else {
3691 if (strcmp(lower, "ascii") == 0
3692 || strcmp(lower, "us_ascii") == 0) {
3693 return _PyUnicode_AsASCIIString(unicode, errors);
3694 }
Steve Dowercc16be82016-09-08 10:35:16 -07003695#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003696 else if (strcmp(lower, "mbcs") == 0) {
3697 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3698 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003699#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003700 else if (strcmp(lower, "latin1") == 0 ||
3701 strcmp(lower, "latin_1") == 0 ||
3702 strcmp(lower, "iso_8859_1") == 0 ||
3703 strcmp(lower, "iso8859_1") == 0) {
3704 return _PyUnicode_AsLatin1String(unicode, errors);
3705 }
3706 }
Victor Stinner37296e82010-06-10 13:36:23 +00003707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708
3709 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003710 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003712 return NULL;
3713
3714 /* The normal path */
3715 if (PyBytes_Check(v))
3716 return v;
3717
3718 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003719 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003720 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003721 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003722
3723 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003724 "encoder %s returned bytearray instead of bytes; "
3725 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003726 encoding);
3727 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003728 Py_DECREF(v);
3729 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003730 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003731
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003732 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3733 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003734 Py_DECREF(v);
3735 return b;
3736 }
3737
3738 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003739 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003740 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003741 encoding,
3742 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003743 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003744 return NULL;
3745}
3746
Alexander Belopolsky40018472011-02-26 01:02:56 +00003747PyObject *
3748PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003749 const char *encoding,
3750 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003751{
3752 PyObject *v;
3753
3754 if (!PyUnicode_Check(unicode)) {
3755 PyErr_BadArgument();
3756 goto onError;
3757 }
3758
Serhiy Storchaka00939072016-10-27 21:05:49 +03003759 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3760 "PyUnicode_AsEncodedUnicode() is deprecated; "
3761 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3762 return NULL;
3763
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003764 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003765 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003766
3767 /* Encode via the codec registry */
3768 v = PyCodec_Encode(unicode, encoding, errors);
3769 if (v == NULL)
3770 goto onError;
3771 if (!PyUnicode_Check(v)) {
3772 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003773 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003774 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003775 encoding,
3776 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003777 Py_DECREF(v);
3778 goto onError;
3779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003781
Benjamin Peterson29060642009-01-31 22:14:21 +00003782 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 return NULL;
3784}
3785
Victor Stinner2cba6b82018-01-10 22:46:15 +01003786static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003787unicode_decode_locale(const char *str, Py_ssize_t len,
3788 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003789{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003790 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3791 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003792 return NULL;
3793 }
3794
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003795 wchar_t *wstr;
3796 size_t wlen;
3797 const char *reason;
3798 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003799 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003800 if (res != 0) {
3801 if (res == -2) {
3802 PyObject *exc;
3803 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3804 "locale", str, len,
3805 (Py_ssize_t)wlen,
3806 (Py_ssize_t)(wlen + 1),
3807 reason);
3808 if (exc != NULL) {
3809 PyCodec_StrictErrors(exc);
3810 Py_DECREF(exc);
3811 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003812 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003813 else if (res == -3) {
3814 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3815 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003816 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003817 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003818 }
Victor Stinner2f197072011-12-17 07:08:30 +01003819 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003820 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003821
3822 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3823 PyMem_RawFree(wstr);
3824 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003825}
3826
3827PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003828PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3829 const char *errors)
3830{
Victor Stinner709d23d2019-05-02 14:56:30 -04003831 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3832 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003833}
3834
3835PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003836PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003837{
3838 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003839 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3840 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003841}
3842
3843
3844PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003845PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003846 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003847 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3848}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003849
Christian Heimes5894ba72007-11-04 11:43:14 +00003850PyObject*
3851PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3852{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003853 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003854 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003855 return unicode_decode_utf8(s, size,
3856 interp->fs_codec.error_handler,
3857 interp->fs_codec.errors,
3858 NULL);
3859 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003860#ifndef _Py_FORCE_UTF8_FS_ENCODING
3861 else if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003862 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003863 interp->fs_codec.encoding,
3864 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003865 }
Victor Stinnerad158722010-10-27 00:25:46 +00003866#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003867 else {
3868 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3869 machinery is not ready and so cannot be used:
3870 use mbstowcs() in this case. */
3871 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3872 assert(filesystem_errors != NULL);
3873 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3874 assert(errors != _Py_ERROR_UNKNOWN);
3875#ifdef _Py_FORCE_UTF8_FS_ENCODING
3876 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3877#else
3878 return unicode_decode_locale(s, size, errors, 0);
3879#endif
3880 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003881}
3882
Martin v. Löwis011e8422009-05-05 04:43:17 +00003883
3884int
3885PyUnicode_FSConverter(PyObject* arg, void* addr)
3886{
Brett Cannonec6ce872016-09-06 15:50:29 -07003887 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003888 PyObject *output = NULL;
3889 Py_ssize_t size;
3890 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003891 if (arg == NULL) {
3892 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003893 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003894 return 1;
3895 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003896 path = PyOS_FSPath(arg);
3897 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003898 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003899 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003900 if (PyBytes_Check(path)) {
3901 output = path;
3902 }
3903 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3904 output = PyUnicode_EncodeFSDefault(path);
3905 Py_DECREF(path);
3906 if (!output) {
3907 return 0;
3908 }
3909 assert(PyBytes_Check(output));
3910 }
3911
Victor Stinner0ea2a462010-04-30 00:22:08 +00003912 size = PyBytes_GET_SIZE(output);
3913 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003914 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003915 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003916 Py_DECREF(output);
3917 return 0;
3918 }
3919 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003920 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003921}
3922
3923
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003924int
3925PyUnicode_FSDecoder(PyObject* arg, void* addr)
3926{
Brett Cannona5711202016-09-06 19:36:01 -07003927 int is_buffer = 0;
3928 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003929 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003930 if (arg == NULL) {
3931 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003932 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003933 return 1;
3934 }
Brett Cannona5711202016-09-06 19:36:01 -07003935
3936 is_buffer = PyObject_CheckBuffer(arg);
3937 if (!is_buffer) {
3938 path = PyOS_FSPath(arg);
3939 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003940 return 0;
3941 }
Brett Cannona5711202016-09-06 19:36:01 -07003942 }
3943 else {
3944 path = arg;
3945 Py_INCREF(arg);
3946 }
3947
3948 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003949 output = path;
3950 }
3951 else if (PyBytes_Check(path) || is_buffer) {
3952 PyObject *path_bytes = NULL;
3953
3954 if (!PyBytes_Check(path) &&
3955 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003956 "path should be string, bytes, or os.PathLike, not %.200s",
3957 Py_TYPE(arg)->tp_name)) {
3958 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003959 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003960 }
3961 path_bytes = PyBytes_FromObject(path);
3962 Py_DECREF(path);
3963 if (!path_bytes) {
3964 return 0;
3965 }
3966 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3967 PyBytes_GET_SIZE(path_bytes));
3968 Py_DECREF(path_bytes);
3969 if (!output) {
3970 return 0;
3971 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003972 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003973 else {
3974 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003975 "path should be string, bytes, or os.PathLike, not %.200s",
3976 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003977 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003978 return 0;
3979 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003980 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003981 Py_DECREF(output);
3982 return 0;
3983 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003984 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003985 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003986 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003987 Py_DECREF(output);
3988 return 0;
3989 }
3990 *(PyObject**)addr = output;
3991 return Py_CLEANUP_SUPPORTED;
3992}
3993
3994
Inada Naoki02a4d572020-02-27 13:48:59 +09003995static int unicode_fill_utf8(PyObject *unicode);
3996
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003997const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003999{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004000 if (!PyUnicode_Check(unicode)) {
4001 PyErr_BadArgument();
4002 return NULL;
4003 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004004 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004005 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004007 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004008 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009 return NULL;
4010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 }
4012
4013 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004014 *psize = PyUnicode_UTF8_LENGTH(unicode);
4015 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004016}
4017
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004018const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4022}
4023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024Py_UNICODE *
4025PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4026{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027 if (!PyUnicode_Check(unicode)) {
4028 PyErr_BadArgument();
4029 return NULL;
4030 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004031 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4032 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004034 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004035 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036
Serhiy Storchakac46db922018-10-23 22:58:24 +03004037 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4038 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4039 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004042 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4043 if (w == NULL) {
4044 PyErr_NoMemory();
4045 return NULL;
4046 }
4047 unicode_copy_as_widechar(unicode, w, wlen + 1);
4048 _PyUnicode_WSTR(unicode) = w;
4049 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4050 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 }
4052 }
4053 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004054 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004055 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004056}
4057
Alexander Belopolsky40018472011-02-26 01:02:56 +00004058Py_UNICODE *
4059PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062}
4063
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004064const Py_UNICODE *
4065_PyUnicode_AsUnicode(PyObject *unicode)
4066{
4067 Py_ssize_t size;
4068 const Py_UNICODE *wstr;
4069
4070 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4071 if (wstr && wcslen(wstr) != (size_t)size) {
4072 PyErr_SetString(PyExc_ValueError, "embedded null character");
4073 return NULL;
4074 }
4075 return wstr;
4076}
4077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004078
Alexander Belopolsky40018472011-02-26 01:02:56 +00004079Py_ssize_t
4080PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081{
4082 if (!PyUnicode_Check(unicode)) {
4083 PyErr_BadArgument();
4084 goto onError;
4085 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004086 if (_PyUnicode_WSTR(unicode) == NULL) {
4087 if (PyUnicode_AsUnicode(unicode) == NULL)
4088 goto onError;
4089 }
4090 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091
Benjamin Peterson29060642009-01-31 22:14:21 +00004092 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093 return -1;
4094}
4095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004096Py_ssize_t
4097PyUnicode_GetLength(PyObject *unicode)
4098{
Victor Stinner07621332012-06-16 04:53:46 +02004099 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004100 PyErr_BadArgument();
4101 return -1;
4102 }
Victor Stinner07621332012-06-16 04:53:46 +02004103 if (PyUnicode_READY(unicode) == -1)
4104 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105 return PyUnicode_GET_LENGTH(unicode);
4106}
4107
4108Py_UCS4
4109PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4110{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004111 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004112 int kind;
4113
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004114 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004115 PyErr_BadArgument();
4116 return (Py_UCS4)-1;
4117 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004118 if (PyUnicode_READY(unicode) == -1) {
4119 return (Py_UCS4)-1;
4120 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004121 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004122 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004123 return (Py_UCS4)-1;
4124 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004125 data = PyUnicode_DATA(unicode);
4126 kind = PyUnicode_KIND(unicode);
4127 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004128}
4129
4130int
4131PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4132{
4133 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004134 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004135 return -1;
4136 }
Victor Stinner488fa492011-12-12 00:01:39 +01004137 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004138 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004139 PyErr_SetString(PyExc_IndexError, "string index out of range");
4140 return -1;
4141 }
Victor Stinner488fa492011-12-12 00:01:39 +01004142 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004143 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004144 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4145 PyErr_SetString(PyExc_ValueError, "character out of range");
4146 return -1;
4147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004148 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4149 index, ch);
4150 return 0;
4151}
4152
/* Return the name of the default encoding; this is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4158
Victor Stinner554f3f02010-06-16 23:33:54 +00004159/* create or adjust a UnicodeDecodeError */
4160static void
4161make_decode_exception(PyObject **exceptionObject,
4162 const char *encoding,
4163 const char *input, Py_ssize_t length,
4164 Py_ssize_t startpos, Py_ssize_t endpos,
4165 const char *reason)
4166{
4167 if (*exceptionObject == NULL) {
4168 *exceptionObject = PyUnicodeDecodeError_Create(
4169 encoding, input, length, startpos, endpos, reason);
4170 }
4171 else {
4172 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4173 goto onError;
4174 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4175 goto onError;
4176 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4177 goto onError;
4178 }
4179 return;
4180
4181onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004182 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004183}
4184
Steve Dowercc16be82016-09-08 10:35:16 -07004185#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004186static int
4187widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4188{
4189 if (newsize > *size) {
4190 wchar_t *newbuf = *buf;
4191 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4192 PyErr_NoMemory();
4193 return -1;
4194 }
4195 *buf = newbuf;
4196 }
4197 *size = newsize;
4198 return 0;
4199}
4200
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201/* error handling callback helper:
4202 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004203 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 and adjust various state variables.
4205 return 0 on success, -1 on error
4206*/
4207
Alexander Belopolsky40018472011-02-26 01:02:56 +00004208static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004209unicode_decode_call_errorhandler_wchar(
4210 const char *errors, PyObject **errorHandler,
4211 const char *encoding, const char *reason,
4212 const char **input, const char **inend, Py_ssize_t *startinpos,
4213 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004214 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004216 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217
4218 PyObject *restuple = NULL;
4219 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004220 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004221 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004222 Py_ssize_t requiredsize;
4223 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004224 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004225 wchar_t *repwstr;
4226 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004227
4228 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 *errorHandler = PyCodec_LookupError(errors);
4230 if (*errorHandler == NULL)
4231 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 }
4233
Victor Stinner554f3f02010-06-16 23:33:54 +00004234 make_decode_exception(exceptionObject,
4235 encoding,
4236 *input, *inend - *input,
4237 *startinpos, *endinpos,
4238 reason);
4239 if (*exceptionObject == NULL)
4240 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241
Petr Viktorinffd97532020-02-11 17:46:57 +01004242 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004246 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004249 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004251
4252 /* Copy back the bytes variables, which might have been modified by the
4253 callback */
4254 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4255 if (!inputobj)
4256 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004257 *input = PyBytes_AS_STRING(inputobj);
4258 insize = PyBytes_GET_SIZE(inputobj);
4259 *inend = *input + insize;
4260 /* we can DECREF safely, as the exception has another reference,
4261 so the object won't go away. */
4262 Py_DECREF(inputobj);
4263
4264 if (newpos<0)
4265 newpos = insize+newpos;
4266 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004267 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004268 goto onError;
4269 }
4270
4271 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4272 if (repwstr == NULL)
4273 goto onError;
4274 /* need more space? (at least enough for what we
4275 have+the replacement+the rest of the string (starting
4276 at the new input position), so we won't have to check space
4277 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004278 requiredsize = *outpos;
4279 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4280 goto overflow;
4281 requiredsize += repwlen;
4282 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4283 goto overflow;
4284 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004285 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004286 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004287 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004288 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004289 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004290 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004291 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004293 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004294 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004295 *endinpos = newpos;
4296 *inptr = *input + newpos;
4297
4298 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004299 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004300 return 0;
4301
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004302 overflow:
4303 PyErr_SetString(PyExc_OverflowError,
4304 "decoded result is too long for a Python string");
4305
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004306 onError:
4307 Py_XDECREF(restuple);
4308 return -1;
4309}
Steve Dowercc16be82016-09-08 10:35:16 -07004310#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004311
4312static int
4313unicode_decode_call_errorhandler_writer(
4314 const char *errors, PyObject **errorHandler,
4315 const char *encoding, const char *reason,
4316 const char **input, const char **inend, Py_ssize_t *startinpos,
4317 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4318 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4319{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004320 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004321
4322 PyObject *restuple = NULL;
4323 PyObject *repunicode = NULL;
4324 Py_ssize_t insize;
4325 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004326 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004327 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004329 int need_to_grow = 0;
4330 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004331
4332 if (*errorHandler == NULL) {
4333 *errorHandler = PyCodec_LookupError(errors);
4334 if (*errorHandler == NULL)
4335 goto onError;
4336 }
4337
4338 make_decode_exception(exceptionObject,
4339 encoding,
4340 *input, *inend - *input,
4341 *startinpos, *endinpos,
4342 reason);
4343 if (*exceptionObject == NULL)
4344 goto onError;
4345
Petr Viktorinffd97532020-02-11 17:46:57 +01004346 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004347 if (restuple == NULL)
4348 goto onError;
4349 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004350 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004351 goto onError;
4352 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004353 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004354 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004355
4356 /* Copy back the bytes variables, which might have been modified by the
4357 callback */
4358 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4359 if (!inputobj)
4360 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004361 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004362 *input = PyBytes_AS_STRING(inputobj);
4363 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004364 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004365 /* we can DECREF safely, as the exception has another reference,
4366 so the object won't go away. */
4367 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004369 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004370 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004371 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004372 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004373 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004374 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375
Victor Stinner170ca6f2013-04-18 00:25:28 +02004376 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004377 if (replen > 1) {
4378 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004379 need_to_grow = 1;
4380 }
4381 new_inptr = *input + newpos;
4382 if (*inend - new_inptr > remain) {
4383 /* We don't know the decoding algorithm here so we make the worst
4384 assumption that one byte decodes to one unicode character.
4385 If unfortunately one byte could decode to more unicode characters,
4386 the decoder may write out-of-bound then. Is it possible for the
4387 algorithms using this function? */
4388 writer->min_length += *inend - new_inptr - remain;
4389 need_to_grow = 1;
4390 }
4391 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004392 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004393 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004394 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4395 goto onError;
4396 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004397 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004398 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004399
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004401 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004402
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004404 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004405 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004409 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004410}
4411
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412/* --- UTF-7 Codec -------------------------------------------------------- */
4413
Antoine Pitrou244651a2009-05-04 18:56:13 +00004414/* See RFC2152 for details. We encode conservatively and decode liberally. */
4415
4416/* Three simple macros defining base-64. */
4417
/* True if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  The argument is evaluated several times. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))
4425
/* Map a base-64 character c to its 6-bit value (A-Z -> 0..25,
   a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, anything else -> 63).
   Only meaningful when c is a base-64 character; the argument is
   evaluated several times. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)
4433
/* Return the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789"                       \
     "+/"[(n) & 0x3f])
4438
/* DECODE_DIRECT: true if a byte found in a UTF-7 string outside a
 * base-64 section decodes as itself.  Decoding is permissive: every
 * value up to 127 qualifies except '+', which opens a base-64
 * section. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
4446
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only
 * lookup data, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4480
/* ENCODE_DIRECT: true if character c should be encoded as itself.
 * Set D characters always are; Set O characters only when directO is
 * true, and whitespace only when directWS is true.  RFC2152 makes it
 * clear that these choices vary between applications, so both are
 * left to the caller.  NUL and characters >= 128 are never direct. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((utf7_category[(c)] == 1) && directO) ||         \
      ((utf7_category[(c)] == 2) && directWS)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492
Alexander Belopolsky40018472011-02-26 01:02:56 +00004493PyObject *
4494PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004495 Py_ssize_t size,
4496 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004497{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004498 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4499}
4500
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501/* The decoder. The only state we preserve is our read position,
4502 * i.e. how many characters we have consumed. So if we end in the
4503 * middle of a shift sequence we have to back off the read position
4504 * and the output to the beginning of the sequence, otherwise we lose
4505 * all the shift state (seen bits, number of bits seen, high
4506 * surrogate). */
4507
Alexander Belopolsky40018472011-02-26 01:02:56 +00004508PyObject *
4509PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004510 Py_ssize_t size,
4511 const char *errors,
4512 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004513{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004515 Py_ssize_t startinpos;
4516 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004517 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004518 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519 const char *errmsg = "";
4520 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004521 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004522 unsigned int base64bits = 0;
4523 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004524 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 PyObject *errorHandler = NULL;
4526 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004527
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004528 if (size == 0) {
4529 if (consumed)
4530 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004531 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004532 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004533
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004534 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004535 _PyUnicodeWriter_Init(&writer);
4536 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004537
4538 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004539 e = s + size;
4540
4541 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004542 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004543 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004544 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004545
Antoine Pitrou244651a2009-05-04 18:56:13 +00004546 if (inShift) { /* in a base-64 section */
4547 if (IS_BASE64(ch)) { /* consume a base-64 character */
4548 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4549 base64bits += 6;
4550 s++;
4551 if (base64bits >= 16) {
4552 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004553 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004554 base64bits -= 16;
4555 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004556 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004557 if (surrogate) {
4558 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004559 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4560 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004561 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004562 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004563 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004564 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565 }
4566 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004567 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004568 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 }
4571 }
Victor Stinner551ac952011-11-29 22:58:13 +01004572 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 /* first surrogate */
4574 surrogate = outCh;
4575 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004577 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004578 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579 }
4580 }
4581 }
4582 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004583 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 if (base64bits > 0) { /* left-over bits */
4585 if (base64bits >= 6) {
4586 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004587 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004588 errmsg = "partial character in shift sequence";
4589 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004590 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004591 else {
4592 /* Some bits remain; they should be zero */
4593 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004594 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004595 errmsg = "non-zero padding bits in shift sequence";
4596 goto utf7Error;
4597 }
4598 }
4599 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004600 if (surrogate && DECODE_DIRECT(ch)) {
4601 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4602 goto onError;
4603 }
4604 surrogate = 0;
4605 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 /* '-' is absorbed; other terminating
4607 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004608 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004609 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004610 }
4611 }
4612 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 s++; /* consume '+' */
4615 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004616 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004617 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004618 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004620 else if (s < e && !IS_BASE64(*s)) {
4621 s++;
4622 errmsg = "ill-formed sequence";
4623 goto utf7Error;
4624 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004627 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004628 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004629 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004630 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004631 }
4632 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004634 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004635 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004636 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004638 else {
4639 startinpos = s-starts;
4640 s++;
4641 errmsg = "unexpected special character";
4642 goto utf7Error;
4643 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004647 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 errors, &errorHandler,
4649 "utf7", errmsg,
4650 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004651 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004652 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653 }
4654
Antoine Pitrou244651a2009-05-04 18:56:13 +00004655 /* end of string */
4656
4657 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4658 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004659 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004660 if (surrogate ||
4661 (base64bits >= 6) ||
4662 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004664 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004665 errors, &errorHandler,
4666 "utf7", "unterminated shift sequence",
4667 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004668 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669 goto onError;
4670 if (s < e)
4671 goto restart;
4672 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004673 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674
4675 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004676 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004677 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004678 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004679 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004680 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004681 writer.kind, writer.data, shiftOutStart);
4682 Py_XDECREF(errorHandler);
4683 Py_XDECREF(exc);
4684 _PyUnicodeWriter_Dealloc(&writer);
4685 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004686 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004687 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 }
4689 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004690 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004691 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004692 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004693
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004694 Py_XDECREF(errorHandler);
4695 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004696 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004697
Benjamin Peterson29060642009-01-31 22:14:21 +00004698 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004699 Py_XDECREF(errorHandler);
4700 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004701 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004702 return NULL;
4703}
4704
4705
Alexander Belopolsky40018472011-02-26 01:02:56 +00004706PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004707_PyUnicode_EncodeUTF7(PyObject *str,
4708 int base64SetO,
4709 int base64WhiteSpace,
4710 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004712 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004713 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004714 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004715 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004716 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004717 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004718 unsigned int base64bits = 0;
4719 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004720 char * out;
4721 char * start;
4722
Benjamin Petersonbac79492012-01-14 13:34:47 -05004723 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004724 return NULL;
4725 kind = PyUnicode_KIND(str);
4726 data = PyUnicode_DATA(str);
4727 len = PyUnicode_GET_LENGTH(str);
4728
4729 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004731
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004732 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004733 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004734 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004735 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004736 if (v == NULL)
4737 return NULL;
4738
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004739 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004740 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004741 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004742
Antoine Pitrou244651a2009-05-04 18:56:13 +00004743 if (inShift) {
4744 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4745 /* shifting out */
4746 if (base64bits) { /* output remaining bits */
4747 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4748 base64buffer = 0;
4749 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004750 }
4751 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004752 /* Characters not in the BASE64 set implicitly unshift the sequence
4753 so no '-' is required, except if the character is itself a '-' */
4754 if (IS_BASE64(ch) || ch == '-') {
4755 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004756 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004757 *out++ = (char) ch;
4758 }
4759 else {
4760 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004761 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004762 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004763 else { /* not in a shift sequence */
4764 if (ch == '+') {
4765 *out++ = '+';
4766 *out++ = '-';
4767 }
4768 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4769 *out++ = (char) ch;
4770 }
4771 else {
4772 *out++ = '+';
4773 inShift = 1;
4774 goto encode_char;
4775 }
4776 }
4777 continue;
4778encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004779 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004780 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004781
Antoine Pitrou244651a2009-05-04 18:56:13 +00004782 /* code first surrogate */
4783 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004784 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004785 while (base64bits >= 6) {
4786 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4787 base64bits -= 6;
4788 }
4789 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004790 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004792 base64bits += 16;
4793 base64buffer = (base64buffer << 16) | ch;
4794 while (base64bits >= 6) {
4795 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4796 base64bits -= 6;
4797 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004798 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004799 if (base64bits)
4800 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4801 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004802 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004803 if (_PyBytes_Resize(&v, out - start) < 0)
4804 return NULL;
4805 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004806}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004807PyObject *
4808PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4809 Py_ssize_t size,
4810 int base64SetO,
4811 int base64WhiteSpace,
4812 const char *errors)
4813{
4814 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004815 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004816 if (tmp == NULL)
4817 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004818 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004819 base64WhiteSpace, errors);
4820 Py_DECREF(tmp);
4821 return result;
4822}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004823
Antoine Pitrou244651a2009-05-04 18:56:13 +00004824#undef IS_BASE64
4825#undef FROM_BASE64
4826#undef TO_BASE64
4827#undef DECODE_DIRECT
4828#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004829
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830/* --- UTF-8 Codec -------------------------------------------------------- */
4831
Alexander Belopolsky40018472011-02-26 01:02:56 +00004832PyObject *
4833PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004834 Py_ssize_t size,
4835 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836{
Walter Dörwald69652032004-09-07 20:24:22 +00004837 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4838}
4839
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004840#include "stringlib/asciilib.h"
4841#include "stringlib/codecs.h"
4842#include "stringlib/undef.h"
4843
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004844#include "stringlib/ucs1lib.h"
4845#include "stringlib/codecs.h"
4846#include "stringlib/undef.h"
4847
4848#include "stringlib/ucs2lib.h"
4849#include "stringlib/codecs.h"
4850#include "stringlib/undef.h"
4851
4852#include "stringlib/ucs4lib.h"
4853#include "stringlib/codecs.h"
4854#include "stringlib/undef.h"
4855
Antoine Pitrouab868312009-01-10 15:40:25 +00004856/* Mask to quickly check whether a C 'long' contains a
4857 non-ASCII, UTF8-encoded char. */
4858#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004859# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004860#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004861# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004862#else
4863# error C 'long' size should be either 4 or 8!
4864#endif
4865
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004866static Py_ssize_t
4867ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004869 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004870 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004871
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004872 /*
4873 * Issue #17237: m68k is a bit different from most architectures in
4874 * that objects do not use "natural alignment" - for example, int and
4875 * long are only aligned at 2-byte boundaries. Therefore the assert()
4876 * won't work; also, tests have shown that skipping the "optimised
4877 * version" will even speed up m68k.
4878 */
4879#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004880#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004881 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4882 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004883 /* Fast path, see in STRINGLIB(utf8_decode) for
4884 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004885 /* Help allocation */
4886 const char *_p = p;
4887 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 while (_p < aligned_end) {
4889 unsigned long value = *(const unsigned long *) _p;
4890 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004891 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004892 *((unsigned long *)q) = value;
4893 _p += SIZEOF_LONG;
4894 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004895 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004896 p = _p;
4897 while (p < end) {
4898 if ((unsigned char)*p & 0x80)
4899 break;
4900 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004904#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004905#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004906 while (p < end) {
4907 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4908 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004909 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004910 /* Help allocation */
4911 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004912 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004913 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 if (value & ASCII_CHAR_MASK)
4915 break;
4916 _p += SIZEOF_LONG;
4917 }
4918 p = _p;
4919 if (_p == end)
4920 break;
4921 }
4922 if ((unsigned char)*p & 0x80)
4923 break;
4924 ++p;
4925 }
4926 memcpy(dest, start, p - start);
4927 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928}
Antoine Pitrouab868312009-01-10 15:40:25 +00004929
Victor Stinner709d23d2019-05-02 14:56:30 -04004930static PyObject *
4931unicode_decode_utf8(const char *s, Py_ssize_t size,
4932 _Py_error_handler error_handler, const char *errors,
4933 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004934{
Victor Stinner785938e2011-12-11 20:09:03 +01004935 if (size == 0) {
4936 if (consumed)
4937 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004938 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004939 }
4940
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004941 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4942 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004943 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004944 *consumed = 1;
4945 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004946 }
4947
Inada Naoki770847a2019-06-24 12:30:24 +09004948 const char *starts = s;
4949 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004950
Inada Naoki770847a2019-06-24 12:30:24 +09004951 // fast path: try ASCII string.
4952 PyObject *u = PyUnicode_New(size, 127);
4953 if (u == NULL) {
4954 return NULL;
4955 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004956 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09004957 if (s == end) {
4958 return u;
4959 }
4960
4961 // Use _PyUnicodeWriter after fast path is failed.
4962 _PyUnicodeWriter writer;
4963 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4964 writer.pos = s - starts;
4965
4966 Py_ssize_t startinpos, endinpos;
4967 const char *errmsg = "";
4968 PyObject *error_handler_obj = NULL;
4969 PyObject *exc = NULL;
4970
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004971 while (s < end) {
4972 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004973 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004974
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004976 if (PyUnicode_IS_ASCII(writer.buffer))
4977 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004978 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004979 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004980 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004981 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 } else {
4983 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004984 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004985 }
4986
4987 switch (ch) {
4988 case 0:
4989 if (s == end || consumed)
4990 goto End;
4991 errmsg = "unexpected end of data";
4992 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004993 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004994 break;
4995 case 1:
4996 errmsg = "invalid start byte";
4997 startinpos = s - starts;
4998 endinpos = startinpos + 1;
4999 break;
5000 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005001 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5002 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5003 {
5004 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005005 goto End;
5006 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005007 /* fall through */
5008 case 3:
5009 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 errmsg = "invalid continuation byte";
5011 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005012 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005013 break;
5014 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005015 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005016 goto onError;
5017 continue;
5018 }
5019
Victor Stinner1d65d912015-10-05 13:43:50 +02005020 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005021 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005022
5023 switch (error_handler) {
5024 case _Py_ERROR_IGNORE:
5025 s += (endinpos - startinpos);
5026 break;
5027
5028 case _Py_ERROR_REPLACE:
5029 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5030 goto onError;
5031 s += (endinpos - startinpos);
5032 break;
5033
5034 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005035 {
5036 Py_ssize_t i;
5037
Victor Stinner1d65d912015-10-05 13:43:50 +02005038 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5039 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005040 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005041 ch = (Py_UCS4)(unsigned char)(starts[i]);
5042 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5043 ch + 0xdc00);
5044 writer.pos++;
5045 }
5046 s += (endinpos - startinpos);
5047 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005048 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005049
5050 default:
5051 if (unicode_decode_call_errorhandler_writer(
5052 errors, &error_handler_obj,
5053 "utf-8", errmsg,
5054 &starts, &end, &startinpos, &endinpos, &exc, &s,
5055 &writer))
5056 goto onError;
5057 }
Victor Stinner785938e2011-12-11 20:09:03 +01005058 }
5059
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005060End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005061 if (consumed)
5062 *consumed = s - starts;
5063
Victor Stinner1d65d912015-10-05 13:43:50 +02005064 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005065 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005066 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005067
5068onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005069 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005070 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005071 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005073}
5074
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005075
Victor Stinner709d23d2019-05-02 14:56:30 -04005076PyObject *
5077PyUnicode_DecodeUTF8Stateful(const char *s,
5078 Py_ssize_t size,
5079 const char *errors,
5080 Py_ssize_t *consumed)
5081{
5082 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5083}
5084
5085
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005086/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5087 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005088
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005089 On success, write a pointer to a newly allocated wide character string into
5090 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5091 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005092
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005093 On memory allocation failure, return -1.
5094
5095 On decoding error (if surrogateescape is zero), return -2. If wlen is
5096 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5097 is not NULL, write the decoding error message into *reason. */
5098int
5099_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005100 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005101{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005102 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005103 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005104 wchar_t *unicode;
5105 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005106
Victor Stinner3d4226a2018-08-29 22:21:32 +02005107 int surrogateescape = 0;
5108 int surrogatepass = 0;
5109 switch (errors)
5110 {
5111 case _Py_ERROR_STRICT:
5112 break;
5113 case _Py_ERROR_SURROGATEESCAPE:
5114 surrogateescape = 1;
5115 break;
5116 case _Py_ERROR_SURROGATEPASS:
5117 surrogatepass = 1;
5118 break;
5119 default:
5120 return -3;
5121 }
5122
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005123 /* Note: size will always be longer than the resulting Unicode
5124 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005125 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005126 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005127 }
5128
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005129 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005130 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005131 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005132 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005133
5134 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005135 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005136 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005137 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005138 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005139#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005140 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005141#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005142 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005143#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005144 if (ch > 0xFF) {
5145#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005146 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005148 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005149 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005150 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5151 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5152#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005153 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005154 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005155 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005156 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005157 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005158
5159 if (surrogateescape) {
5160 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5161 }
5162 else {
5163 /* Is it a valid three-byte code? */
5164 if (surrogatepass
5165 && (e - s) >= 3
5166 && (s[0] & 0xf0) == 0xe0
5167 && (s[1] & 0xc0) == 0x80
5168 && (s[2] & 0xc0) == 0x80)
5169 {
5170 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5171 s += 3;
5172 unicode[outpos++] = ch;
5173 }
5174 else {
5175 PyMem_RawFree(unicode );
5176 if (reason != NULL) {
5177 switch (ch) {
5178 case 0:
5179 *reason = "unexpected end of data";
5180 break;
5181 case 1:
5182 *reason = "invalid start byte";
5183 break;
5184 /* 2, 3, 4 */
5185 default:
5186 *reason = "invalid continuation byte";
5187 break;
5188 }
5189 }
5190 if (wlen != NULL) {
5191 *wlen = s - orig_s;
5192 }
5193 return -2;
5194 }
5195 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005196 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005197 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005198 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005199 if (wlen) {
5200 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005201 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005202 *wstr = unicode;
5203 return 0;
5204}
5205
Victor Stinner5f9cf232019-03-19 01:46:25 +01005206
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005207wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005208_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5209 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005210{
5211 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005212 int res = _Py_DecodeUTF8Ex(arg, arglen,
5213 &wstr, wlen,
5214 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005215 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005216 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5217 assert(res != -3);
5218 if (wlen) {
5219 *wlen = (size_t)res;
5220 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005221 return NULL;
5222 }
5223 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005224}
5225
Antoine Pitrouab868312009-01-10 15:40:25 +00005226
Victor Stinnere47e6982017-12-21 15:45:16 +01005227/* UTF-8 encoder using the surrogateescape error handler .
5228
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005229 On success, return 0 and write the newly allocated character string (use
5230 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005231
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005232 On encoding failure, return -2 and write the position of the invalid
5233 surrogate character into *error_pos (if error_pos is set) and the decoding
5234 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005235
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005236 On memory allocation failure, return -1. */
5237int
5238_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005239 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005240{
5241 const Py_ssize_t max_char_size = 4;
5242 Py_ssize_t len = wcslen(text);
5243
5244 assert(len >= 0);
5245
Victor Stinner3d4226a2018-08-29 22:21:32 +02005246 int surrogateescape = 0;
5247 int surrogatepass = 0;
5248 switch (errors)
5249 {
5250 case _Py_ERROR_STRICT:
5251 break;
5252 case _Py_ERROR_SURROGATEESCAPE:
5253 surrogateescape = 1;
5254 break;
5255 case _Py_ERROR_SURROGATEPASS:
5256 surrogatepass = 1;
5257 break;
5258 default:
5259 return -3;
5260 }
5261
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005262 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5263 return -1;
5264 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005265 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005266 if (raw_malloc) {
5267 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005268 }
5269 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005270 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005271 }
5272 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005273 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005274 }
5275
5276 char *p = bytes;
5277 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005278 for (i = 0; i < len; ) {
5279 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005280 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005281 i++;
5282#if Py_UNICODE_SIZE == 2
5283 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5284 && i < len
5285 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5286 {
5287 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5288 i++;
5289 }
5290#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005291
5292 if (ch < 0x80) {
5293 /* Encode ASCII */
5294 *p++ = (char) ch;
5295
5296 }
5297 else if (ch < 0x0800) {
5298 /* Encode Latin-1 */
5299 *p++ = (char)(0xc0 | (ch >> 6));
5300 *p++ = (char)(0x80 | (ch & 0x3f));
5301 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005302 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005303 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005304 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005305 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005306 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005307 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005308 if (reason != NULL) {
5309 *reason = "encoding error";
5310 }
5311 if (raw_malloc) {
5312 PyMem_RawFree(bytes);
5313 }
5314 else {
5315 PyMem_Free(bytes);
5316 }
5317 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005318 }
5319 *p++ = (char)(ch & 0xff);
5320 }
5321 else if (ch < 0x10000) {
5322 *p++ = (char)(0xe0 | (ch >> 12));
5323 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5324 *p++ = (char)(0x80 | (ch & 0x3f));
5325 }
5326 else { /* ch >= 0x10000 */
5327 assert(ch <= MAX_UNICODE);
5328 /* Encode UCS4 Unicode ordinals */
5329 *p++ = (char)(0xf0 | (ch >> 18));
5330 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5331 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5332 *p++ = (char)(0x80 | (ch & 0x3f));
5333 }
5334 }
5335 *p++ = '\0';
5336
5337 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005338 char *bytes2;
5339 if (raw_malloc) {
5340 bytes2 = PyMem_RawRealloc(bytes, final_size);
5341 }
5342 else {
5343 bytes2 = PyMem_Realloc(bytes, final_size);
5344 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005345 if (bytes2 == NULL) {
5346 if (error_pos != NULL) {
5347 *error_pos = (size_t)-1;
5348 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005349 if (raw_malloc) {
5350 PyMem_RawFree(bytes);
5351 }
5352 else {
5353 PyMem_Free(bytes);
5354 }
5355 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005356 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005357 *str = bytes2;
5358 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005359}
5360
5361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005362/* Primary internal function which creates utf8 encoded bytes objects.
5363
5364 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005365 and allocate exactly as much space needed at the end. Else allocate the
5366 maximum possible needed (4 result bytes per Unicode character), and return
5367 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005368*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005369static PyObject *
5370unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5371 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005373 if (!PyUnicode_Check(unicode)) {
5374 PyErr_BadArgument();
5375 return NULL;
5376 }
5377
5378 if (PyUnicode_READY(unicode) == -1)
5379 return NULL;
5380
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005381 if (PyUnicode_UTF8(unicode))
5382 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5383 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005384
Inada Naoki02a4d572020-02-27 13:48:59 +09005385 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005386 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005387 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5388
5389 _PyBytesWriter writer;
5390 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005391
Benjamin Petersonead6b532011-12-20 17:23:42 -06005392 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005393 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005394 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005395 case PyUnicode_1BYTE_KIND:
5396 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5397 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005398 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5399 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005400 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005401 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5402 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005403 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005404 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5405 break;
Tim Peters602f7402002-04-27 18:03:26 +00005406 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005407
5408 if (end == NULL) {
5409 _PyBytesWriter_Dealloc(&writer);
5410 return NULL;
5411 }
5412 return _PyBytesWriter_Finish(&writer, end);
5413}
5414
5415static int
5416unicode_fill_utf8(PyObject *unicode)
5417{
5418 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5419 assert(!PyUnicode_IS_ASCII(unicode));
5420
5421 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005422 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005423 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5424
5425 _PyBytesWriter writer;
5426 char *end;
5427
5428 switch (kind) {
5429 default:
5430 Py_UNREACHABLE();
5431 case PyUnicode_1BYTE_KIND:
5432 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5433 _Py_ERROR_STRICT, NULL);
5434 break;
5435 case PyUnicode_2BYTE_KIND:
5436 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5437 _Py_ERROR_STRICT, NULL);
5438 break;
5439 case PyUnicode_4BYTE_KIND:
5440 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5441 _Py_ERROR_STRICT, NULL);
5442 break;
5443 }
5444 if (end == NULL) {
5445 _PyBytesWriter_Dealloc(&writer);
5446 return -1;
5447 }
5448
5449 char *start = writer.use_small_buffer ? writer.small_buffer :
5450 PyBytes_AS_STRING(writer.buffer);
5451 Py_ssize_t len = end - start;
5452
5453 char *cache = PyObject_MALLOC(len + 1);
5454 if (cache == NULL) {
5455 _PyBytesWriter_Dealloc(&writer);
5456 PyErr_NoMemory();
5457 return -1;
5458 }
5459 _PyUnicode_UTF8(unicode) = cache;
5460 _PyUnicode_UTF8_LENGTH(unicode) = len;
5461 memcpy(cache, start, len);
5462 cache[len] = '\0';
5463 _PyBytesWriter_Dealloc(&writer);
5464 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465}
5466
Alexander Belopolsky40018472011-02-26 01:02:56 +00005467PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005468_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5469{
5470 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5471}
5472
5473
5474PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005475PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5476 Py_ssize_t size,
5477 const char *errors)
5478{
5479 PyObject *v, *unicode;
5480
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005481 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005482 if (unicode == NULL)
5483 return NULL;
5484 v = _PyUnicode_AsUTF8String(unicode, errors);
5485 Py_DECREF(unicode);
5486 return v;
5487}
5488
5489PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005490PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005492 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493}
5494
Walter Dörwald41980ca2007-08-16 21:55:45 +00005495/* --- UTF-32 Codec ------------------------------------------------------- */
5496
5497PyObject *
5498PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 Py_ssize_t size,
5500 const char *errors,
5501 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005502{
5503 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5504}
5505
5506PyObject *
5507PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 Py_ssize_t size,
5509 const char *errors,
5510 int *byteorder,
5511 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005512{
5513 const char *starts = s;
5514 Py_ssize_t startinpos;
5515 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005516 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005517 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005518 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005519 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005520 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005521 PyObject *errorHandler = NULL;
5522 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005523
Andy Lestere6be9b52020-02-11 20:28:35 -06005524 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005525 e = q + size;
5526
5527 if (byteorder)
5528 bo = *byteorder;
5529
5530 /* Check for BOM marks (U+FEFF) in the input and adjust current
5531 byte order setting accordingly. In native mode, the leading BOM
5532 mark is skipped, in all other modes, it is copied to the output
5533 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005534 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005535 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005536 if (bom == 0x0000FEFF) {
5537 bo = -1;
5538 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005540 else if (bom == 0xFFFE0000) {
5541 bo = 1;
5542 q += 4;
5543 }
5544 if (byteorder)
5545 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005546 }
5547
Victor Stinnere64322e2012-10-30 23:12:47 +01005548 if (q == e) {
5549 if (consumed)
5550 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005551 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005552 }
5553
Victor Stinnere64322e2012-10-30 23:12:47 +01005554#ifdef WORDS_BIGENDIAN
5555 le = bo < 0;
5556#else
5557 le = bo <= 0;
5558#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005559 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005560
Victor Stinner8f674cc2013-04-17 23:02:17 +02005561 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005562 writer.min_length = (e - q + 3) / 4;
5563 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005564 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005565
Victor Stinnere64322e2012-10-30 23:12:47 +01005566 while (1) {
5567 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005568 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005569
Victor Stinnere64322e2012-10-30 23:12:47 +01005570 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005571 enum PyUnicode_Kind kind = writer.kind;
5572 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005573 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005574 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005575 if (le) {
5576 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005577 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005578 if (ch > maxch)
5579 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005580 if (kind != PyUnicode_1BYTE_KIND &&
5581 Py_UNICODE_IS_SURROGATE(ch))
5582 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005583 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005584 q += 4;
5585 } while (q <= last);
5586 }
5587 else {
5588 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005589 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005590 if (ch > maxch)
5591 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005592 if (kind != PyUnicode_1BYTE_KIND &&
5593 Py_UNICODE_IS_SURROGATE(ch))
5594 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005595 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005596 q += 4;
5597 } while (q <= last);
5598 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005599 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005600 }
5601
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005602 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005603 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005604 startinpos = ((const char *)q) - starts;
5605 endinpos = startinpos + 4;
5606 }
5607 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005608 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005609 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005610 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005612 startinpos = ((const char *)q) - starts;
5613 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005614 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005615 else {
5616 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005617 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005618 goto onError;
5619 q += 4;
5620 continue;
5621 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005622 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005623 startinpos = ((const char *)q) - starts;
5624 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005626
5627 /* The remaining input chars are ignored if the callback
5628 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005631 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005633 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005635 }
5636
Walter Dörwald41980ca2007-08-16 21:55:45 +00005637 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005638 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005639
Walter Dörwald41980ca2007-08-16 21:55:45 +00005640 Py_XDECREF(errorHandler);
5641 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005642 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005643
Benjamin Peterson29060642009-01-31 22:14:21 +00005644 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005645 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005646 Py_XDECREF(errorHandler);
5647 Py_XDECREF(exc);
5648 return NULL;
5649}
5650
5651PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005652_PyUnicode_EncodeUTF32(PyObject *str,
5653 const char *errors,
5654 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005655{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005656 enum PyUnicode_Kind kind;
5657 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005658 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005659 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005660 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005661#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005662 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005663#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005664 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005665#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005666 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005667 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005668 PyObject *errorHandler = NULL;
5669 PyObject *exc = NULL;
5670 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005671
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005672 if (!PyUnicode_Check(str)) {
5673 PyErr_BadArgument();
5674 return NULL;
5675 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005676 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005677 return NULL;
5678 kind = PyUnicode_KIND(str);
5679 data = PyUnicode_DATA(str);
5680 len = PyUnicode_GET_LENGTH(str);
5681
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005682 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005683 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005684 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005685 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005686 if (v == NULL)
5687 return NULL;
5688
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005689 /* output buffer is 4-bytes aligned */
5690 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005691 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005692 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005693 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005694 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005695 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005696
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005697 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005698 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005699 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005700 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005701 else
5702 encoding = "utf-32";
5703
5704 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005705 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5706 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005707 }
5708
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005709 pos = 0;
5710 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005711 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005712
5713 if (kind == PyUnicode_2BYTE_KIND) {
5714 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5715 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005716 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005717 else {
5718 assert(kind == PyUnicode_4BYTE_KIND);
5719 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5720 &out, native_ordering);
5721 }
5722 if (pos == len)
5723 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005724
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005725 rep = unicode_encode_call_errorhandler(
5726 errors, &errorHandler,
5727 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005728 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005729 if (!rep)
5730 goto error;
5731
5732 if (PyBytes_Check(rep)) {
5733 repsize = PyBytes_GET_SIZE(rep);
5734 if (repsize & 3) {
5735 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005736 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005737 "surrogates not allowed");
5738 goto error;
5739 }
5740 moreunits = repsize / 4;
5741 }
5742 else {
5743 assert(PyUnicode_Check(rep));
5744 if (PyUnicode_READY(rep) < 0)
5745 goto error;
5746 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5747 if (!PyUnicode_IS_ASCII(rep)) {
5748 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005749 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005750 "surrogates not allowed");
5751 goto error;
5752 }
5753 }
5754
5755 /* four bytes are reserved for each surrogate */
5756 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005757 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005758 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005759 /* integer overflow */
5760 PyErr_NoMemory();
5761 goto error;
5762 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005763 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005764 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005765 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005766 }
5767
5768 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005769 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005770 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005771 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005772 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005773 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5774 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005775 }
5776
5777 Py_CLEAR(rep);
5778 }
5779
5780 /* Cut back to size actually needed. This is necessary for, for example,
5781 encoding of a string containing isolated surrogates and the 'ignore'
5782 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005783 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005784 if (nsize != PyBytes_GET_SIZE(v))
5785 _PyBytes_Resize(&v, nsize);
5786 Py_XDECREF(errorHandler);
5787 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005788 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005789 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005790 error:
5791 Py_XDECREF(rep);
5792 Py_XDECREF(errorHandler);
5793 Py_XDECREF(exc);
5794 Py_XDECREF(v);
5795 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005796}
5797
Alexander Belopolsky40018472011-02-26 01:02:56 +00005798PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005799PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5800 Py_ssize_t size,
5801 const char *errors,
5802 int byteorder)
5803{
5804 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005805 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005806 if (tmp == NULL)
5807 return NULL;
5808 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5809 Py_DECREF(tmp);
5810 return result;
5811}
5812
5813PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005814PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005815{
Victor Stinnerb960b342011-11-20 19:12:52 +01005816 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005817}
5818
/* --- UTF-16 Codec ------------------------------------------------------- */
5820
Tim Peters772747b2001-08-09 22:21:55 +00005821PyObject *
5822PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005823 Py_ssize_t size,
5824 const char *errors,
5825 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826{
Walter Dörwald69652032004-09-07 20:24:22 +00005827 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5828}
5829
5830PyObject *
5831PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 Py_ssize_t size,
5833 const char *errors,
5834 int *byteorder,
5835 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005836{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005837 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005838 Py_ssize_t startinpos;
5839 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005840 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005841 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005842 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005843 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005844 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 PyObject *errorHandler = NULL;
5846 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005847 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848
Andy Lestere6be9b52020-02-11 20:28:35 -06005849 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005850 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851
5852 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005853 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005855 /* Check for BOM marks (U+FEFF) in the input and adjust current
5856 byte order setting accordingly. In native mode, the leading BOM
5857 mark is skipped, in all other modes, it is copied to the output
5858 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005859 if (bo == 0 && size >= 2) {
5860 const Py_UCS4 bom = (q[1] << 8) | q[0];
5861 if (bom == 0xFEFF) {
5862 q += 2;
5863 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005864 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005865 else if (bom == 0xFFFE) {
5866 q += 2;
5867 bo = 1;
5868 }
5869 if (byteorder)
5870 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005871 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872
Antoine Pitrou63065d72012-05-15 23:48:04 +02005873 if (q == e) {
5874 if (consumed)
5875 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005876 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005877 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005878
Christian Heimes743e0cd2012-10-17 23:52:17 +02005879#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005880 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005881 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005882#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005883 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005884 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005885#endif
Tim Peters772747b2001-08-09 22:21:55 +00005886
Antoine Pitrou63065d72012-05-15 23:48:04 +02005887 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005888 character count normally. Error handler will take care of
5889 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005890 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005891 writer.min_length = (e - q + 1) / 2;
5892 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005893 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005894
Antoine Pitrou63065d72012-05-15 23:48:04 +02005895 while (1) {
5896 Py_UCS4 ch = 0;
5897 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005898 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005899 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005900 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005901 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005902 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005903 native_ordering);
5904 else
5905 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005906 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005907 native_ordering);
5908 } else if (kind == PyUnicode_2BYTE_KIND) {
5909 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005910 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005911 native_ordering);
5912 } else {
5913 assert(kind == PyUnicode_4BYTE_KIND);
5914 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005915 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005916 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005917 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005918 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919
Antoine Pitrou63065d72012-05-15 23:48:04 +02005920 switch (ch)
5921 {
5922 case 0:
5923 /* remaining byte at the end? (size should be even) */
5924 if (q == e || consumed)
5925 goto End;
5926 errmsg = "truncated data";
5927 startinpos = ((const char *)q) - starts;
5928 endinpos = ((const char *)e) - starts;
5929 break;
5930 /* The remaining input chars are ignored if the callback
5931 chooses to skip the input */
5932 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005933 q -= 2;
5934 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005935 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005936 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005937 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005938 endinpos = ((const char *)e) - starts;
5939 break;
5940 case 2:
5941 errmsg = "illegal encoding";
5942 startinpos = ((const char *)q) - 2 - starts;
5943 endinpos = startinpos + 2;
5944 break;
5945 case 3:
5946 errmsg = "illegal UTF-16 surrogate";
5947 startinpos = ((const char *)q) - 4 - starts;
5948 endinpos = startinpos + 2;
5949 break;
5950 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005951 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005952 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 continue;
5954 }
5955
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005956 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005957 errors,
5958 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005959 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005960 &starts,
5961 (const char **)&e,
5962 &startinpos,
5963 &endinpos,
5964 &exc,
5965 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005966 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005967 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 }
5969
Antoine Pitrou63065d72012-05-15 23:48:04 +02005970End:
Walter Dörwald69652032004-09-07 20:24:22 +00005971 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005973
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005974 Py_XDECREF(errorHandler);
5975 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005976 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005979 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005980 Py_XDECREF(errorHandler);
5981 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 return NULL;
5983}
5984
Tim Peters772747b2001-08-09 22:21:55 +00005985PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005986_PyUnicode_EncodeUTF16(PyObject *str,
5987 const char *errors,
5988 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005990 enum PyUnicode_Kind kind;
5991 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005992 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005993 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005994 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005995 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005996#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005997 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005998#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005999 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006000#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006001 const char *encoding;
6002 Py_ssize_t nsize, pos;
6003 PyObject *errorHandler = NULL;
6004 PyObject *exc = NULL;
6005 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006006
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006007 if (!PyUnicode_Check(str)) {
6008 PyErr_BadArgument();
6009 return NULL;
6010 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006011 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006012 return NULL;
6013 kind = PyUnicode_KIND(str);
6014 data = PyUnicode_DATA(str);
6015 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006016
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006017 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006018 if (kind == PyUnicode_4BYTE_KIND) {
6019 const Py_UCS4 *in = (const Py_UCS4 *)data;
6020 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006021 while (in < end) {
6022 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006023 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006024 }
6025 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006026 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006027 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006029 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006030 nsize = len + pairs + (byteorder == 0);
6031 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006032 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006034 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006036 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006037 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006038 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006039 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006040 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006041 }
6042 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006043 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006044 }
Tim Peters772747b2001-08-09 22:21:55 +00006045
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006046 if (kind == PyUnicode_1BYTE_KIND) {
6047 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6048 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006049 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006050
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006051 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006052 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006053 }
6054 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006055 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006056 }
6057 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006058 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006059 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006060
6061 pos = 0;
6062 while (pos < len) {
6063 Py_ssize_t repsize, moreunits;
6064
6065 if (kind == PyUnicode_2BYTE_KIND) {
6066 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6067 &out, native_ordering);
6068 }
6069 else {
6070 assert(kind == PyUnicode_4BYTE_KIND);
6071 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6072 &out, native_ordering);
6073 }
6074 if (pos == len)
6075 break;
6076
6077 rep = unicode_encode_call_errorhandler(
6078 errors, &errorHandler,
6079 encoding, "surrogates not allowed",
6080 str, &exc, pos, pos + 1, &pos);
6081 if (!rep)
6082 goto error;
6083
6084 if (PyBytes_Check(rep)) {
6085 repsize = PyBytes_GET_SIZE(rep);
6086 if (repsize & 1) {
6087 raise_encode_exception(&exc, encoding,
6088 str, pos - 1, pos,
6089 "surrogates not allowed");
6090 goto error;
6091 }
6092 moreunits = repsize / 2;
6093 }
6094 else {
6095 assert(PyUnicode_Check(rep));
6096 if (PyUnicode_READY(rep) < 0)
6097 goto error;
6098 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6099 if (!PyUnicode_IS_ASCII(rep)) {
6100 raise_encode_exception(&exc, encoding,
6101 str, pos - 1, pos,
6102 "surrogates not allowed");
6103 goto error;
6104 }
6105 }
6106
6107 /* two bytes are reserved for each surrogate */
6108 if (moreunits > 1) {
6109 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006110 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006111 /* integer overflow */
6112 PyErr_NoMemory();
6113 goto error;
6114 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006115 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006116 goto error;
6117 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6118 }
6119
6120 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006121 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006122 out += moreunits;
6123 } else /* rep is unicode */ {
6124 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6125 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6126 &out, native_ordering);
6127 }
6128
6129 Py_CLEAR(rep);
6130 }
6131
6132 /* Cut back to size actually needed. This is necessary for, for example,
6133 encoding of a string containing isolated surrogates and the 'ignore' handler
6134 is used. */
6135 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6136 if (nsize != PyBytes_GET_SIZE(v))
6137 _PyBytes_Resize(&v, nsize);
6138 Py_XDECREF(errorHandler);
6139 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006140 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006141 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006142 error:
6143 Py_XDECREF(rep);
6144 Py_XDECREF(errorHandler);
6145 Py_XDECREF(exc);
6146 Py_XDECREF(v);
6147 return NULL;
6148#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149}
6150
Alexander Belopolsky40018472011-02-26 01:02:56 +00006151PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6153 Py_ssize_t size,
6154 const char *errors,
6155 int byteorder)
6156{
6157 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006158 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 if (tmp == NULL)
6160 return NULL;
6161 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6162 Py_DECREF(tmp);
6163 return result;
6164}
6165
6166PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006167PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170}
6171
/* --- Unicode Escape Codec ----------------------------------------------- */
6173
Fredrik Lundh06d12682001-01-24 07:59:11 +00006174static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006175
/* Decode `size` bytes starting at `s` as a backslash-escaped string.

   Parameters:
     s, size               input buffer and its length in bytes
     errors                error-handler name, forwarded to
                           unicode_decode_call_errorhandler_writer()
     first_invalid_escape  out parameter; reset to NULL on entry and set to
                           point at the character following the backslash of
                           the first unrecognised escape, if there is one

   Bytes other than a backslash are written out as the code point with the
   same value.  Recognised escapes are backslash-newline (dropped),
   \\ \' \" \b \f \t \n \r \v \a, up to three octal digits, \xXX, \uXXXX,
   \UXXXXXXXX and \N{name}.  An unrecognised escape is copied through as a
   backslash followed by the character.

   Returns the result of _PyUnicodeWriter_Finish() on success (or the empty
   string when size == 0); on failure the writer is deallocated and NULL is
   returned. */
PyObject *
_PyUnicode_DecodeUnicodeEscape(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               const char **first_invalid_escape)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        /* Each iteration consumes either one plain byte or one complete
           escape sequence. */
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

        /* WRITE_ASCII_CHAR stores a character that is known to be <= 127
           straight into the writer buffer, without any capacity or kind
           check beyond the assertions. */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

        /* WRITE_CHAR stores directly when `ch` does not exceed the writer's
           current maxchar; otherwise it defers to
           _PyUnicodeWriter_WriteCharInline() and jumps to onError if that
           call fails. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* s has already moved past the backslash, hence the -1: the error
           range reported to the handler starts at the backslash itself. */
        startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto error;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* Single-character escapes.  A backslash followed by a newline
               produces no output at all. */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes: one digit is required, up to two more
               are consumed if present */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes: each case only sets the digit count and the
               message used if the escape is cut short, then shares the
               parsing code at the hexescape label */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* Accumulate exactly `count` hex digits into ch.  The loop stops
               early, leaving count non-zero, on end of input or on the
               first non-hex character. */
            for (ch = 0; count && s < end; ++s, --count) {
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    break;
                }
            }
            if (count) {
                /* too few digits: report the "truncated ..." message */
                goto error;
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            if (ucnhash_CAPI == NULL) {
                /* load the unicode data module */
                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                PyUnicodeData_CAPSULE_NAME, 1);
                if (ucnhash_CAPI == NULL) {
                    /* This is raised directly and bypasses the
                       error-handler callback. */
                    PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                        );
                    goto onError;
                }
            }

            /* Default message; replaced below once a well-formed {name}
               has been found but could not be resolved. */
            message = "malformed \\N character escape";
            if (s < end && *s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                namelen = s - start;
                /* require a non-empty name and an actual closing brace */
                if (namelen && s < end) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    /* getcode() takes the length as an int, so longer names
                       are treated as unknown. */
                    if (namelen <= INT_MAX &&
                        ucnhash_CAPI->getcode(NULL, start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            goto error;

        default:
            /* Unrecognised escape: remember the first one for the caller
               and copy the backslash and the character through verbatim. */
            if (*first_invalid_escape == NULL) {
                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

      error:
        /* Shared error path: hand the offending input range
           [startinpos, endinpos) and `message` to the error handler.  The
           handler receives the addresses of starts, end and s, so it may
           update them. */
        endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* Failure: discard the partially built result. */
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6400
Eric V. Smith42454af2016-10-31 09:22:08 -04006401PyObject *
6402PyUnicode_DecodeUnicodeEscape(const char *s,
6403 Py_ssize_t size,
6404 const char *errors)
6405{
6406 const char *first_invalid_escape;
6407 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6408 &first_invalid_escape);
6409 if (result == NULL)
6410 return NULL;
6411 if (first_invalid_escape != NULL) {
6412 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6413 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006414 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006415 Py_DECREF(result);
6416 return NULL;
6417 }
6418 }
6419 return result;
6420}
6421
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006422/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423
Alexander Belopolsky40018472011-02-26 01:02:56 +00006424PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006425PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006427 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006428 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006430 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006431 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006432 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433
Ezio Melottie7f90372012-10-05 03:33:31 +03006434 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006435 escape.
6436
Ezio Melottie7f90372012-10-05 03:33:31 +03006437 For UCS1 strings it's '\xxx', 4 bytes per source character.
6438 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6439 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006440 */
6441
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006442 if (!PyUnicode_Check(unicode)) {
6443 PyErr_BadArgument();
6444 return NULL;
6445 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006446 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006447 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006448 }
Victor Stinner358af132015-10-12 22:36:57 +02006449
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006450 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006451 if (len == 0) {
6452 return PyBytes_FromStringAndSize(NULL, 0);
6453 }
6454
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006455 kind = PyUnicode_KIND(unicode);
6456 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006457 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6458 bytes, and 1 byte characters 4. */
6459 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006460 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006461 return PyErr_NoMemory();
6462 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006463 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 if (repr == NULL) {
6465 return NULL;
6466 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006467
Victor Stinner62ec3312016-09-06 17:04:34 -07006468 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006469 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006470 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006471
Victor Stinner62ec3312016-09-06 17:04:34 -07006472 /* U+0000-U+00ff range */
6473 if (ch < 0x100) {
6474 if (ch >= ' ' && ch < 127) {
6475 if (ch != '\\') {
6476 /* Copy printable US ASCII as-is */
6477 *p++ = (char) ch;
6478 }
6479 /* Escape backslashes */
6480 else {
6481 *p++ = '\\';
6482 *p++ = '\\';
6483 }
6484 }
Victor Stinner358af132015-10-12 22:36:57 +02006485
Victor Stinner62ec3312016-09-06 17:04:34 -07006486 /* Map special whitespace to '\t', \n', '\r' */
6487 else if (ch == '\t') {
6488 *p++ = '\\';
6489 *p++ = 't';
6490 }
6491 else if (ch == '\n') {
6492 *p++ = '\\';
6493 *p++ = 'n';
6494 }
6495 else if (ch == '\r') {
6496 *p++ = '\\';
6497 *p++ = 'r';
6498 }
6499
6500 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6501 else {
6502 *p++ = '\\';
6503 *p++ = 'x';
6504 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6505 *p++ = Py_hexdigits[ch & 0x000F];
6506 }
Tim Petersced69f82003-09-16 20:30:58 +00006507 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006508 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006509 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 *p++ = '\\';
6511 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006512 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6513 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6514 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6515 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006517 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6518 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006519
Victor Stinner62ec3312016-09-06 17:04:34 -07006520 /* Make sure that the first two digits are zero */
6521 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006522 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006523 *p++ = 'U';
6524 *p++ = '0';
6525 *p++ = '0';
6526 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6527 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6528 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6529 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6530 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6531 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534
Victor Stinner62ec3312016-09-06 17:04:34 -07006535 assert(p - PyBytes_AS_STRING(repr) > 0);
6536 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6537 return NULL;
6538 }
6539 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540}
6541
Alexander Belopolsky40018472011-02-26 01:02:56 +00006542PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006543PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6544 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006546 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006547 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006548 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006550 }
6551
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006552 result = PyUnicode_AsUnicodeEscapeString(tmp);
6553 Py_DECREF(tmp);
6554 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555}
6556
6557/* --- Raw Unicode Escape Codec ------------------------------------------- */
6558
Alexander Belopolsky40018472011-02-26 01:02:56 +00006559PyObject *
6560PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006561 Py_ssize_t size,
6562 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006564 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006565 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006567 PyObject *errorHandler = NULL;
6568 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006569
Victor Stinner62ec3312016-09-06 17:04:34 -07006570 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006571 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006572 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006573
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 /* Escaped strings will always be longer than the resulting
6575 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006576 length after conversion to the true value. (But decoding error
6577 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006578 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006579 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006580 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6581 goto onError;
6582 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006583
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 end = s + size;
6585 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006586 unsigned char c = (unsigned char) *s++;
6587 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006588 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006589 Py_ssize_t startinpos;
6590 Py_ssize_t endinpos;
6591 const char *message;
6592
6593#define WRITE_CHAR(ch) \
6594 do { \
6595 if (ch <= writer.maxchar) { \
6596 assert(writer.pos < writer.size); \
6597 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6598 } \
6599 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6600 goto onError; \
6601 } \
6602 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006605 if (c != '\\' || s >= end) {
6606 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006608 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006609
Victor Stinner62ec3312016-09-06 17:04:34 -07006610 c = (unsigned char) *s++;
6611 if (c == 'u') {
6612 count = 4;
6613 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006615 else if (c == 'U') {
6616 count = 8;
6617 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006618 }
6619 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006620 assert(writer.pos < writer.size);
6621 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6622 WRITE_CHAR(c);
6623 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006624 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006625 startinpos = s - starts - 2;
6626
6627 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6628 for (ch = 0; count && s < end; ++s, --count) {
6629 c = (unsigned char)*s;
6630 ch <<= 4;
6631 if (c >= '0' && c <= '9') {
6632 ch += c - '0';
6633 }
6634 else if (c >= 'a' && c <= 'f') {
6635 ch += c - ('a' - 10);
6636 }
6637 else if (c >= 'A' && c <= 'F') {
6638 ch += c - ('A' - 10);
6639 }
6640 else {
6641 break;
6642 }
6643 }
6644 if (!count) {
6645 if (ch <= MAX_UNICODE) {
6646 WRITE_CHAR(ch);
6647 continue;
6648 }
6649 message = "\\Uxxxxxxxx out of range";
6650 }
6651
6652 endinpos = s-starts;
6653 writer.min_length = end - s + writer.pos;
6654 if (unicode_decode_call_errorhandler_writer(
6655 errors, &errorHandler,
6656 "rawunicodeescape", message,
6657 &starts, &end, &startinpos, &endinpos, &exc, &s,
6658 &writer)) {
6659 goto onError;
6660 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006661 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006662
6663#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 Py_XDECREF(errorHandler);
6666 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006667 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006668
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006670 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006671 Py_XDECREF(errorHandler);
6672 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006674
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675}
6676
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006677
Alexander Belopolsky40018472011-02-26 01:02:56 +00006678PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006679PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680{
Victor Stinner62ec3312016-09-06 17:04:34 -07006681 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006683 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006684 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006685 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006686 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006688 if (!PyUnicode_Check(unicode)) {
6689 PyErr_BadArgument();
6690 return NULL;
6691 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006692 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006693 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006694 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006695 kind = PyUnicode_KIND(unicode);
6696 data = PyUnicode_DATA(unicode);
6697 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006698 if (kind == PyUnicode_1BYTE_KIND) {
6699 return PyBytes_FromStringAndSize(data, len);
6700 }
Victor Stinner0e368262011-11-10 20:12:49 +01006701
Victor Stinner62ec3312016-09-06 17:04:34 -07006702 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6703 bytes, and 1 byte characters 4. */
6704 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006705
Victor Stinner62ec3312016-09-06 17:04:34 -07006706 if (len > PY_SSIZE_T_MAX / expandsize) {
6707 return PyErr_NoMemory();
6708 }
6709 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6710 if (repr == NULL) {
6711 return NULL;
6712 }
6713 if (len == 0) {
6714 return repr;
6715 }
6716
6717 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006718 for (pos = 0; pos < len; pos++) {
6719 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006720
Victor Stinner62ec3312016-09-06 17:04:34 -07006721 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6722 if (ch < 0x100) {
6723 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006724 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006725 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006726 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727 *p++ = '\\';
6728 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006729 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6730 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6731 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6732 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006734 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6735 else {
6736 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6737 *p++ = '\\';
6738 *p++ = 'U';
6739 *p++ = '0';
6740 *p++ = '0';
6741 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6742 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6743 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6744 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6745 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6746 *p++ = Py_hexdigits[ch & 15];
6747 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006749
Victor Stinner62ec3312016-09-06 17:04:34 -07006750 assert(p > PyBytes_AS_STRING(repr));
6751 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6752 return NULL;
6753 }
6754 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755}
6756
Alexander Belopolsky40018472011-02-26 01:02:56 +00006757PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006758PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6759 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006761 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006762 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006763 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006764 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006765 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6766 Py_DECREF(tmp);
6767 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768}
6769
6770/* --- Latin-1 Codec ------------------------------------------------------ */
6771
Alexander Belopolsky40018472011-02-26 01:02:56 +00006772PyObject *
6773PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006774 Py_ssize_t size,
6775 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006778 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779}
6780
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006781/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006782static void
6783make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006784 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006785 PyObject *unicode,
6786 Py_ssize_t startpos, Py_ssize_t endpos,
6787 const char *reason)
6788{
6789 if (*exceptionObject == NULL) {
6790 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006791 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006792 encoding, unicode, startpos, endpos, reason);
6793 }
6794 else {
6795 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6796 goto onError;
6797 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6798 goto onError;
6799 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6800 goto onError;
6801 return;
6802 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006803 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006804 }
6805}
6806
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006807/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006808static void
6809raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006810 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006811 PyObject *unicode,
6812 Py_ssize_t startpos, Py_ssize_t endpos,
6813 const char *reason)
6814{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006815 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006816 encoding, unicode, startpos, endpos, reason);
6817 if (*exceptionObject != NULL)
6818 PyCodec_StrictErrors(*exceptionObject);
6819}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006820
6821/* error handling callback helper:
6822 build arguments, call the callback and check the arguments,
6823 put the result into newpos and return the replacement string, which
6824 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006825static PyObject *
6826unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006827 PyObject **errorHandler,
6828 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006829 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006830 Py_ssize_t startpos, Py_ssize_t endpos,
6831 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006832{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006833 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006834 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006835 PyObject *restuple;
6836 PyObject *resunicode;
6837
6838 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006840 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006842 }
6843
Benjamin Petersonbac79492012-01-14 13:34:47 -05006844 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006845 return NULL;
6846 len = PyUnicode_GET_LENGTH(unicode);
6847
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006848 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006849 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006850 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006852
Petr Viktorinffd97532020-02-11 17:46:57 +01006853 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006854 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006856 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006857 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 Py_DECREF(restuple);
6859 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006860 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006861 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 &resunicode, newpos)) {
6863 Py_DECREF(restuple);
6864 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006865 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006866 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6867 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6868 Py_DECREF(restuple);
6869 return NULL;
6870 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006871 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006872 *newpos = len + *newpos;
6873 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006874 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 Py_DECREF(restuple);
6876 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006877 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006878 Py_INCREF(resunicode);
6879 Py_DECREF(restuple);
6880 return resunicode;
6881}
6882
Alexander Belopolsky40018472011-02-26 01:02:56 +00006883static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006884unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006885 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006886 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006887{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006888 /* input state */
6889 Py_ssize_t pos=0, size;
6890 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006891 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006892 /* pointer into the output */
6893 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006894 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6895 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006896 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006897 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006898 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006899 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006900 /* output object */
6901 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006902
Benjamin Petersonbac79492012-01-14 13:34:47 -05006903 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006904 return NULL;
6905 size = PyUnicode_GET_LENGTH(unicode);
6906 kind = PyUnicode_KIND(unicode);
6907 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006908 /* allocate enough for a simple encoding without
6909 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006910 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006911 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006912
6913 _PyBytesWriter_Init(&writer);
6914 str = _PyBytesWriter_Alloc(&writer, size);
6915 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006916 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006917
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006918 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006919 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006920
Benjamin Peterson29060642009-01-31 22:14:21 +00006921 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006922 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006924 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006925 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006926 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006928 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006930 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006931 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006933
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006934 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006935 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006936
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006937 /* Only overallocate the buffer if it's not the last write */
6938 writer.overallocate = (collend < size);
6939
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006941 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006942 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006943
6944 switch (error_handler) {
6945 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006946 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006948
6949 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006950 memset(str, '?', collend - collstart);
6951 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006952 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006953 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006954 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006955 break;
Victor Stinner50149202015-09-22 00:26:54 +02006956
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006957 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006958 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006959 writer.min_size -= (collend - collstart);
6960 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006961 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006962 if (str == NULL)
6963 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006964 pos = collend;
6965 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006966
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006967 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006968 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006969 writer.min_size -= (collend - collstart);
6970 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006971 unicode, collstart, collend);
6972 if (str == NULL)
6973 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006974 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 break;
Victor Stinner50149202015-09-22 00:26:54 +02006976
Victor Stinnerc3713e92015-09-29 12:32:13 +02006977 case _Py_ERROR_SURROGATEESCAPE:
6978 for (i = collstart; i < collend; ++i) {
6979 ch = PyUnicode_READ(kind, data, i);
6980 if (ch < 0xdc80 || 0xdcff < ch) {
6981 /* Not a UTF-8b surrogate */
6982 break;
6983 }
6984 *str++ = (char)(ch - 0xdc00);
6985 ++pos;
6986 }
6987 if (i >= collend)
6988 break;
6989 collstart = pos;
6990 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006991 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006992
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006994 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6995 encoding, reason, unicode, &exc,
6996 collstart, collend, &newpos);
6997 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006998 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006999
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007000 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007001 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007002
Victor Stinner6bd525b2015-10-09 13:10:05 +02007003 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007004 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007005 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007006 PyBytes_AS_STRING(rep),
7007 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007008 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007009 else {
7010 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007011
Victor Stinner6bd525b2015-10-09 13:10:05 +02007012 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007014
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007015 if (limit == 256 ?
7016 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7017 !PyUnicode_IS_ASCII(rep))
7018 {
7019 /* Not all characters are smaller than limit */
7020 raise_encode_exception(&exc, encoding, unicode,
7021 collstart, collend, reason);
7022 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007024 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7025 str = _PyBytesWriter_WriteBytes(&writer, str,
7026 PyUnicode_DATA(rep),
7027 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007029 if (str == NULL)
7030 goto onError;
7031
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007032 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007033 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007034 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007035
7036 /* If overallocation was disabled, ensure that it was the last
7037 write. Otherwise, we missed an optimization */
7038 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007039 }
7040 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007041
Victor Stinner50149202015-09-22 00:26:54 +02007042 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007043 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007044 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007045
7046 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007047 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007048 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007049 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007050 Py_XDECREF(exc);
7051 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007052}
7053
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007054/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007055PyObject *
7056PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007057 Py_ssize_t size,
7058 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007060 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007061 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007062 if (unicode == NULL)
7063 return NULL;
7064 result = unicode_encode_ucs1(unicode, errors, 256);
7065 Py_DECREF(unicode);
7066 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067}
7068
Alexander Belopolsky40018472011-02-26 01:02:56 +00007069PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007070_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071{
7072 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007073 PyErr_BadArgument();
7074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007076 if (PyUnicode_READY(unicode) == -1)
7077 return NULL;
7078 /* Fast path: if it is a one-byte string, construct
7079 bytes object directly. */
7080 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7081 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7082 PyUnicode_GET_LENGTH(unicode));
7083 /* Non-Latin-1 characters present. Defer to above function to
7084 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007085 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007086}
7087
7088PyObject*
7089PyUnicode_AsLatin1String(PyObject *unicode)
7090{
7091 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092}
7093
7094/* --- 7-bit ASCII Codec -------------------------------------------------- */
7095
Alexander Belopolsky40018472011-02-26 01:02:56 +00007096PyObject *
7097PyUnicode_DecodeASCII(const char *s,
7098 Py_ssize_t size,
7099 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007101 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007102 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007103 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007104 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007105 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007106
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007108 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007109
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007111 if (size == 1 && (unsigned char)s[0] < 128)
7112 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007113
Inada Naoki770847a2019-06-24 12:30:24 +09007114 // Shortcut for simple case
7115 PyObject *u = PyUnicode_New(size, 127);
7116 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007117 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007118 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007119 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007120 if (outpos == size) {
7121 return u;
7122 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007123
Inada Naoki770847a2019-06-24 12:30:24 +09007124 _PyUnicodeWriter writer;
7125 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007126 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007127
Inada Naoki770847a2019-06-24 12:30:24 +09007128 s += outpos;
7129 int kind = writer.kind;
7130 void *data = writer.data;
7131 Py_ssize_t startinpos, endinpos;
7132
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007133 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007134 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007136 PyUnicode_WRITE(kind, data, writer.pos, c);
7137 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007139 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007140 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007141
7142 /* byte outsize range 0x00..0x7f: call the error handler */
7143
7144 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007145 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007146
7147 switch (error_handler)
7148 {
7149 case _Py_ERROR_REPLACE:
7150 case _Py_ERROR_SURROGATEESCAPE:
7151 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007152 but we may switch to UCS2 at the first write */
7153 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7154 goto onError;
7155 kind = writer.kind;
7156 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007157
7158 if (error_handler == _Py_ERROR_REPLACE)
7159 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7160 else
7161 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7162 writer.pos++;
7163 ++s;
7164 break;
7165
7166 case _Py_ERROR_IGNORE:
7167 ++s;
7168 break;
7169
7170 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007171 startinpos = s-starts;
7172 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007173 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007174 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007175 "ascii", "ordinal not in range(128)",
7176 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007177 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007179 kind = writer.kind;
7180 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007183 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007184 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007185 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007186
Benjamin Peterson29060642009-01-31 22:14:21 +00007187 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007188 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007189 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007190 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 return NULL;
7192}
7193
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007194/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007195PyObject *
7196PyUnicode_EncodeASCII(const Py_UNICODE *p,
7197 Py_ssize_t size,
7198 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007200 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007201 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007202 if (unicode == NULL)
7203 return NULL;
7204 result = unicode_encode_ucs1(unicode, errors, 128);
7205 Py_DECREF(unicode);
7206 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207}
7208
Alexander Belopolsky40018472011-02-26 01:02:56 +00007209PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007210_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211{
7212 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 PyErr_BadArgument();
7214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007216 if (PyUnicode_READY(unicode) == -1)
7217 return NULL;
7218 /* Fast path: if it is an ASCII-only string, construct bytes object
7219 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007220 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007221 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7222 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007223 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007224}
7225
7226PyObject *
7227PyUnicode_AsASCIIString(PyObject *unicode)
7228{
7229 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230}
7231
Steve Dowercc16be82016-09-08 10:35:16 -07007232#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007233
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007234/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007235
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007236#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007237#define NEED_RETRY
7238#endif
7239
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7244#define DECODING_CHUNK_SIZE (INT_MAX/4)
7245
Victor Stinner3a50e702011-10-18 21:21:00 +02007246#ifndef WC_ERR_INVALID_CHARS
7247# define WC_ERR_INVALID_CHARS 0x0080
7248#endif
7249
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007250static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007251code_page_name(UINT code_page, PyObject **obj)
7252{
7253 *obj = NULL;
7254 if (code_page == CP_ACP)
7255 return "mbcs";
7256 if (code_page == CP_UTF7)
7257 return "CP_UTF7";
7258 if (code_page == CP_UTF8)
7259 return "CP_UTF8";
7260
7261 *obj = PyBytes_FromFormat("cp%u", code_page);
7262 if (*obj == NULL)
7263 return NULL;
7264 return PyBytes_AS_STRING(*obj);
7265}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007266
Victor Stinner3a50e702011-10-18 21:21:00 +02007267static DWORD
7268decode_code_page_flags(UINT code_page)
7269{
7270 if (code_page == CP_UTF7) {
7271 /* The CP_UTF7 decoder only supports flags=0 */
7272 return 0;
7273 }
7274 else
7275 return MB_ERR_INVALID_CHARS;
7276}
7277
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007278/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007279 * Decode a byte string from a Windows code page into unicode object in strict
7280 * mode.
7281 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007282 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7283 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007284 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007285static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007286decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007287 wchar_t **buf,
7288 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007289 const char *in,
7290 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007291{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007292 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007293 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007295
7296 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007297 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007298 while ((outsize = MultiByteToWideChar(code_page, flags,
7299 in, insize, NULL, 0)) <= 0)
7300 {
7301 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7302 goto error;
7303 }
7304 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7305 flags = 0;
7306 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007307
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007308 /* Extend a wchar_t* buffer */
7309 Py_ssize_t n = *bufsize; /* Get the current length */
7310 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7311 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007312 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007313 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007314
7315 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7317 if (outsize <= 0)
7318 goto error;
7319 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007320
Victor Stinner3a50e702011-10-18 21:21:00 +02007321error:
7322 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7323 return -2;
7324 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007325 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326}
7327
Victor Stinner3a50e702011-10-18 21:21:00 +02007328/*
7329 * Decode a byte string from a code page into unicode object with an error
7330 * handler.
7331 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007332 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 * UnicodeDecodeError exception and returns -1 on error.
7334 */
7335static int
7336decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007337 wchar_t **buf,
7338 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007339 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007340 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007341{
7342 const char *startin = in;
7343 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007344 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007345 /* Ideally, we should get reason from FormatMessage. This is the Windows
7346 2000 English version of the message. */
7347 const char *reason = "No mapping for the Unicode character exists "
7348 "in the target code page.";
7349 /* each step cannot decode more than 1 character, but a character can be
7350 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007351 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007352 int insize;
7353 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007354 PyObject *errorHandler = NULL;
7355 PyObject *exc = NULL;
7356 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007357 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007358 DWORD err;
7359 int ret = -1;
7360
7361 assert(size > 0);
7362
7363 encoding = code_page_name(code_page, &encoding_obj);
7364 if (encoding == NULL)
7365 return -1;
7366
Victor Stinner7d00cc12014-03-17 23:08:06 +01007367 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007368 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7369 UnicodeDecodeError. */
7370 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7371 if (exc != NULL) {
7372 PyCodec_StrictErrors(exc);
7373 Py_CLEAR(exc);
7374 }
7375 goto error;
7376 }
7377
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007378 /* Extend a wchar_t* buffer */
7379 Py_ssize_t n = *bufsize; /* Get the current length */
7380 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7381 PyErr_NoMemory();
7382 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007383 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007384 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7385 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007386 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007387 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007388
7389 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 while (in < endin)
7391 {
7392 /* Decode a character */
7393 insize = 1;
7394 do
7395 {
7396 outsize = MultiByteToWideChar(code_page, flags,
7397 in, insize,
7398 buffer, Py_ARRAY_LENGTH(buffer));
7399 if (outsize > 0)
7400 break;
7401 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007402 if (err == ERROR_INVALID_FLAGS && flags) {
7403 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7404 flags = 0;
7405 continue;
7406 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007407 if (err != ERROR_NO_UNICODE_TRANSLATION
7408 && err != ERROR_INSUFFICIENT_BUFFER)
7409 {
7410 PyErr_SetFromWindowsErr(0);
7411 goto error;
7412 }
7413 insize++;
7414 }
7415 /* 4=maximum length of a UTF-8 sequence */
7416 while (insize <= 4 && (in + insize) <= endin);
7417
7418 if (outsize <= 0) {
7419 Py_ssize_t startinpos, endinpos, outpos;
7420
Victor Stinner7d00cc12014-03-17 23:08:06 +01007421 /* last character in partial decode? */
7422 if (in + insize >= endin && !final)
7423 break;
7424
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 startinpos = in - startin;
7426 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007427 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007428 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 errors, &errorHandler,
7430 encoding, reason,
7431 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007432 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 {
7434 goto error;
7435 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007436 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007437 }
7438 else {
7439 in += insize;
7440 memcpy(out, buffer, outsize * sizeof(wchar_t));
7441 out += outsize;
7442 }
7443 }
7444
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007445 /* Shrink the buffer */
7446 assert(out - *buf <= *bufsize);
7447 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007448 /* (in - startin) <= size and size is an int */
7449 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007450
7451error:
7452 Py_XDECREF(encoding_obj);
7453 Py_XDECREF(errorHandler);
7454 Py_XDECREF(exc);
7455 return ret;
7456}
7457
Victor Stinner3a50e702011-10-18 21:21:00 +02007458static PyObject *
7459decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007460 const char *s, Py_ssize_t size,
7461 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007462{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007463 wchar_t *buf = NULL;
7464 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007465 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007466
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 if (code_page < 0) {
7468 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7469 return NULL;
7470 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007471 if (size < 0) {
7472 PyErr_BadInternalCall();
7473 return NULL;
7474 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007475
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007476 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007478
Victor Stinner76a31a62011-11-04 00:05:13 +01007479 do
7480 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007481#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007482 if (size > DECODING_CHUNK_SIZE) {
7483 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007484 final = 0;
7485 done = 0;
7486 }
7487 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007488#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007489 {
7490 chunk_size = (int)size;
7491 final = (consumed == NULL);
7492 done = 1;
7493 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007494
Victor Stinner76a31a62011-11-04 00:05:13 +01007495 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007496 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007497 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007498 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007499 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007500
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007501 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007502 s, chunk_size);
7503 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007504 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007505 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007506 errors, final);
7507 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007508
7509 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007510 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007511 return NULL;
7512 }
7513
7514 if (consumed)
7515 *consumed += converted;
7516
7517 s += converted;
7518 size -= converted;
7519 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007520
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007521 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7522 PyMem_Free(buf);
7523 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007524}
7525
Alexander Belopolsky40018472011-02-26 01:02:56 +00007526PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007527PyUnicode_DecodeCodePageStateful(int code_page,
7528 const char *s,
7529 Py_ssize_t size,
7530 const char *errors,
7531 Py_ssize_t *consumed)
7532{
7533 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7534}
7535
7536PyObject *
7537PyUnicode_DecodeMBCSStateful(const char *s,
7538 Py_ssize_t size,
7539 const char *errors,
7540 Py_ssize_t *consumed)
7541{
7542 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7543}
7544
7545PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007546PyUnicode_DecodeMBCS(const char *s,
7547 Py_ssize_t size,
7548 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007549{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007550 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7551}
7552
Victor Stinner3a50e702011-10-18 21:21:00 +02007553static DWORD
7554encode_code_page_flags(UINT code_page, const char *errors)
7555{
7556 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007557 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 }
7559 else if (code_page == CP_UTF7) {
7560 /* CP_UTF7 only supports flags=0 */
7561 return 0;
7562 }
7563 else {
7564 if (errors != NULL && strcmp(errors, "replace") == 0)
7565 return 0;
7566 else
7567 return WC_NO_BEST_FIT_CHARS;
7568 }
7569}
7570
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007571/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007572 * Encode a Unicode string to a Windows code page into a byte string in strict
7573 * mode.
7574 *
7575 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007576 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007577 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007578static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007579encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007580 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007581 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007582{
Victor Stinner554f3f02010-06-16 23:33:54 +00007583 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 BOOL *pusedDefaultChar = &usedDefaultChar;
7585 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007586 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007587 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007588 const DWORD flags = encode_code_page_flags(code_page, NULL);
7589 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007590 /* Create a substring so that we can get the UTF-16 representation
7591 of just the slice under consideration. */
7592 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007593
Martin v. Löwis3d325192011-11-04 18:23:06 +01007594 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007595
Victor Stinner3a50e702011-10-18 21:21:00 +02007596 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007597 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007598 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007599 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007600
Victor Stinner2fc507f2011-11-04 20:06:39 +01007601 substring = PyUnicode_Substring(unicode, offset, offset+len);
7602 if (substring == NULL)
7603 return -1;
7604 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7605 if (p == NULL) {
7606 Py_DECREF(substring);
7607 return -1;
7608 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007609 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007610
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007611 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007612 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007613 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007614 NULL, 0,
7615 NULL, pusedDefaultChar);
7616 if (outsize <= 0)
7617 goto error;
7618 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007619 if (pusedDefaultChar && *pusedDefaultChar) {
7620 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007622 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007623
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007627 if (*outbytes == NULL) {
7628 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007630 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007631 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007632 }
7633 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007635 const Py_ssize_t n = PyBytes_Size(*outbytes);
7636 if (outsize > PY_SSIZE_T_MAX - n) {
7637 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007638 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007641 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7642 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007643 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007644 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007645 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007646 }
7647
7648 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007649 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007650 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007651 out, outsize,
7652 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007653 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007654 if (outsize <= 0)
7655 goto error;
7656 if (pusedDefaultChar && *pusedDefaultChar)
7657 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007658 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007659
Victor Stinner3a50e702011-10-18 21:21:00 +02007660error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007661 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007662 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7663 return -2;
7664 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007665 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007666}
7667
Victor Stinner3a50e702011-10-18 21:21:00 +02007668/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007669 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007670 * error handler.
7671 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007672 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007673 * -1 on other error.
7674 */
7675static int
7676encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007677 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007678 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007679{
Victor Stinner3a50e702011-10-18 21:21:00 +02007680 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007681 Py_ssize_t pos = unicode_offset;
7682 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007683 /* Ideally, we should get reason from FormatMessage. This is the Windows
7684 2000 English version of the message. */
7685 const char *reason = "invalid character";
7686 /* 4=maximum length of a UTF-8 sequence */
7687 char buffer[4];
7688 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7689 Py_ssize_t outsize;
7690 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007691 PyObject *errorHandler = NULL;
7692 PyObject *exc = NULL;
7693 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007694 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007695 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007696 PyObject *rep;
7697 int ret = -1;
7698
7699 assert(insize > 0);
7700
7701 encoding = code_page_name(code_page, &encoding_obj);
7702 if (encoding == NULL)
7703 return -1;
7704
7705 if (errors == NULL || strcmp(errors, "strict") == 0) {
7706 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7707 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007708 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007709 if (exc != NULL) {
7710 PyCodec_StrictErrors(exc);
7711 Py_DECREF(exc);
7712 }
7713 Py_XDECREF(encoding_obj);
7714 return -1;
7715 }
7716
7717 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7718 pusedDefaultChar = &usedDefaultChar;
7719 else
7720 pusedDefaultChar = NULL;
7721
7722 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7723 PyErr_NoMemory();
7724 goto error;
7725 }
7726 outsize = insize * Py_ARRAY_LENGTH(buffer);
7727
7728 if (*outbytes == NULL) {
7729 /* Create string object */
7730 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7731 if (*outbytes == NULL)
7732 goto error;
7733 out = PyBytes_AS_STRING(*outbytes);
7734 }
7735 else {
7736 /* Extend string object */
7737 Py_ssize_t n = PyBytes_Size(*outbytes);
7738 if (n > PY_SSIZE_T_MAX - outsize) {
7739 PyErr_NoMemory();
7740 goto error;
7741 }
7742 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7743 goto error;
7744 out = PyBytes_AS_STRING(*outbytes) + n;
7745 }
7746
7747 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007748 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007749 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007750 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7751 wchar_t chars[2];
7752 int charsize;
7753 if (ch < 0x10000) {
7754 chars[0] = (wchar_t)ch;
7755 charsize = 1;
7756 }
7757 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007758 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7759 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007760 charsize = 2;
7761 }
7762
Victor Stinner3a50e702011-10-18 21:21:00 +02007763 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007764 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007765 buffer, Py_ARRAY_LENGTH(buffer),
7766 NULL, pusedDefaultChar);
7767 if (outsize > 0) {
7768 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7769 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007770 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007771 memcpy(out, buffer, outsize);
7772 out += outsize;
7773 continue;
7774 }
7775 }
7776 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7777 PyErr_SetFromWindowsErr(0);
7778 goto error;
7779 }
7780
Victor Stinner3a50e702011-10-18 21:21:00 +02007781 rep = unicode_encode_call_errorhandler(
7782 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007783 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007784 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007785 if (rep == NULL)
7786 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007787 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007788
7789 if (PyBytes_Check(rep)) {
7790 outsize = PyBytes_GET_SIZE(rep);
7791 if (outsize != 1) {
7792 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7793 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7794 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7795 Py_DECREF(rep);
7796 goto error;
7797 }
7798 out = PyBytes_AS_STRING(*outbytes) + offset;
7799 }
7800 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7801 out += outsize;
7802 }
7803 else {
7804 Py_ssize_t i;
7805 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007806 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02007807
Benjamin Petersonbac79492012-01-14 13:34:47 -05007808 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007809 Py_DECREF(rep);
7810 goto error;
7811 }
7812
7813 outsize = PyUnicode_GET_LENGTH(rep);
7814 if (outsize != 1) {
7815 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7816 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7817 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7818 Py_DECREF(rep);
7819 goto error;
7820 }
7821 out = PyBytes_AS_STRING(*outbytes) + offset;
7822 }
7823 kind = PyUnicode_KIND(rep);
7824 data = PyUnicode_DATA(rep);
7825 for (i=0; i < outsize; i++) {
7826 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7827 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007828 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007829 encoding, unicode,
7830 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007831 "unable to encode error handler result to ASCII");
7832 Py_DECREF(rep);
7833 goto error;
7834 }
7835 *out = (unsigned char)ch;
7836 out++;
7837 }
7838 }
7839 Py_DECREF(rep);
7840 }
7841 /* write a NUL byte */
7842 *out = 0;
7843 outsize = out - PyBytes_AS_STRING(*outbytes);
7844 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7845 if (_PyBytes_Resize(outbytes, outsize) < 0)
7846 goto error;
7847 ret = 0;
7848
7849error:
7850 Py_XDECREF(encoding_obj);
7851 Py_XDECREF(errorHandler);
7852 Py_XDECREF(exc);
7853 return ret;
7854}
7855
Victor Stinner3a50e702011-10-18 21:21:00 +02007856static PyObject *
7857encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007858 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007859 const char *errors)
7860{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007861 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007862 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007863 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007864 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007865
Victor Stinner29dacf22015-01-26 16:41:32 +01007866 if (!PyUnicode_Check(unicode)) {
7867 PyErr_BadArgument();
7868 return NULL;
7869 }
7870
Benjamin Petersonbac79492012-01-14 13:34:47 -05007871 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007872 return NULL;
7873 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007874
Victor Stinner3a50e702011-10-18 21:21:00 +02007875 if (code_page < 0) {
7876 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7877 return NULL;
7878 }
7879
Martin v. Löwis3d325192011-11-04 18:23:06 +01007880 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007881 return PyBytes_FromStringAndSize(NULL, 0);
7882
Victor Stinner7581cef2011-11-03 22:32:33 +01007883 offset = 0;
7884 do
7885 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007886#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007887 if (len > DECODING_CHUNK_SIZE) {
7888 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007889 done = 0;
7890 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007891 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007892#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007893 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007894 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007895 done = 1;
7896 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007897
Victor Stinner76a31a62011-11-04 00:05:13 +01007898 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007899 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007900 errors);
7901 if (ret == -2)
7902 ret = encode_code_page_errors(code_page, &outbytes,
7903 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007904 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007905 if (ret < 0) {
7906 Py_XDECREF(outbytes);
7907 return NULL;
7908 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007909
Victor Stinner7581cef2011-11-03 22:32:33 +01007910 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007911 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007912 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007913
Victor Stinner3a50e702011-10-18 21:21:00 +02007914 return outbytes;
7915}
7916
7917PyObject *
7918PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7919 Py_ssize_t size,
7920 const char *errors)
7921{
Victor Stinner7581cef2011-11-03 22:32:33 +01007922 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007923 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007924 if (unicode == NULL)
7925 return NULL;
7926 res = encode_code_page(CP_ACP, unicode, errors);
7927 Py_DECREF(unicode);
7928 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007929}
7930
7931PyObject *
7932PyUnicode_EncodeCodePage(int code_page,
7933 PyObject *unicode,
7934 const char *errors)
7935{
Victor Stinner7581cef2011-11-03 22:32:33 +01007936 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007937}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007938
Alexander Belopolsky40018472011-02-26 01:02:56 +00007939PyObject *
7940PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007941{
Victor Stinner7581cef2011-11-03 22:32:33 +01007942 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007943}
7944
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007945#undef NEED_RETRY
7946
Steve Dowercc16be82016-09-08 10:35:16 -07007947#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007948
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949/* --- Character Mapping Codec -------------------------------------------- */
7950
Victor Stinnerfb161b12013-04-18 01:44:27 +02007951static int
7952charmap_decode_string(const char *s,
7953 Py_ssize_t size,
7954 PyObject *mapping,
7955 const char *errors,
7956 _PyUnicodeWriter *writer)
7957{
7958 const char *starts = s;
7959 const char *e;
7960 Py_ssize_t startinpos, endinpos;
7961 PyObject *errorHandler = NULL, *exc = NULL;
7962 Py_ssize_t maplen;
7963 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007964 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007965 Py_UCS4 x;
7966 unsigned char ch;
7967
7968 if (PyUnicode_READY(mapping) == -1)
7969 return -1;
7970
7971 maplen = PyUnicode_GET_LENGTH(mapping);
7972 mapdata = PyUnicode_DATA(mapping);
7973 mapkind = PyUnicode_KIND(mapping);
7974
7975 e = s + size;
7976
7977 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7978 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7979 * is disabled in encoding aliases, latin1 is preferred because
7980 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007981 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007982 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7983 Py_UCS4 maxchar = writer->maxchar;
7984
7985 assert (writer->kind == PyUnicode_1BYTE_KIND);
7986 while (s < e) {
7987 ch = *s;
7988 x = mapdata_ucs1[ch];
7989 if (x > maxchar) {
7990 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7991 goto onError;
7992 maxchar = writer->maxchar;
7993 outdata = (Py_UCS1 *)writer->data;
7994 }
7995 outdata[writer->pos] = x;
7996 writer->pos++;
7997 ++s;
7998 }
7999 return 0;
8000 }
8001
8002 while (s < e) {
8003 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8004 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008005 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008006 if (outkind == PyUnicode_1BYTE_KIND) {
8007 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8008 Py_UCS4 maxchar = writer->maxchar;
8009 while (s < e) {
8010 ch = *s;
8011 x = mapdata_ucs2[ch];
8012 if (x > maxchar)
8013 goto Error;
8014 outdata[writer->pos] = x;
8015 writer->pos++;
8016 ++s;
8017 }
8018 break;
8019 }
8020 else if (outkind == PyUnicode_2BYTE_KIND) {
8021 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8022 while (s < e) {
8023 ch = *s;
8024 x = mapdata_ucs2[ch];
8025 if (x == 0xFFFE)
8026 goto Error;
8027 outdata[writer->pos] = x;
8028 writer->pos++;
8029 ++s;
8030 }
8031 break;
8032 }
8033 }
8034 ch = *s;
8035
8036 if (ch < maplen)
8037 x = PyUnicode_READ(mapkind, mapdata, ch);
8038 else
8039 x = 0xfffe; /* invalid value */
8040Error:
8041 if (x == 0xfffe)
8042 {
8043 /* undefined mapping */
8044 startinpos = s-starts;
8045 endinpos = startinpos+1;
8046 if (unicode_decode_call_errorhandler_writer(
8047 errors, &errorHandler,
8048 "charmap", "character maps to <undefined>",
8049 &starts, &e, &startinpos, &endinpos, &exc, &s,
8050 writer)) {
8051 goto onError;
8052 }
8053 continue;
8054 }
8055
8056 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8057 goto onError;
8058 ++s;
8059 }
8060 Py_XDECREF(errorHandler);
8061 Py_XDECREF(exc);
8062 return 0;
8063
8064onError:
8065 Py_XDECREF(errorHandler);
8066 Py_XDECREF(exc);
8067 return -1;
8068}
8069
8070static int
8071charmap_decode_mapping(const char *s,
8072 Py_ssize_t size,
8073 PyObject *mapping,
8074 const char *errors,
8075 _PyUnicodeWriter *writer)
8076{
8077 const char *starts = s;
8078 const char *e;
8079 Py_ssize_t startinpos, endinpos;
8080 PyObject *errorHandler = NULL, *exc = NULL;
8081 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008082 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008083
8084 e = s + size;
8085
8086 while (s < e) {
8087 ch = *s;
8088
8089 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8090 key = PyLong_FromLong((long)ch);
8091 if (key == NULL)
8092 goto onError;
8093
8094 item = PyObject_GetItem(mapping, key);
8095 Py_DECREF(key);
8096 if (item == NULL) {
8097 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8098 /* No mapping found means: mapping is undefined. */
8099 PyErr_Clear();
8100 goto Undefined;
8101 } else
8102 goto onError;
8103 }
8104
8105 /* Apply mapping */
8106 if (item == Py_None)
8107 goto Undefined;
8108 if (PyLong_Check(item)) {
8109 long value = PyLong_AS_LONG(item);
8110 if (value == 0xFFFE)
8111 goto Undefined;
8112 if (value < 0 || value > MAX_UNICODE) {
8113 PyErr_Format(PyExc_TypeError,
8114 "character mapping must be in range(0x%lx)",
8115 (unsigned long)MAX_UNICODE + 1);
8116 goto onError;
8117 }
8118
8119 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8120 goto onError;
8121 }
8122 else if (PyUnicode_Check(item)) {
8123 if (PyUnicode_READY(item) == -1)
8124 goto onError;
8125 if (PyUnicode_GET_LENGTH(item) == 1) {
8126 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8127 if (value == 0xFFFE)
8128 goto Undefined;
8129 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8130 goto onError;
8131 }
8132 else {
8133 writer->overallocate = 1;
8134 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8135 goto onError;
8136 }
8137 }
8138 else {
8139 /* wrong return value */
8140 PyErr_SetString(PyExc_TypeError,
8141 "character mapping must return integer, None or str");
8142 goto onError;
8143 }
8144 Py_CLEAR(item);
8145 ++s;
8146 continue;
8147
8148Undefined:
8149 /* undefined mapping */
8150 Py_CLEAR(item);
8151 startinpos = s-starts;
8152 endinpos = startinpos+1;
8153 if (unicode_decode_call_errorhandler_writer(
8154 errors, &errorHandler,
8155 "charmap", "character maps to <undefined>",
8156 &starts, &e, &startinpos, &endinpos, &exc, &s,
8157 writer)) {
8158 goto onError;
8159 }
8160 }
8161 Py_XDECREF(errorHandler);
8162 Py_XDECREF(exc);
8163 return 0;
8164
8165onError:
8166 Py_XDECREF(item);
8167 Py_XDECREF(errorHandler);
8168 Py_XDECREF(exc);
8169 return -1;
8170}
8171
Alexander Belopolsky40018472011-02-26 01:02:56 +00008172PyObject *
8173PyUnicode_DecodeCharmap(const char *s,
8174 Py_ssize_t size,
8175 PyObject *mapping,
8176 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008178 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008179
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180 /* Default to Latin-1 */
8181 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183
Guido van Rossumd57fd912000-03-10 22:53:23 +00008184 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008185 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008186 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008187 writer.min_length = size;
8188 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008190
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008191 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008192 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8193 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008194 }
8195 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008196 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8197 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008199 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008200
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008202 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203 return NULL;
8204}
8205
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008206/* Charmap encoding: the lookup table */
8207
Alexander Belopolsky40018472011-02-26 01:02:56 +00008208struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 PyObject_HEAD
8210 unsigned char level1[32];
8211 int count2, count3;
8212 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008213};
8214
8215static PyObject*
8216encoding_map_size(PyObject *obj, PyObject* args)
8217{
8218 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008219 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221}
8222
8223static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008224 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 PyDoc_STR("Return the size (in bytes) of this object") },
8226 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008227};
8228
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008229static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008230 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 "EncodingMap", /*tp_name*/
8232 sizeof(struct encoding_map), /*tp_basicsize*/
8233 0, /*tp_itemsize*/
8234 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008235 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008236 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 0, /*tp_getattr*/
8238 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008239 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 0, /*tp_repr*/
8241 0, /*tp_as_number*/
8242 0, /*tp_as_sequence*/
8243 0, /*tp_as_mapping*/
8244 0, /*tp_hash*/
8245 0, /*tp_call*/
8246 0, /*tp_str*/
8247 0, /*tp_getattro*/
8248 0, /*tp_setattro*/
8249 0, /*tp_as_buffer*/
8250 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8251 0, /*tp_doc*/
8252 0, /*tp_traverse*/
8253 0, /*tp_clear*/
8254 0, /*tp_richcompare*/
8255 0, /*tp_weaklistoffset*/
8256 0, /*tp_iter*/
8257 0, /*tp_iternext*/
8258 encoding_map_methods, /*tp_methods*/
8259 0, /*tp_members*/
8260 0, /*tp_getset*/
8261 0, /*tp_base*/
8262 0, /*tp_dict*/
8263 0, /*tp_descr_get*/
8264 0, /*tp_descr_set*/
8265 0, /*tp_dictoffset*/
8266 0, /*tp_init*/
8267 0, /*tp_alloc*/
8268 0, /*tp_new*/
8269 0, /*tp_free*/
8270 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008271};
8272
8273PyObject*
8274PyUnicode_BuildEncodingMap(PyObject* string)
8275{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008276 PyObject *result;
8277 struct encoding_map *mresult;
8278 int i;
8279 int need_dict = 0;
8280 unsigned char level1[32];
8281 unsigned char level2[512];
8282 unsigned char *mlevel1, *mlevel2, *mlevel3;
8283 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008284 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008285 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008286 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008287 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008288
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008289 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008290 PyErr_BadArgument();
8291 return NULL;
8292 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008293 kind = PyUnicode_KIND(string);
8294 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008295 length = PyUnicode_GET_LENGTH(string);
8296 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008297 memset(level1, 0xFF, sizeof level1);
8298 memset(level2, 0xFF, sizeof level2);
8299
8300 /* If there isn't a one-to-one mapping of NULL to \0,
8301 or if there are non-BMP characters, we need to use
8302 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008303 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008304 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008305 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008306 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008307 ch = PyUnicode_READ(kind, data, i);
8308 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309 need_dict = 1;
8310 break;
8311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008313 /* unmapped character */
8314 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008315 l1 = ch >> 11;
8316 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317 if (level1[l1] == 0xFF)
8318 level1[l1] = count2++;
8319 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008320 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008321 }
8322
8323 if (count2 >= 0xFF || count3 >= 0xFF)
8324 need_dict = 1;
8325
8326 if (need_dict) {
8327 PyObject *result = PyDict_New();
8328 PyObject *key, *value;
8329 if (!result)
8330 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008331 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008333 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008334 if (!key || !value)
8335 goto failed1;
8336 if (PyDict_SetItem(result, key, value) == -1)
8337 goto failed1;
8338 Py_DECREF(key);
8339 Py_DECREF(value);
8340 }
8341 return result;
8342 failed1:
8343 Py_XDECREF(key);
8344 Py_XDECREF(value);
8345 Py_DECREF(result);
8346 return NULL;
8347 }
8348
8349 /* Create a three-level trie */
8350 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8351 16*count2 + 128*count3 - 1);
8352 if (!result)
8353 return PyErr_NoMemory();
8354 PyObject_Init(result, &EncodingMapType);
8355 mresult = (struct encoding_map*)result;
8356 mresult->count2 = count2;
8357 mresult->count3 = count3;
8358 mlevel1 = mresult->level1;
8359 mlevel2 = mresult->level23;
8360 mlevel3 = mresult->level23 + 16*count2;
8361 memcpy(mlevel1, level1, 32);
8362 memset(mlevel2, 0xFF, 16*count2);
8363 memset(mlevel3, 0, 128*count3);
8364 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008365 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008366 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008367 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8368 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008369 /* unmapped character */
8370 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008371 o1 = ch>>11;
8372 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008373 i2 = 16*mlevel1[o1] + o2;
8374 if (mlevel2[i2] == 0xFF)
8375 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008376 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008377 i3 = 128*mlevel2[i2] + o3;
8378 mlevel3[i3] = i;
8379 }
8380 return result;
8381}
8382
8383static int
Victor Stinner22168992011-11-20 17:09:18 +01008384encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008385{
8386 struct encoding_map *map = (struct encoding_map*)mapping;
8387 int l1 = c>>11;
8388 int l2 = (c>>7) & 0xF;
8389 int l3 = c & 0x7F;
8390 int i;
8391
Victor Stinner22168992011-11-20 17:09:18 +01008392 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008394 if (c == 0)
8395 return 0;
8396 /* level 1*/
8397 i = map->level1[l1];
8398 if (i == 0xFF) {
8399 return -1;
8400 }
8401 /* level 2*/
8402 i = map->level23[16*i+l2];
8403 if (i == 0xFF) {
8404 return -1;
8405 }
8406 /* level 3 */
8407 i = map->level23[16*map->count2 + 128*i + l3];
8408 if (i == 0) {
8409 return -1;
8410 }
8411 return i;
8412}
8413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414/* Lookup the character ch in the mapping. If the character
8415 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008416 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008417static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008418charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419{
Christian Heimes217cfd12007-12-02 14:31:20 +00008420 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 PyObject *x;
8422
8423 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425 x = PyObject_GetItem(mapping, w);
8426 Py_DECREF(w);
8427 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8429 /* No mapping found means: mapping is undefined. */
8430 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008431 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 } else
8433 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008435 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008437 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 long value = PyLong_AS_LONG(x);
8439 if (value < 0 || value > 255) {
8440 PyErr_SetString(PyExc_TypeError,
8441 "character mapping must be in range(256)");
8442 Py_DECREF(x);
8443 return NULL;
8444 }
8445 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008447 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 /* wrong return value */
8451 PyErr_Format(PyExc_TypeError,
8452 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008453 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 Py_DECREF(x);
8455 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456 }
8457}
8458
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008459static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008460charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008461{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8463 /* exponentially overallocate to minimize reallocations */
8464 if (requiredsize < 2*outsize)
8465 requiredsize = 2*outsize;
8466 if (_PyBytes_Resize(outobj, requiredsize))
8467 return -1;
8468 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008469}
8470
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an error occurred; an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008474/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008475 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476 space is available. Return a new reference to the object that
8477 was put in the output buffer, or Py_None, if the mapping was undefined
8478 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008479 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008480static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008481charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008482 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008483{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008484 PyObject *rep;
8485 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008486 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008487
Andy Lesterdffe4c02020-03-04 07:15:20 -06008488 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008489 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008491 if (res == -1)
8492 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 if (outsize<requiredsize)
8494 if (charmapencode_resize(outobj, outpos, requiredsize))
8495 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008496 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 outstart[(*outpos)++] = (char)res;
8498 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008499 }
8500
8501 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008502 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008504 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 Py_DECREF(rep);
8506 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008507 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 if (PyLong_Check(rep)) {
8509 Py_ssize_t requiredsize = *outpos+1;
8510 if (outsize<requiredsize)
8511 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8512 Py_DECREF(rep);
8513 return enc_EXCEPTION;
8514 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008515 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008517 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 else {
8519 const char *repchars = PyBytes_AS_STRING(rep);
8520 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8521 Py_ssize_t requiredsize = *outpos+repsize;
8522 if (outsize<requiredsize)
8523 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8524 Py_DECREF(rep);
8525 return enc_EXCEPTION;
8526 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008527 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 memcpy(outstart + *outpos, repchars, repsize);
8529 *outpos += repsize;
8530 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008532 Py_DECREF(rep);
8533 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534}
8535
8536/* handle an error in PyUnicode_EncodeCharmap
8537 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008538static int
8539charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008540 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008542 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008543 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544{
8545 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008546 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008547 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008548 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008549 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008550 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008552 Py_ssize_t collstartpos = *inpos;
8553 Py_ssize_t collendpos = *inpos+1;
8554 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008555 const char *encoding = "charmap";
8556 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008557 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008558 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008559 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560
Benjamin Petersonbac79492012-01-14 13:34:47 -05008561 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008562 return -1;
8563 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 /* find all unencodable characters */
8565 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008566 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008567 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008568 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008569 val = encoding_map_lookup(ch, mapping);
8570 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 break;
8572 ++collendpos;
8573 continue;
8574 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008575
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008576 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8577 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 if (rep==NULL)
8579 return -1;
8580 else if (rep!=Py_None) {
8581 Py_DECREF(rep);
8582 break;
8583 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008584 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008586 }
8587 /* cache callback name lookup
8588 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008589 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008590 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008591
8592 switch (*error_handler) {
8593 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008594 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008595 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008596
8597 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008598 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 x = charmapencode_output('?', mapping, res, respos);
8600 if (x==enc_EXCEPTION) {
8601 return -1;
8602 }
8603 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008604 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 return -1;
8606 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008607 }
8608 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008609 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008610 *inpos = collendpos;
8611 break;
Victor Stinner50149202015-09-22 00:26:54 +02008612
8613 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008614 /* generate replacement (temporarily (mis)uses p) */
8615 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 char buffer[2+29+1+1];
8617 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008618 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 for (cp = buffer; *cp; ++cp) {
8620 x = charmapencode_output(*cp, mapping, res, respos);
8621 if (x==enc_EXCEPTION)
8622 return -1;
8623 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008624 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 return -1;
8626 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008627 }
8628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008629 *inpos = collendpos;
8630 break;
Victor Stinner50149202015-09-22 00:26:54 +02008631
Benjamin Peterson14339b62009-01-31 16:36:08 +00008632 default:
Victor Stinner50149202015-09-22 00:26:54 +02008633 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008634 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008636 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008638 if (PyBytes_Check(repunicode)) {
8639 /* Directly copy bytes result to output. */
8640 Py_ssize_t outsize = PyBytes_Size(*res);
8641 Py_ssize_t requiredsize;
8642 repsize = PyBytes_Size(repunicode);
8643 requiredsize = *respos + repsize;
8644 if (requiredsize > outsize)
8645 /* Make room for all additional bytes. */
8646 if (charmapencode_resize(res, respos, requiredsize)) {
8647 Py_DECREF(repunicode);
8648 return -1;
8649 }
8650 memcpy(PyBytes_AsString(*res) + *respos,
8651 PyBytes_AsString(repunicode), repsize);
8652 *respos += repsize;
8653 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008654 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008655 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008656 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008657 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008658 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008659 Py_DECREF(repunicode);
8660 return -1;
8661 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008662 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008663 data = PyUnicode_DATA(repunicode);
8664 kind = PyUnicode_KIND(repunicode);
8665 for (index = 0; index < repsize; index++) {
8666 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8667 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008669 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 return -1;
8671 }
8672 else if (x==enc_FAILED) {
8673 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008674 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 return -1;
8676 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008677 }
8678 *inpos = newpos;
8679 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 }
8681 return 0;
8682}
8683
Alexander Belopolsky40018472011-02-26 01:02:56 +00008684PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008685_PyUnicode_EncodeCharmap(PyObject *unicode,
8686 PyObject *mapping,
8687 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689 /* output object */
8690 PyObject *res = NULL;
8691 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008692 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008693 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008695 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008696 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008698 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008699 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008700 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701
Benjamin Petersonbac79492012-01-14 13:34:47 -05008702 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008703 return NULL;
8704 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008705 data = PyUnicode_DATA(unicode);
8706 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008707
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 /* Default to Latin-1 */
8709 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008710 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712 /* allocate enough for a simple encoding without
8713 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008714 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715 if (res == NULL)
8716 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008717 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008720 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008721 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008723 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 if (x==enc_EXCEPTION) /* error */
8725 goto onError;
8726 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008727 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008729 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 &res, &respos)) {
8731 goto onError;
8732 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008733 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 else
8735 /* done with this character => adjust input position */
8736 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008739 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008740 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008741 if (_PyBytes_Resize(&res, respos) < 0)
8742 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008743
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008744 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008745 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746 return res;
8747
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008749 Py_XDECREF(res);
8750 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008751 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 return NULL;
8753}
8754
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008755/* Deprecated */
8756PyObject *
8757PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8758 Py_ssize_t size,
8759 PyObject *mapping,
8760 const char *errors)
8761{
8762 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008763 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008764 if (unicode == NULL)
8765 return NULL;
8766 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8767 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008768 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008769}
8770
Alexander Belopolsky40018472011-02-26 01:02:56 +00008771PyObject *
8772PyUnicode_AsCharmapString(PyObject *unicode,
8773 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774{
8775 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 PyErr_BadArgument();
8777 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008779 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780}
8781
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008782/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008783static void
8784make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008785 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008786 Py_ssize_t startpos, Py_ssize_t endpos,
8787 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008789 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008790 *exceptionObject = _PyUnicodeTranslateError_Create(
8791 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792 }
8793 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008794 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8795 goto onError;
8796 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8797 goto onError;
8798 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8799 goto onError;
8800 return;
8801 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008802 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803 }
8804}
8805
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008806/* error handling callback helper:
8807 build arguments, call the callback and check the arguments,
8808 put the result into newpos and return the replacement string, which
8809 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008810static PyObject *
8811unicode_translate_call_errorhandler(const char *errors,
8812 PyObject **errorHandler,
8813 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008815 Py_ssize_t startpos, Py_ssize_t endpos,
8816 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008817{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008818 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008819
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008820 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008821 PyObject *restuple;
8822 PyObject *resunicode;
8823
8824 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008826 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008828 }
8829
8830 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008832 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008834
Petr Viktorinffd97532020-02-11 17:46:57 +01008835 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008836 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008838 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008839 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 Py_DECREF(restuple);
8841 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008842 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008843 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 &resunicode, &i_newpos)) {
8845 Py_DECREF(restuple);
8846 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008847 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008848 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008850 else
8851 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008853 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 Py_DECREF(restuple);
8855 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008856 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008857 Py_INCREF(resunicode);
8858 Py_DECREF(restuple);
8859 return resunicode;
8860}
8861
8862/* Lookup the character ch in the mapping and put the result in result,
8863 which must be decrefed by the caller.
8864 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008865static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008867{
Christian Heimes217cfd12007-12-02 14:31:20 +00008868 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008869 PyObject *x;
8870
8871 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008872 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008873 x = PyObject_GetItem(mapping, w);
8874 Py_DECREF(w);
8875 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8877 /* No mapping found means: use 1:1 mapping. */
8878 PyErr_Clear();
8879 *result = NULL;
8880 return 0;
8881 } else
8882 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008883 }
8884 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 *result = x;
8886 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008887 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008888 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008890 if (value < 0 || value > MAX_UNICODE) {
8891 PyErr_Format(PyExc_ValueError,
8892 "character mapping must be in range(0x%x)",
8893 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 Py_DECREF(x);
8895 return -1;
8896 }
8897 *result = x;
8898 return 0;
8899 }
8900 else if (PyUnicode_Check(x)) {
8901 *result = x;
8902 return 0;
8903 }
8904 else {
8905 /* wrong return value */
8906 PyErr_SetString(PyExc_TypeError,
8907 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008908 Py_DECREF(x);
8909 return -1;
8910 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008911}
Victor Stinner1194ea02014-04-04 19:37:40 +02008912
8913/* lookup the character, write the result into the writer.
8914 Return 1 if the result was written into the writer, return 0 if the mapping
8915 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008916static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008917charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8918 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008919{
Victor Stinner1194ea02014-04-04 19:37:40 +02008920 PyObject *item;
8921
8922 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008924
8925 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008927 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008930 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008931 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008932
8933 if (item == Py_None) {
8934 Py_DECREF(item);
8935 return 0;
8936 }
8937
8938 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008939 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8940 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8941 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008942 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8943 Py_DECREF(item);
8944 return -1;
8945 }
8946 Py_DECREF(item);
8947 return 1;
8948 }
8949
8950 if (!PyUnicode_Check(item)) {
8951 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008952 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008953 }
8954
8955 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8956 Py_DECREF(item);
8957 return -1;
8958 }
8959
8960 Py_DECREF(item);
8961 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008962}
8963
Victor Stinner89a76ab2014-04-05 11:44:04 +02008964static int
8965unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8966 Py_UCS1 *translate)
8967{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008968 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008969 int ret = 0;
8970
Victor Stinner89a76ab2014-04-05 11:44:04 +02008971 if (charmaptranslate_lookup(ch, mapping, &item)) {
8972 return -1;
8973 }
8974
8975 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008976 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008977 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008978 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008979 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008980 /* not found => default to 1:1 mapping */
8981 translate[ch] = ch;
8982 return 1;
8983 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008984 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008985 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008986 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8987 used it */
8988 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008989 /* invalid character or character outside ASCII:
8990 skip the fast translate */
8991 goto exit;
8992 }
8993 translate[ch] = (Py_UCS1)replace;
8994 }
8995 else if (PyUnicode_Check(item)) {
8996 Py_UCS4 replace;
8997
8998 if (PyUnicode_READY(item) == -1) {
8999 Py_DECREF(item);
9000 return -1;
9001 }
9002 if (PyUnicode_GET_LENGTH(item) != 1)
9003 goto exit;
9004
9005 replace = PyUnicode_READ_CHAR(item, 0);
9006 if (replace > 127)
9007 goto exit;
9008 translate[ch] = (Py_UCS1)replace;
9009 }
9010 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009011 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009012 goto exit;
9013 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009014 ret = 1;
9015
Benjamin Peterson1365de72014-04-07 20:15:41 -04009016 exit:
9017 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009018 return ret;
9019}
9020
9021/* Fast path for ascii => ascii translation. Return 1 if the whole string
9022 was translated into writer, return 0 if the input string was partially
9023 translated into writer, raise an exception and return -1 on error. */
9024static int
9025unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009026 _PyUnicodeWriter *writer, int ignore,
9027 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009028{
Victor Stinner872b2912014-04-05 14:27:07 +02009029 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009030 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009031 const Py_UCS1 *in, *end;
9032 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009033 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009034
Victor Stinner89a76ab2014-04-05 11:44:04 +02009035 len = PyUnicode_GET_LENGTH(input);
9036
Victor Stinner872b2912014-04-05 14:27:07 +02009037 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009038
9039 in = PyUnicode_1BYTE_DATA(input);
9040 end = in + len;
9041
9042 assert(PyUnicode_IS_ASCII(writer->buffer));
9043 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9044 out = PyUnicode_1BYTE_DATA(writer->buffer);
9045
Victor Stinner872b2912014-04-05 14:27:07 +02009046 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009047 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009048 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009049 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009050 int translate = unicode_fast_translate_lookup(mapping, ch,
9051 ascii_table);
9052 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009053 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009054 if (translate == 0)
9055 goto exit;
9056 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009057 }
Victor Stinner872b2912014-04-05 14:27:07 +02009058 if (ch2 == 0xfe) {
9059 if (ignore)
9060 continue;
9061 goto exit;
9062 }
9063 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009064 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009065 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009066 }
Victor Stinner872b2912014-04-05 14:27:07 +02009067 res = 1;
9068
9069exit:
9070 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009071 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009072 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009073}
9074
Victor Stinner3222da22015-10-01 22:07:32 +02009075static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076_PyUnicode_TranslateCharmap(PyObject *input,
9077 PyObject *mapping,
9078 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009081 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 Py_ssize_t size, i;
9083 int kind;
9084 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009085 _PyUnicodeWriter writer;
9086 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009087 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009088 PyObject *errorHandler = NULL;
9089 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009090 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009091 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009092
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 PyErr_BadArgument();
9095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 if (PyUnicode_READY(input) == -1)
9099 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009100 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 kind = PyUnicode_KIND(input);
9102 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009103
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009104 if (size == 0)
9105 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009106
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009107 /* allocate enough for a simple 1:1 translation without
9108 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009109 _PyUnicodeWriter_Init(&writer);
9110 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112
Victor Stinner872b2912014-04-05 14:27:07 +02009113 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9114
Victor Stinner33798672016-03-01 21:59:58 +01009115 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009116 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009117 if (PyUnicode_IS_ASCII(input)) {
9118 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9119 if (res < 0) {
9120 _PyUnicodeWriter_Dealloc(&writer);
9121 return NULL;
9122 }
9123 if (res == 1)
9124 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009125 }
Victor Stinner33798672016-03-01 21:59:58 +01009126 else {
9127 i = 0;
9128 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009132 int translate;
9133 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9134 Py_ssize_t newpos;
9135 /* startpos for collecting untranslatable chars */
9136 Py_ssize_t collstart;
9137 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009138 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139
Victor Stinner1194ea02014-04-04 19:37:40 +02009140 ch = PyUnicode_READ(kind, data, i);
9141 translate = charmaptranslate_output(ch, mapping, &writer);
9142 if (translate < 0)
9143 goto onError;
9144
9145 if (translate != 0) {
9146 /* it worked => adjust input pointer */
9147 ++i;
9148 continue;
9149 }
9150
9151 /* untranslatable character */
9152 collstart = i;
9153 collend = i+1;
9154
9155 /* find all untranslatable characters */
9156 while (collend < size) {
9157 PyObject *x;
9158 ch = PyUnicode_READ(kind, data, collend);
9159 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009160 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009161 Py_XDECREF(x);
9162 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009164 ++collend;
9165 }
9166
9167 if (ignore) {
9168 i = collend;
9169 }
9170 else {
9171 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9172 reason, input, &exc,
9173 collstart, collend, &newpos);
9174 if (repunicode == NULL)
9175 goto onError;
9176 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009178 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009179 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009180 Py_DECREF(repunicode);
9181 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009182 }
9183 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009184 Py_XDECREF(exc);
9185 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009186 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009187
Benjamin Peterson29060642009-01-31 22:14:21 +00009188 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009189 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009190 Py_XDECREF(exc);
9191 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192 return NULL;
9193}
9194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195/* Deprecated. Use PyUnicode_Translate instead. */
9196PyObject *
9197PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9198 Py_ssize_t size,
9199 PyObject *mapping,
9200 const char *errors)
9201{
Christian Heimes5f520f42012-09-11 14:03:25 +02009202 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009203 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009204 if (!unicode)
9205 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009206 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9207 Py_DECREF(unicode);
9208 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009209}
9210
Alexander Belopolsky40018472011-02-26 01:02:56 +00009211PyObject *
9212PyUnicode_Translate(PyObject *str,
9213 PyObject *mapping,
9214 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009216 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009217 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009218 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219}
Tim Petersced69f82003-09-16 20:30:58 +00009220
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221PyObject *
9222_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9223{
9224 if (!PyUnicode_Check(unicode)) {
9225 PyErr_BadInternalCall();
9226 return NULL;
9227 }
9228 if (PyUnicode_READY(unicode) == -1)
9229 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009230 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231 /* If the string is already ASCII, just return the same string */
9232 Py_INCREF(unicode);
9233 return unicode;
9234 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009235
9236 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9237 PyObject *result = PyUnicode_New(len, 127);
9238 if (result == NULL) {
9239 return NULL;
9240 }
9241
9242 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9243 int kind = PyUnicode_KIND(unicode);
9244 const void *data = PyUnicode_DATA(unicode);
9245 Py_ssize_t i;
9246 for (i = 0; i < len; ++i) {
9247 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9248 if (ch < 127) {
9249 out[i] = ch;
9250 }
9251 else if (Py_UNICODE_ISSPACE(ch)) {
9252 out[i] = ' ';
9253 }
9254 else {
9255 int decimal = Py_UNICODE_TODECIMAL(ch);
9256 if (decimal < 0) {
9257 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009258 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009259 _PyUnicode_LENGTH(result) = i + 1;
9260 break;
9261 }
9262 out[i] = '0' + decimal;
9263 }
9264 }
9265
INADA Naoki16dfca42018-07-14 12:06:43 +09009266 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009267 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268}
9269
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009270PyObject *
9271PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9272 Py_ssize_t length)
9273{
Victor Stinnerf0124502011-11-21 23:12:56 +01009274 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009275 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009276 Py_UCS4 maxchar;
9277 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009278 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009279
Victor Stinner99d7ad02012-02-22 13:37:39 +01009280 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009281 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009282 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009283 if (ch > 127) {
9284 int decimal = Py_UNICODE_TODECIMAL(ch);
9285 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009286 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009287 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009288 }
9289 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009290
9291 /* Copy to a new string */
9292 decimal = PyUnicode_New(length, maxchar);
9293 if (decimal == NULL)
9294 return decimal;
9295 kind = PyUnicode_KIND(decimal);
9296 data = PyUnicode_DATA(decimal);
9297 /* Iterate over code points */
9298 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009299 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009300 if (ch > 127) {
9301 int decimal = Py_UNICODE_TODECIMAL(ch);
9302 if (decimal >= 0)
9303 ch = '0' + decimal;
9304 }
9305 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009307 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009308}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009309/* --- Decimal Encoder ---------------------------------------------------- */
9310
Alexander Belopolsky40018472011-02-26 01:02:56 +00009311int
9312PyUnicode_EncodeDecimal(Py_UNICODE *s,
9313 Py_ssize_t length,
9314 char *output,
9315 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009316{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009317 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009318 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009319 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009320 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009321
9322 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009323 PyErr_BadArgument();
9324 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009325 }
9326
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009327 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009328 if (unicode == NULL)
9329 return -1;
9330
Victor Stinner42bf7752011-11-21 22:52:58 +01009331 kind = PyUnicode_KIND(unicode);
9332 data = PyUnicode_DATA(unicode);
9333
Victor Stinnerb84d7232011-11-22 01:50:07 +01009334 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009335 PyObject *exc;
9336 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009337 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009338 Py_ssize_t startpos;
9339
9340 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009341
Benjamin Peterson29060642009-01-31 22:14:21 +00009342 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009343 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009344 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009345 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009346 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009347 decimal = Py_UNICODE_TODECIMAL(ch);
9348 if (decimal >= 0) {
9349 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009350 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 continue;
9352 }
9353 if (0 < ch && ch < 256) {
9354 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009355 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009356 continue;
9357 }
Victor Stinner6345be92011-11-25 20:09:01 +01009358
Victor Stinner42bf7752011-11-21 22:52:58 +01009359 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009360 exc = NULL;
9361 raise_encode_exception(&exc, "decimal", unicode,
9362 startpos, startpos+1,
9363 "invalid decimal Unicode string");
9364 Py_XDECREF(exc);
9365 Py_DECREF(unicode);
9366 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009367 }
9368 /* 0-terminate the output string */
9369 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009370 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009371 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009372}
9373
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374/* --- Helpers ------------------------------------------------------------ */
9375
/* Helper macro to fix up start/end slice values against a length:
   end is clamped to len, negative values are offset by len and then
   clamped to 0.  start and end must be modifiable lvalues; they are
   evaluated more than once.  Wrapped in do { } while (0) so that it behaves
   as a single statement (e.g. inside an unbraced if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009392any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009394 Py_ssize_t end,
9395 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009396{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009397 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009398 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399 Py_ssize_t len1, len2, result;
9400
9401 kind1 = PyUnicode_KIND(s1);
9402 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009403 if (kind1 < kind2)
9404 return -1;
9405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 len1 = PyUnicode_GET_LENGTH(s1);
9407 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009408 ADJUST_INDICES(start, end, len1);
9409 if (end - start < len2)
9410 return -1;
9411
9412 buf1 = PyUnicode_DATA(s1);
9413 buf2 = PyUnicode_DATA(s2);
9414 if (len2 == 1) {
9415 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9416 result = findchar((const char *)buf1 + kind1*start,
9417 kind1, end - start, ch, direction);
9418 if (result == -1)
9419 return -1;
9420 else
9421 return start + result;
9422 }
9423
9424 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009425 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009426 if (!buf2)
9427 return -2;
9428 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429
Victor Stinner794d5672011-10-10 03:21:36 +02009430 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009431 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009432 case PyUnicode_1BYTE_KIND:
9433 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9434 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9435 else
9436 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9437 break;
9438 case PyUnicode_2BYTE_KIND:
9439 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9440 break;
9441 case PyUnicode_4BYTE_KIND:
9442 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9443 break;
9444 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009445 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009446 }
9447 }
9448 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009449 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009450 case PyUnicode_1BYTE_KIND:
9451 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9452 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9453 else
9454 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9455 break;
9456 case PyUnicode_2BYTE_KIND:
9457 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9458 break;
9459 case PyUnicode_4BYTE_KIND:
9460 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9461 break;
9462 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009463 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 }
9466
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009467 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009468 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009469 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470
9471 return result;
9472}
9473
Victor Stinner59423e32018-11-26 13:40:01 +01009474/* _PyUnicode_InsertThousandsGrouping() helper functions */
9475#include "stringlib/localeutil.h"
9476
9477/**
9478 * InsertThousandsGrouping:
9479 * @writer: Unicode writer.
9480 * @n_buffer: Number of characters in @buffer.
9481 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9482 * @d_pos: Start of digits string.
9483 * @n_digits: The number of digits in the string, in which we want
9484 * to put the grouping chars.
9485 * @min_width: The minimum width of the digits in the output string.
9486 * Output will be zero-padded on the left to fill.
9487 * @grouping: see definition in localeconv().
9488 * @thousands_sep: see definition in localeconv().
9489 *
9490 * There are 2 modes: counting and filling. If @writer is NULL,
9491 * we are in counting mode, else filling mode.
9492 * If counting, the required buffer size is returned.
9493 * If filling, we know the buffer will be large enough, so we don't
9494 * need to pass in the buffer size.
9495 * Inserts thousand grouping characters (as defined by grouping and
9496 * thousands_sep) into @writer.
9497 *
9498 * Return value: -1 on error, number of characters otherwise.
9499 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009501_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009502 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009503 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009504 PyObject *digits,
9505 Py_ssize_t d_pos,
9506 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009507 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009508 const char *grouping,
9509 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009510 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511{
Xtreak3f7983a2019-01-07 20:39:14 +05309512 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009513 if (writer) {
9514 assert(digits != NULL);
9515 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009516 }
9517 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009518 assert(digits == NULL);
9519 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009520 }
Victor Stinner59423e32018-11-26 13:40:01 +01009521 assert(0 <= d_pos);
9522 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009523 assert(grouping != NULL);
9524
9525 if (digits != NULL) {
9526 if (PyUnicode_READY(digits) == -1) {
9527 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009528 }
Victor Stinner59423e32018-11-26 13:40:01 +01009529 }
9530 if (PyUnicode_READY(thousands_sep) == -1) {
9531 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009532 }
9533
Victor Stinner59423e32018-11-26 13:40:01 +01009534 Py_ssize_t count = 0;
9535 Py_ssize_t n_zeros;
9536 int loop_broken = 0;
9537 int use_separator = 0; /* First time through, don't append the
9538 separator. They only go between
9539 groups. */
9540 Py_ssize_t buffer_pos;
9541 Py_ssize_t digits_pos;
9542 Py_ssize_t len;
9543 Py_ssize_t n_chars;
9544 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9545 be looked at */
9546 /* A generator that returns all of the grouping widths, until it
9547 returns 0. */
9548 GroupGenerator groupgen;
9549 GroupGenerator_init(&groupgen, grouping);
9550 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9551
9552 /* if digits are not grouped, thousands separator
9553 should be an empty string */
9554 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9555
9556 digits_pos = d_pos + n_digits;
9557 if (writer) {
9558 buffer_pos = writer->pos + n_buffer;
9559 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9560 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 }
Victor Stinner59423e32018-11-26 13:40:01 +01009562 else {
9563 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009564 }
Victor Stinner59423e32018-11-26 13:40:01 +01009565
9566 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009567 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009568 }
Victor Stinner59423e32018-11-26 13:40:01 +01009569
9570 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9571 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9572 n_zeros = Py_MAX(0, len - remaining);
9573 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9574
9575 /* Use n_zero zero's and n_chars chars */
9576
9577 /* Count only, don't do anything. */
9578 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9579
9580 /* Copy into the writer. */
9581 InsertThousandsGrouping_fill(writer, &buffer_pos,
9582 digits, &digits_pos,
9583 n_chars, n_zeros,
9584 use_separator ? thousands_sep : NULL,
9585 thousands_sep_len, maxchar);
9586
9587 /* Use a separator next time. */
9588 use_separator = 1;
9589
9590 remaining -= n_chars;
9591 min_width -= len;
9592
9593 if (remaining <= 0 && min_width <= 0) {
9594 loop_broken = 1;
9595 break;
9596 }
9597 min_width -= thousands_sep_len;
9598 }
9599 if (!loop_broken) {
9600 /* We left the loop without using a break statement. */
9601
9602 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9603 n_zeros = Py_MAX(0, len - remaining);
9604 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9605
9606 /* Use n_zero zero's and n_chars chars */
9607 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9608
9609 /* Copy into the writer. */
9610 InsertThousandsGrouping_fill(writer, &buffer_pos,
9611 digits, &digits_pos,
9612 n_chars, n_zeros,
9613 use_separator ? thousands_sep : NULL,
9614 thousands_sep_len, maxchar);
9615 }
9616 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617}
9618
9619
Alexander Belopolsky40018472011-02-26 01:02:56 +00009620Py_ssize_t
9621PyUnicode_Count(PyObject *str,
9622 PyObject *substr,
9623 Py_ssize_t start,
9624 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009626 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009627 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009628 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009630
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009631 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009632 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009633
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009634 kind1 = PyUnicode_KIND(str);
9635 kind2 = PyUnicode_KIND(substr);
9636 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009637 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009638
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009639 len1 = PyUnicode_GET_LENGTH(str);
9640 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009641 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009642 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009643 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009644
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009645 buf1 = PyUnicode_DATA(str);
9646 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009647 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009648 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009649 if (!buf2)
9650 goto onError;
9651 }
9652
9653 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009655 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009656 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009657 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009658 buf2, len2, PY_SSIZE_T_MAX
9659 );
9660 else
9661 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009662 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009663 buf2, len2, PY_SSIZE_T_MAX
9664 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665 break;
9666 case PyUnicode_2BYTE_KIND:
9667 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009668 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 buf2, len2, PY_SSIZE_T_MAX
9670 );
9671 break;
9672 case PyUnicode_4BYTE_KIND:
9673 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009674 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 buf2, len2, PY_SSIZE_T_MAX
9676 );
9677 break;
9678 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009679 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009681
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009682 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009683 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009684 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009688 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9689 if (kind2 != kind1)
9690 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692}
9693
Alexander Belopolsky40018472011-02-26 01:02:56 +00009694Py_ssize_t
9695PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009696 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009697 Py_ssize_t start,
9698 Py_ssize_t end,
9699 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009701 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009702 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009703
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009704 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705}
9706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707Py_ssize_t
9708PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9709 Py_ssize_t start, Py_ssize_t end,
9710 int direction)
9711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009713 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 if (PyUnicode_READY(str) == -1)
9715 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009716 len = PyUnicode_GET_LENGTH(str);
9717 ADJUST_INDICES(start, end, len);
9718 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009719 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009721 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9722 kind, end-start, ch, direction);
9723 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009725 else
9726 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727}
9728
Alexander Belopolsky40018472011-02-26 01:02:56 +00009729static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009730tailmatch(PyObject *self,
9731 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009732 Py_ssize_t start,
9733 Py_ssize_t end,
9734 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 int kind_self;
9737 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009738 const void *data_self;
9739 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 Py_ssize_t offset;
9741 Py_ssize_t i;
9742 Py_ssize_t end_sub;
9743
9744 if (PyUnicode_READY(self) == -1 ||
9745 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009746 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9749 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009751 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009752
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009753 if (PyUnicode_GET_LENGTH(substring) == 0)
9754 return 1;
9755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 kind_self = PyUnicode_KIND(self);
9757 data_self = PyUnicode_DATA(self);
9758 kind_sub = PyUnicode_KIND(substring);
9759 data_sub = PyUnicode_DATA(substring);
9760 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9761
9762 if (direction > 0)
9763 offset = end;
9764 else
9765 offset = start;
9766
9767 if (PyUnicode_READ(kind_self, data_self, offset) ==
9768 PyUnicode_READ(kind_sub, data_sub, 0) &&
9769 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9770 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9771 /* If both are of the same kind, memcmp is sufficient */
9772 if (kind_self == kind_sub) {
9773 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009774 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 data_sub,
9776 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009777 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009779 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 else {
9781 /* We do not need to compare 0 and len(substring)-1 because
9782 the if statement above ensured already that they are equal
9783 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 for (i = 1; i < end_sub; ++i) {
9785 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9786 PyUnicode_READ(kind_sub, data_sub, i))
9787 return 0;
9788 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009789 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009791 }
9792
9793 return 0;
9794}
9795
Alexander Belopolsky40018472011-02-26 01:02:56 +00009796Py_ssize_t
9797PyUnicode_Tailmatch(PyObject *str,
9798 PyObject *substr,
9799 Py_ssize_t start,
9800 Py_ssize_t end,
9801 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009803 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009804 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009805
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009806 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807}
9808
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009809static PyObject *
9810ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009812 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009813 const char *data = PyUnicode_DATA(self);
9814 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009815 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009816
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009817 res = PyUnicode_New(len, 127);
9818 if (res == NULL)
9819 return NULL;
9820 resdata = PyUnicode_DATA(res);
9821 if (lower)
9822 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009824 _Py_bytes_upper(resdata, data, len);
9825 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826}
9827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009829handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009831 Py_ssize_t j;
9832 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009833 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009834 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009835
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009836 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9837
9838 where ! is a negation and \p{xxx} is a character with property xxx.
9839 */
9840 for (j = i - 1; j >= 0; j--) {
9841 c = PyUnicode_READ(kind, data, j);
9842 if (!_PyUnicode_IsCaseIgnorable(c))
9843 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009845 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9846 if (final_sigma) {
9847 for (j = i + 1; j < length; j++) {
9848 c = PyUnicode_READ(kind, data, j);
9849 if (!_PyUnicode_IsCaseIgnorable(c))
9850 break;
9851 }
9852 final_sigma = j == length || !_PyUnicode_IsCased(c);
9853 }
9854 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855}
9856
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009857static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009858lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009859 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009861 /* Obscure special case. */
9862 if (c == 0x3A3) {
9863 mapped[0] = handle_capital_sigma(kind, data, length, i);
9864 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009866 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867}
9868
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009869static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009870do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009872 Py_ssize_t i, k = 0;
9873 int n_res, j;
9874 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009875
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009876 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009877 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009878 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009879 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009880 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009881 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009882 for (i = 1; i < length; i++) {
9883 c = PyUnicode_READ(kind, data, i);
9884 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9885 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009886 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009887 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009888 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009889 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009890 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891}
9892
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009893static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009894do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009895 Py_ssize_t i, k = 0;
9896
9897 for (i = 0; i < length; i++) {
9898 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9899 int n_res, j;
9900 if (Py_UNICODE_ISUPPER(c)) {
9901 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9902 }
9903 else if (Py_UNICODE_ISLOWER(c)) {
9904 n_res = _PyUnicode_ToUpperFull(c, mapped);
9905 }
9906 else {
9907 n_res = 1;
9908 mapped[0] = c;
9909 }
9910 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009911 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009912 res[k++] = mapped[j];
9913 }
9914 }
9915 return k;
9916}
9917
9918static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009919do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009920 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009921{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009922 Py_ssize_t i, k = 0;
9923
9924 for (i = 0; i < length; i++) {
9925 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9926 int n_res, j;
9927 if (lower)
9928 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9929 else
9930 n_res = _PyUnicode_ToUpperFull(c, mapped);
9931 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009932 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009933 res[k++] = mapped[j];
9934 }
9935 }
9936 return k;
9937}
9938
9939static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009940do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009941{
9942 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9943}
9944
9945static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009946do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009947{
9948 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9949}
9950
Benjamin Petersone51757f2012-01-12 21:10:29 -05009951static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009952do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -05009953{
9954 Py_ssize_t i, k = 0;
9955
9956 for (i = 0; i < length; i++) {
9957 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9958 Py_UCS4 mapped[3];
9959 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9960 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009961 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009962 res[k++] = mapped[j];
9963 }
9964 }
9965 return k;
9966}
9967
9968static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009969do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -05009970{
9971 Py_ssize_t i, k = 0;
9972 int previous_is_cased;
9973
9974 previous_is_cased = 0;
9975 for (i = 0; i < length; i++) {
9976 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9977 Py_UCS4 mapped[3];
9978 int n_res, j;
9979
9980 if (previous_is_cased)
9981 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9982 else
9983 n_res = _PyUnicode_ToTitleFull(c, mapped);
9984
9985 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009986 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009987 res[k++] = mapped[j];
9988 }
9989
9990 previous_is_cased = _PyUnicode_IsCased(c);
9991 }
9992 return k;
9993}
9994
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009995static PyObject *
9996case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009997 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009998{
9999 PyObject *res = NULL;
10000 Py_ssize_t length, newlength = 0;
10001 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010002 const void *data;
10003 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010004 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10005
Benjamin Petersoneea48462012-01-16 14:28:50 -050010006 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010007
10008 kind = PyUnicode_KIND(self);
10009 data = PyUnicode_DATA(self);
10010 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010011 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010012 PyErr_SetString(PyExc_OverflowError, "string is too long");
10013 return NULL;
10014 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010015 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010016 if (tmp == NULL)
10017 return PyErr_NoMemory();
10018 newlength = perform(kind, data, length, tmp, &maxchar);
10019 res = PyUnicode_New(newlength, maxchar);
10020 if (res == NULL)
10021 goto leave;
10022 tmpend = tmp + newlength;
10023 outdata = PyUnicode_DATA(res);
10024 outkind = PyUnicode_KIND(res);
10025 switch (outkind) {
10026 case PyUnicode_1BYTE_KIND:
10027 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10028 break;
10029 case PyUnicode_2BYTE_KIND:
10030 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10031 break;
10032 case PyUnicode_4BYTE_KIND:
10033 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10034 break;
10035 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010036 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010037 }
10038 leave:
10039 PyMem_FREE(tmp);
10040 return res;
10041}
10042
Tim Peters8ce9f162004-08-27 01:49:32 +000010043PyObject *
10044PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010046 PyObject *res;
10047 PyObject *fseq;
10048 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010049 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010051 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010052 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010053 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010054 }
10055
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010056 /* NOTE: the following code can't call back into Python code,
10057 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010058 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010059
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010060 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010061 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010062 res = _PyUnicode_JoinArray(separator, items, seqlen);
10063 Py_DECREF(fseq);
10064 return res;
10065}
10066
10067PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010068_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010069{
10070 PyObject *res = NULL; /* the result */
10071 PyObject *sep = NULL;
10072 Py_ssize_t seplen;
10073 PyObject *item;
10074 Py_ssize_t sz, i, res_offset;
10075 Py_UCS4 maxchar;
10076 Py_UCS4 item_maxchar;
10077 int use_memcpy;
10078 unsigned char *res_data = NULL, *sep_data = NULL;
10079 PyObject *last_obj;
10080 unsigned int kind = 0;
10081
Tim Peters05eba1f2004-08-27 21:32:02 +000010082 /* If empty sequence, return u"". */
10083 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010084 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010085 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010086
Tim Peters05eba1f2004-08-27 21:32:02 +000010087 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010088 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010089 if (seqlen == 1) {
10090 if (PyUnicode_CheckExact(items[0])) {
10091 res = items[0];
10092 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010093 return res;
10094 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010095 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010096 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010097 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010098 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010099 /* Set up sep and seplen */
10100 if (separator == NULL) {
10101 /* fall back to a blank space separator */
10102 sep = PyUnicode_FromOrdinal(' ');
10103 if (!sep)
10104 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010105 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010106 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010107 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010108 else {
10109 if (!PyUnicode_Check(separator)) {
10110 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010111 "separator: expected str instance,"
10112 " %.80s found",
10113 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010114 goto onError;
10115 }
10116 if (PyUnicode_READY(separator))
10117 goto onError;
10118 sep = separator;
10119 seplen = PyUnicode_GET_LENGTH(separator);
10120 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10121 /* inc refcount to keep this code path symmetric with the
10122 above case of a blank separator */
10123 Py_INCREF(sep);
10124 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010125 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010126 }
10127
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010128 /* There are at least two things to join, or else we have a subclass
10129 * of str in the sequence.
10130 * Do a pre-pass to figure out the total amount of space we'll
10131 * need (sz), and see whether all argument are strings.
10132 */
10133 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010134#ifdef Py_DEBUG
10135 use_memcpy = 0;
10136#else
10137 use_memcpy = 1;
10138#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010139 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010140 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010141 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010142 if (!PyUnicode_Check(item)) {
10143 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010144 "sequence item %zd: expected str instance,"
10145 " %.80s found",
10146 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010147 goto onError;
10148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 if (PyUnicode_READY(item) == -1)
10150 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010151 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010153 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010154 if (i != 0) {
10155 add_sz += seplen;
10156 }
10157 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010158 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010159 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010160 goto onError;
10161 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010162 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010163 if (use_memcpy && last_obj != NULL) {
10164 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10165 use_memcpy = 0;
10166 }
10167 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010168 }
Tim Petersced69f82003-09-16 20:30:58 +000010169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010171 if (res == NULL)
10172 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010173
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010174 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010175#ifdef Py_DEBUG
10176 use_memcpy = 0;
10177#else
10178 if (use_memcpy) {
10179 res_data = PyUnicode_1BYTE_DATA(res);
10180 kind = PyUnicode_KIND(res);
10181 if (seplen != 0)
10182 sep_data = PyUnicode_1BYTE_DATA(sep);
10183 }
10184#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010185 if (use_memcpy) {
10186 for (i = 0; i < seqlen; ++i) {
10187 Py_ssize_t itemlen;
10188 item = items[i];
10189
10190 /* Copy item, and maybe the separator. */
10191 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010192 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010193 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010194 kind * seplen);
10195 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010196 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010197
10198 itemlen = PyUnicode_GET_LENGTH(item);
10199 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010200 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010201 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010202 kind * itemlen);
10203 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010204 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010205 }
10206 assert(res_data == PyUnicode_1BYTE_DATA(res)
10207 + kind * PyUnicode_GET_LENGTH(res));
10208 }
10209 else {
10210 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10211 Py_ssize_t itemlen;
10212 item = items[i];
10213
10214 /* Copy item, and maybe the separator. */
10215 if (i && seplen != 0) {
10216 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10217 res_offset += seplen;
10218 }
10219
10220 itemlen = PyUnicode_GET_LENGTH(item);
10221 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010222 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010223 res_offset += itemlen;
10224 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010225 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010226 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010227 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010230 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232
Benjamin Peterson29060642009-01-31 22:14:21 +000010233 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010235 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236 return NULL;
10237}
10238
Victor Stinnerd3f08822012-05-29 12:57:52 +020010239void
10240_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10241 Py_UCS4 fill_char)
10242{
10243 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010244 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010245 assert(PyUnicode_IS_READY(unicode));
10246 assert(unicode_modifiable(unicode));
10247 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10248 assert(start >= 0);
10249 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010250 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010251}
10252
Victor Stinner3fe55312012-01-04 00:33:50 +010010253Py_ssize_t
10254PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10255 Py_UCS4 fill_char)
10256{
10257 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010258
10259 if (!PyUnicode_Check(unicode)) {
10260 PyErr_BadInternalCall();
10261 return -1;
10262 }
10263 if (PyUnicode_READY(unicode) == -1)
10264 return -1;
10265 if (unicode_check_modifiable(unicode))
10266 return -1;
10267
Victor Stinnerd3f08822012-05-29 12:57:52 +020010268 if (start < 0) {
10269 PyErr_SetString(PyExc_IndexError, "string index out of range");
10270 return -1;
10271 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010272 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10273 PyErr_SetString(PyExc_ValueError,
10274 "fill character is bigger than "
10275 "the string maximum character");
10276 return -1;
10277 }
10278
10279 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10280 length = Py_MIN(maxlen, length);
10281 if (length <= 0)
10282 return 0;
10283
Victor Stinnerd3f08822012-05-29 12:57:52 +020010284 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010285 return length;
10286}
10287
Victor Stinner9310abb2011-10-05 00:59:23 +020010288static PyObject *
10289pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010290 Py_ssize_t left,
10291 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 PyObject *u;
10295 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010296 int kind;
10297 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298
10299 if (left < 0)
10300 left = 0;
10301 if (right < 0)
10302 right = 0;
10303
Victor Stinnerc4b49542011-12-11 22:44:26 +010010304 if (left == 0 && right == 0)
10305 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10308 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010309 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10310 return NULL;
10311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010313 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010315 if (!u)
10316 return NULL;
10317
10318 kind = PyUnicode_KIND(u);
10319 data = PyUnicode_DATA(u);
10320 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010321 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010322 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010323 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010324 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010325 assert(_PyUnicode_CheckConsistency(u, 1));
10326 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327}
10328
Alexander Belopolsky40018472011-02-26 01:02:56 +000010329PyObject *
10330PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010334 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336
Benjamin Petersonead6b532011-12-20 17:23:42 -060010337 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010339 if (PyUnicode_IS_ASCII(string))
10340 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010341 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010342 PyUnicode_GET_LENGTH(string), keepends);
10343 else
10344 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010345 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010346 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 break;
10348 case PyUnicode_2BYTE_KIND:
10349 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010350 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 PyUnicode_GET_LENGTH(string), keepends);
10352 break;
10353 case PyUnicode_4BYTE_KIND:
10354 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010355 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 PyUnicode_GET_LENGTH(string), keepends);
10357 break;
10358 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010359 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362}
10363
Alexander Belopolsky40018472011-02-26 01:02:56 +000010364static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010365split(PyObject *self,
10366 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010367 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010369 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010370 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 Py_ssize_t len1, len2;
10372 PyObject* out;
10373
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010375 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 if (PyUnicode_READY(self) == -1)
10378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010381 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010383 if (PyUnicode_IS_ASCII(self))
10384 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010385 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010386 PyUnicode_GET_LENGTH(self), maxcount
10387 );
10388 else
10389 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010390 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010391 PyUnicode_GET_LENGTH(self), maxcount
10392 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 case PyUnicode_2BYTE_KIND:
10394 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010395 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 PyUnicode_GET_LENGTH(self), maxcount
10397 );
10398 case PyUnicode_4BYTE_KIND:
10399 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010400 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 PyUnicode_GET_LENGTH(self), maxcount
10402 );
10403 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010404 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 }
10406
10407 if (PyUnicode_READY(substring) == -1)
10408 return NULL;
10409
10410 kind1 = PyUnicode_KIND(self);
10411 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 len1 = PyUnicode_GET_LENGTH(self);
10413 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010414 if (kind1 < kind2 || len1 < len2) {
10415 out = PyList_New(1);
10416 if (out == NULL)
10417 return NULL;
10418 Py_INCREF(self);
10419 PyList_SET_ITEM(out, 0, self);
10420 return out;
10421 }
10422 buf1 = PyUnicode_DATA(self);
10423 buf2 = PyUnicode_DATA(substring);
10424 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010425 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010426 if (!buf2)
10427 return NULL;
10428 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010430 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010432 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10433 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010434 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010435 else
10436 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010437 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 break;
10439 case PyUnicode_2BYTE_KIND:
10440 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010441 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 break;
10443 case PyUnicode_4BYTE_KIND:
10444 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010445 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 break;
10447 default:
10448 out = NULL;
10449 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010450 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010451 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010452 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010454}
10455
Alexander Belopolsky40018472011-02-26 01:02:56 +000010456static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010457rsplit(PyObject *self,
10458 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010459 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010460{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010461 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010462 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 Py_ssize_t len1, len2;
10464 PyObject* out;
10465
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010466 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010467 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 if (PyUnicode_READY(self) == -1)
10470 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010473 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010475 if (PyUnicode_IS_ASCII(self))
10476 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010477 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010478 PyUnicode_GET_LENGTH(self), maxcount
10479 );
10480 else
10481 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010482 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010483 PyUnicode_GET_LENGTH(self), maxcount
10484 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 case PyUnicode_2BYTE_KIND:
10486 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010487 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 PyUnicode_GET_LENGTH(self), maxcount
10489 );
10490 case PyUnicode_4BYTE_KIND:
10491 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010492 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 PyUnicode_GET_LENGTH(self), maxcount
10494 );
10495 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010496 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 }
10498
10499 if (PyUnicode_READY(substring) == -1)
10500 return NULL;
10501
10502 kind1 = PyUnicode_KIND(self);
10503 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 len1 = PyUnicode_GET_LENGTH(self);
10505 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010506 if (kind1 < kind2 || len1 < len2) {
10507 out = PyList_New(1);
10508 if (out == NULL)
10509 return NULL;
10510 Py_INCREF(self);
10511 PyList_SET_ITEM(out, 0, self);
10512 return out;
10513 }
10514 buf1 = PyUnicode_DATA(self);
10515 buf2 = PyUnicode_DATA(substring);
10516 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010517 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010518 if (!buf2)
10519 return NULL;
10520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010522 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010524 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10525 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010526 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010527 else
10528 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010529 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 break;
10531 case PyUnicode_2BYTE_KIND:
10532 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010533 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 break;
10535 case PyUnicode_4BYTE_KIND:
10536 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010537 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 break;
10539 default:
10540 out = NULL;
10541 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010542 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010543 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010544 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 return out;
10546}
10547
10548static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010549anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10550 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010552 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010554 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10555 return asciilib_find(buf1, len1, buf2, len2, offset);
10556 else
10557 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 case PyUnicode_2BYTE_KIND:
10559 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10560 case PyUnicode_4BYTE_KIND:
10561 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10562 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010563 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564}
10565
10566static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010567anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10568 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010570 switch (kind) {
10571 case PyUnicode_1BYTE_KIND:
10572 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10573 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10574 else
10575 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10576 case PyUnicode_2BYTE_KIND:
10577 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10578 case PyUnicode_4BYTE_KIND:
10579 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10580 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010581 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010582}
10583
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010584static void
10585replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10586 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10587{
10588 int kind = PyUnicode_KIND(u);
10589 void *data = PyUnicode_DATA(u);
10590 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10591 if (kind == PyUnicode_1BYTE_KIND) {
10592 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10593 (Py_UCS1 *)data + len,
10594 u1, u2, maxcount);
10595 }
10596 else if (kind == PyUnicode_2BYTE_KIND) {
10597 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10598 (Py_UCS2 *)data + len,
10599 u1, u2, maxcount);
10600 }
10601 else {
10602 assert(kind == PyUnicode_4BYTE_KIND);
10603 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10604 (Py_UCS4 *)data + len,
10605 u1, u2, maxcount);
10606 }
10607}
10608
Alexander Belopolsky40018472011-02-26 01:02:56 +000010609static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610replace(PyObject *self, PyObject *str1,
10611 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010614 const char *sbuf = PyUnicode_DATA(self);
10615 const void *buf1 = PyUnicode_DATA(str1);
10616 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 int srelease = 0, release1 = 0, release2 = 0;
10618 int skind = PyUnicode_KIND(self);
10619 int kind1 = PyUnicode_KIND(str1);
10620 int kind2 = PyUnicode_KIND(str2);
10621 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10622 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10623 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010624 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010625 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010627 if (slen < len1)
10628 goto nothing;
10629
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010631 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010632 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010633 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010634
Victor Stinner59de0ee2011-10-07 10:01:28 +020010635 if (str1 == str2)
10636 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637
Victor Stinner49a0a212011-10-12 23:46:10 +020010638 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010639 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10640 if (maxchar < maxchar_str1)
10641 /* substring too wide to be present */
10642 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010643 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10644 /* Replacing str1 with str2 may cause a maxchar reduction in the
10645 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010646 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010647 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010652 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010655 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010656 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010657
Victor Stinner69ed0f42013-04-09 21:48:24 +020010658 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010659 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010660 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010661 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010662 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010664 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010666
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010667 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10668 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010669 }
10670 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 int rkind = skind;
10672 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010673 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 if (kind1 < rkind) {
10676 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010677 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 if (!buf1) goto error;
10679 release1 = 1;
10680 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010681 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010682 if (i < 0)
10683 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 if (rkind > kind2) {
10685 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010686 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 if (!buf2) goto error;
10688 release2 = 1;
10689 }
10690 else if (rkind < kind2) {
10691 /* widen self and buf1 */
10692 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010693 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010694 assert(buf1 != PyUnicode_DATA(str1));
10695 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010696 buf1 = PyUnicode_DATA(str1);
10697 release1 = 0;
10698 }
10699 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 if (!sbuf) goto error;
10701 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010702 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 if (!buf1) goto error;
10704 release1 = 1;
10705 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010706 u = PyUnicode_New(slen, maxchar);
10707 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010709 assert(PyUnicode_KIND(u) == rkind);
10710 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010711
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010712 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010713 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010714 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010716 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010718
10719 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010720 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010721 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010722 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010723 if (i == -1)
10724 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010725 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010727 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010731 }
10732 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010734 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 int rkind = skind;
10736 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010739 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010740 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 if (!buf1) goto error;
10742 release1 = 1;
10743 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010744 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010745 if (n == 0)
10746 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010748 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010749 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 if (!buf2) goto error;
10751 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010754 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010756 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 if (!sbuf) goto error;
10758 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010759 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010760 assert(buf1 != PyUnicode_DATA(str1));
10761 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010762 buf1 = PyUnicode_DATA(str1);
10763 release1 = 0;
10764 }
10765 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 if (!buf1) goto error;
10767 release1 = 1;
10768 }
10769 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10770 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010771 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 PyErr_SetString(PyExc_OverflowError,
10773 "replace string is too long");
10774 goto error;
10775 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010776 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010777 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010778 _Py_INCREF_UNICODE_EMPTY();
10779 if (!unicode_empty)
10780 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010781 u = unicode_empty;
10782 goto done;
10783 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010784 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 PyErr_SetString(PyExc_OverflowError,
10786 "replace string is too long");
10787 goto error;
10788 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010789 u = PyUnicode_New(new_size, maxchar);
10790 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010792 assert(PyUnicode_KIND(u) == rkind);
10793 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 ires = i = 0;
10795 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010796 while (n-- > 0) {
10797 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010798 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010799 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010800 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010801 if (j == -1)
10802 break;
10803 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010804 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010805 memcpy(res + rkind * ires,
10806 sbuf + rkind * i,
10807 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010809 }
10810 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010812 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010814 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010820 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010821 memcpy(res + rkind * ires,
10822 sbuf + rkind * i,
10823 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010824 }
10825 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010826 /* interleave */
10827 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010828 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010830 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010832 if (--n <= 0)
10833 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010834 memcpy(res + rkind * ires,
10835 sbuf + rkind * i,
10836 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837 ires++;
10838 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010839 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010840 memcpy(res + rkind * ires,
10841 sbuf + rkind * i,
10842 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010843 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010844 }
10845
10846 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010847 unicode_adjust_maxchar(&u);
10848 if (u == NULL)
10849 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010851
10852 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010853 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10854 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10855 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010857 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010859 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010861 PyMem_FREE((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010862 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010864
Benjamin Peterson29060642009-01-31 22:14:21 +000010865 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010866 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010867 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10868 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10869 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 if (srelease)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010871 PyMem_FREE((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 if (release1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010873 PyMem_FREE((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 if (release2)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010875 PyMem_FREE((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010876 return unicode_result_unchanged(self);
10877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010878 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010879 assert(srelease == (sbuf != PyUnicode_DATA(self)));
10880 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10881 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10882 if (srelease)
10883 PyMem_FREE((void *)sbuf);
10884 if (release1)
10885 PyMem_FREE((void *)buf1);
10886 if (release2)
10887 PyMem_FREE((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010888 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889}
10890
10891/* --- Unicode Object Methods --------------------------------------------- */
10892
INADA Naoki3ae20562017-01-16 20:41:20 +090010893/*[clinic input]
10894str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895
INADA Naoki3ae20562017-01-16 20:41:20 +090010896Return a version of the string where each word is titlecased.
10897
10898More specifically, words start with uppercased characters and all remaining
10899cased characters have lower case.
10900[clinic start generated code]*/
10901
10902static PyObject *
10903unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010904/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010906 if (PyUnicode_READY(self) == -1)
10907 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010908 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909}
10910
INADA Naoki3ae20562017-01-16 20:41:20 +090010911/*[clinic input]
10912str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913
INADA Naoki3ae20562017-01-16 20:41:20 +090010914Return a capitalized version of the string.
10915
10916More specifically, make the first character have upper case and the rest lower
10917case.
10918[clinic start generated code]*/
10919
10920static PyObject *
10921unicode_capitalize_impl(PyObject *self)
10922/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010924 if (PyUnicode_READY(self) == -1)
10925 return NULL;
10926 if (PyUnicode_GET_LENGTH(self) == 0)
10927 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010928 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929}
10930
INADA Naoki3ae20562017-01-16 20:41:20 +090010931/*[clinic input]
10932str.casefold as unicode_casefold
10933
10934Return a version of the string suitable for caseless comparisons.
10935[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010936
10937static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010938unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010939/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010940{
10941 if (PyUnicode_READY(self) == -1)
10942 return NULL;
10943 if (PyUnicode_IS_ASCII(self))
10944 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010945 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010946}
10947
10948
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010949/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010950
10951static int
10952convert_uc(PyObject *obj, void *addr)
10953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010955
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010956 if (!PyUnicode_Check(obj)) {
10957 PyErr_Format(PyExc_TypeError,
10958 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010959 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010960 return 0;
10961 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010962 if (PyUnicode_READY(obj) < 0)
10963 return 0;
10964 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010965 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010966 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010967 return 0;
10968 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010969 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010970 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010971}
10972
INADA Naoki3ae20562017-01-16 20:41:20 +090010973/*[clinic input]
10974str.center as unicode_center
10975
10976 width: Py_ssize_t
10977 fillchar: Py_UCS4 = ' '
10978 /
10979
10980Return a centered string of length width.
10981
10982Padding is done using the specified fill character (default is a space).
10983[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984
10985static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010986unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10987/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010989 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990
Benjamin Petersonbac79492012-01-14 13:34:47 -050010991 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 return NULL;
10993
Victor Stinnerc4b49542011-12-11 22:44:26 +010010994 if (PyUnicode_GET_LENGTH(self) >= width)
10995 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996
Victor Stinnerc4b49542011-12-11 22:44:26 +010010997 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 left = marg / 2 + (marg & width & 1);
10999
Victor Stinner9310abb2011-10-05 00:59:23 +020011000 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001}
11002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003/* This function assumes that str1 and str2 are readied by the caller. */
11004
Marc-André Lemburge5034372000-08-08 08:04:29 +000011005static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011006unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011007{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011008#define COMPARE(TYPE1, TYPE2) \
11009 do { \
11010 TYPE1* p1 = (TYPE1 *)data1; \
11011 TYPE2* p2 = (TYPE2 *)data2; \
11012 TYPE1* end = p1 + len; \
11013 Py_UCS4 c1, c2; \
11014 for (; p1 != end; p1++, p2++) { \
11015 c1 = *p1; \
11016 c2 = *p2; \
11017 if (c1 != c2) \
11018 return (c1 < c2) ? -1 : 1; \
11019 } \
11020 } \
11021 while (0)
11022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011024 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011025 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 kind1 = PyUnicode_KIND(str1);
11028 kind2 = PyUnicode_KIND(str2);
11029 data1 = PyUnicode_DATA(str1);
11030 data2 = PyUnicode_DATA(str2);
11031 len1 = PyUnicode_GET_LENGTH(str1);
11032 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011033 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011034
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011035 switch(kind1) {
11036 case PyUnicode_1BYTE_KIND:
11037 {
11038 switch(kind2) {
11039 case PyUnicode_1BYTE_KIND:
11040 {
11041 int cmp = memcmp(data1, data2, len);
11042 /* normalize result of memcmp() into the range [-1; 1] */
11043 if (cmp < 0)
11044 return -1;
11045 if (cmp > 0)
11046 return 1;
11047 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011048 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011049 case PyUnicode_2BYTE_KIND:
11050 COMPARE(Py_UCS1, Py_UCS2);
11051 break;
11052 case PyUnicode_4BYTE_KIND:
11053 COMPARE(Py_UCS1, Py_UCS4);
11054 break;
11055 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011056 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011057 }
11058 break;
11059 }
11060 case PyUnicode_2BYTE_KIND:
11061 {
11062 switch(kind2) {
11063 case PyUnicode_1BYTE_KIND:
11064 COMPARE(Py_UCS2, Py_UCS1);
11065 break;
11066 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011067 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011068 COMPARE(Py_UCS2, Py_UCS2);
11069 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011070 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011071 case PyUnicode_4BYTE_KIND:
11072 COMPARE(Py_UCS2, Py_UCS4);
11073 break;
11074 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011075 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011076 }
11077 break;
11078 }
11079 case PyUnicode_4BYTE_KIND:
11080 {
11081 switch(kind2) {
11082 case PyUnicode_1BYTE_KIND:
11083 COMPARE(Py_UCS4, Py_UCS1);
11084 break;
11085 case PyUnicode_2BYTE_KIND:
11086 COMPARE(Py_UCS4, Py_UCS2);
11087 break;
11088 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011089 {
11090#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11091 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11092 /* normalize result of wmemcmp() into the range [-1; 1] */
11093 if (cmp < 0)
11094 return -1;
11095 if (cmp > 0)
11096 return 1;
11097#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011098 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011099#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011100 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011101 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011102 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011103 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011104 }
11105 break;
11106 }
11107 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011108 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011109 }
11110
Victor Stinner770e19e2012-10-04 22:59:45 +020011111 if (len1 == len2)
11112 return 0;
11113 if (len1 < len2)
11114 return -1;
11115 else
11116 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011117
11118#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011119}
11120
Benjamin Peterson621b4302016-09-09 13:54:34 -070011121static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011122unicode_compare_eq(PyObject *str1, PyObject *str2)
11123{
11124 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011125 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011126 Py_ssize_t len;
11127 int cmp;
11128
Victor Stinnere5567ad2012-10-23 02:48:49 +020011129 len = PyUnicode_GET_LENGTH(str1);
11130 if (PyUnicode_GET_LENGTH(str2) != len)
11131 return 0;
11132 kind = PyUnicode_KIND(str1);
11133 if (PyUnicode_KIND(str2) != kind)
11134 return 0;
11135 data1 = PyUnicode_DATA(str1);
11136 data2 = PyUnicode_DATA(str2);
11137
11138 cmp = memcmp(data1, data2, len * kind);
11139 return (cmp == 0);
11140}
11141
11142
Alexander Belopolsky40018472011-02-26 01:02:56 +000011143int
11144PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11147 if (PyUnicode_READY(left) == -1 ||
11148 PyUnicode_READY(right) == -1)
11149 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011150
11151 /* a string is equal to itself */
11152 if (left == right)
11153 return 0;
11154
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011155 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011157 PyErr_Format(PyExc_TypeError,
11158 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011159 Py_TYPE(left)->tp_name,
11160 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161 return -1;
11162}
11163
Martin v. Löwis5b222132007-06-10 09:51:05 +000011164int
11165PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11166{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011167 Py_ssize_t i;
11168 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011170 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171
Victor Stinner910337b2011-10-03 03:20:16 +020011172 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011173 if (!PyUnicode_IS_READY(uni)) {
11174 const wchar_t *ws = _PyUnicode_WSTR(uni);
11175 /* Compare Unicode string and source character set string */
11176 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11177 if (chr != ustr[i])
11178 return (chr < ustr[i]) ? -1 : 1;
11179 }
11180 /* This check keeps Python strings that end in '\0' from comparing equal
11181 to C strings identical up to that point. */
11182 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11183 return 1; /* uni is longer */
11184 if (ustr[i])
11185 return -1; /* str is longer */
11186 return 0;
11187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011189 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011190 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011191 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011192 size_t len, len2 = strlen(str);
11193 int cmp;
11194
11195 len = Py_MIN(len1, len2);
11196 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011197 if (cmp != 0) {
11198 if (cmp < 0)
11199 return -1;
11200 else
11201 return 1;
11202 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011203 if (len1 > len2)
11204 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011205 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011206 return -1; /* str is longer */
11207 return 0;
11208 }
11209 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011210 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011211 /* Compare Unicode string and source character set string */
11212 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011213 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011214 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11215 /* This check keeps Python strings that end in '\0' from comparing equal
11216 to C strings identical up to that point. */
11217 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11218 return 1; /* uni is longer */
11219 if (str[i])
11220 return -1; /* str is longer */
11221 return 0;
11222 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011223}
11224
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011225static int
11226non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11227{
11228 size_t i, len;
11229 const wchar_t *p;
11230 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11231 if (strlen(str) != len)
11232 return 0;
11233 p = _PyUnicode_WSTR(unicode);
11234 assert(p);
11235 for (i = 0; i < len; i++) {
11236 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011237 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011238 return 0;
11239 }
11240 return 1;
11241}
11242
11243int
11244_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11245{
11246 size_t len;
11247 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011248 assert(str);
11249#ifndef NDEBUG
11250 for (const char *p = str; *p; p++) {
11251 assert((unsigned char)*p < 128);
11252 }
11253#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011254 if (PyUnicode_READY(unicode) == -1) {
11255 /* Memory error or bad data */
11256 PyErr_Clear();
11257 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11258 }
11259 if (!PyUnicode_IS_ASCII(unicode))
11260 return 0;
11261 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11262 return strlen(str) == len &&
11263 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11264}
11265
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011266int
11267_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11268{
11269 PyObject *right_uni;
11270 Py_hash_t hash;
11271
11272 assert(_PyUnicode_CHECK(left));
11273 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011274#ifndef NDEBUG
11275 for (const char *p = right->string; *p; p++) {
11276 assert((unsigned char)*p < 128);
11277 }
11278#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011279
11280 if (PyUnicode_READY(left) == -1) {
11281 /* memory error or bad data */
11282 PyErr_Clear();
11283 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11284 }
11285
11286 if (!PyUnicode_IS_ASCII(left))
11287 return 0;
11288
11289 right_uni = _PyUnicode_FromId(right); /* borrowed */
11290 if (right_uni == NULL) {
11291 /* memory error or bad data */
11292 PyErr_Clear();
11293 return _PyUnicode_EqualToASCIIString(left, right->string);
11294 }
11295
11296 if (left == right_uni)
11297 return 1;
11298
11299 if (PyUnicode_CHECK_INTERNED(left))
11300 return 0;
11301
INADA Naoki7cc95f52018-01-28 02:07:09 +090011302 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011303 hash = _PyUnicode_HASH(left);
11304 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11305 return 0;
11306
11307 return unicode_compare_eq(left, right_uni);
11308}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011309
Alexander Belopolsky40018472011-02-26 01:02:56 +000011310PyObject *
11311PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011312{
11313 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011314
Victor Stinnere5567ad2012-10-23 02:48:49 +020011315 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11316 Py_RETURN_NOTIMPLEMENTED;
11317
11318 if (PyUnicode_READY(left) == -1 ||
11319 PyUnicode_READY(right) == -1)
11320 return NULL;
11321
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011322 if (left == right) {
11323 switch (op) {
11324 case Py_EQ:
11325 case Py_LE:
11326 case Py_GE:
11327 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011328 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011329 case Py_NE:
11330 case Py_LT:
11331 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011332 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011333 default:
11334 PyErr_BadArgument();
11335 return NULL;
11336 }
11337 }
11338 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011339 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011340 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011341 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011342 }
11343 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011344 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011345 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011346 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011347}
11348
Alexander Belopolsky40018472011-02-26 01:02:56 +000011349int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011350_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11351{
11352 return unicode_eq(aa, bb);
11353}
11354
11355int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011356PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011357{
Victor Stinner77282cb2013-04-14 19:22:47 +020011358 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011359 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011361 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011362
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011363 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011364 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011365 "'in <string>' requires string as left operand, not %.100s",
11366 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011367 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011368 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011369 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011370 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011371 if (ensure_unicode(str) < 0)
11372 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011375 kind2 = PyUnicode_KIND(substr);
11376 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011377 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011379 len2 = PyUnicode_GET_LENGTH(substr);
11380 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011381 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011382 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011383 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011384 if (len2 == 1) {
11385 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11386 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011387 return result;
11388 }
11389 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011390 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011391 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011392 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394
Victor Stinner77282cb2013-04-14 19:22:47 +020011395 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 case PyUnicode_1BYTE_KIND:
11397 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11398 break;
11399 case PyUnicode_2BYTE_KIND:
11400 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11401 break;
11402 case PyUnicode_4BYTE_KIND:
11403 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11404 break;
11405 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011406 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011408
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011409 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011410 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011411 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412
Guido van Rossum403d68b2000-03-13 15:55:09 +000011413 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011414}
11415
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416/* Concat to string or Unicode object giving a new Unicode object. */
11417
Alexander Belopolsky40018472011-02-26 01:02:56 +000011418PyObject *
11419PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011421 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011422 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011423 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011425 if (ensure_unicode(left) < 0)
11426 return NULL;
11427
11428 if (!PyUnicode_Check(right)) {
11429 PyErr_Format(PyExc_TypeError,
11430 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011431 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011432 return NULL;
11433 }
11434 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011435 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436
11437 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011438 if (left == unicode_empty)
11439 return PyUnicode_FromObject(right);
11440 if (right == unicode_empty)
11441 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011443 left_len = PyUnicode_GET_LENGTH(left);
11444 right_len = PyUnicode_GET_LENGTH(right);
11445 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011446 PyErr_SetString(PyExc_OverflowError,
11447 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011448 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011449 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011450 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011451
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011452 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11453 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011454 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011457 result = PyUnicode_New(new_len, maxchar);
11458 if (result == NULL)
11459 return NULL;
11460 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11461 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11462 assert(_PyUnicode_CheckConsistency(result, 1));
11463 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464}
11465
Walter Dörwald1ab83302007-05-18 17:15:44 +000011466void
Victor Stinner23e56682011-10-03 03:54:37 +020011467PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011468{
Victor Stinner23e56682011-10-03 03:54:37 +020011469 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011470 Py_UCS4 maxchar, maxchar2;
11471 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011472
11473 if (p_left == NULL) {
11474 if (!PyErr_Occurred())
11475 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011476 return;
11477 }
Victor Stinner23e56682011-10-03 03:54:37 +020011478 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011479 if (right == NULL || left == NULL
11480 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011481 if (!PyErr_Occurred())
11482 PyErr_BadInternalCall();
11483 goto error;
11484 }
11485
Benjamin Petersonbac79492012-01-14 13:34:47 -050011486 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011487 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011488 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011489 goto error;
11490
Victor Stinner488fa492011-12-12 00:01:39 +010011491 /* Shortcuts */
11492 if (left == unicode_empty) {
11493 Py_DECREF(left);
11494 Py_INCREF(right);
11495 *p_left = right;
11496 return;
11497 }
11498 if (right == unicode_empty)
11499 return;
11500
11501 left_len = PyUnicode_GET_LENGTH(left);
11502 right_len = PyUnicode_GET_LENGTH(right);
11503 if (left_len > PY_SSIZE_T_MAX - right_len) {
11504 PyErr_SetString(PyExc_OverflowError,
11505 "strings are too large to concat");
11506 goto error;
11507 }
11508 new_len = left_len + right_len;
11509
11510 if (unicode_modifiable(left)
11511 && PyUnicode_CheckExact(right)
11512 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011513 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11514 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011515 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011516 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011517 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11518 {
11519 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011520 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011521 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011522
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011523 /* copy 'right' into the newly allocated area of 'left' */
11524 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011525 }
Victor Stinner488fa492011-12-12 00:01:39 +010011526 else {
11527 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11528 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011529 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011530
Victor Stinner488fa492011-12-12 00:01:39 +010011531 /* Concat the two Unicode strings */
11532 res = PyUnicode_New(new_len, maxchar);
11533 if (res == NULL)
11534 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011535 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11536 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011537 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011538 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011539 }
11540 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011541 return;
11542
11543error:
Victor Stinner488fa492011-12-12 00:01:39 +010011544 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011545}
11546
11547void
11548PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11549{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011550 PyUnicode_Append(pleft, right);
11551 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011552}
11553
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011554/*
11555Wraps stringlib_parse_args_finds() and additionally ensures that the
11556first argument is a unicode object.
11557*/
11558
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011559static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011560parse_args_finds_unicode(const char * function_name, PyObject *args,
11561 PyObject **substring,
11562 Py_ssize_t *start, Py_ssize_t *end)
11563{
11564 if(stringlib_parse_args_finds(function_name, args, substring,
11565 start, end)) {
11566 if (ensure_unicode(*substring) < 0)
11567 return 0;
11568 return 1;
11569 }
11570 return 0;
11571}
11572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011573PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011574 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011576Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011577string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011578interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579
11580static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011581unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011583 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011584 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011585 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011587 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011588 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011591 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011592 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011594 kind1 = PyUnicode_KIND(self);
11595 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011596 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011597 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011599 len1 = PyUnicode_GET_LENGTH(self);
11600 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011602 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011603 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011604
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011605 buf1 = PyUnicode_DATA(self);
11606 buf2 = PyUnicode_DATA(substring);
11607 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011608 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011609 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011610 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011611 }
11612 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 case PyUnicode_1BYTE_KIND:
11614 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011615 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 buf2, len2, PY_SSIZE_T_MAX
11617 );
11618 break;
11619 case PyUnicode_2BYTE_KIND:
11620 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011621 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622 buf2, len2, PY_SSIZE_T_MAX
11623 );
11624 break;
11625 case PyUnicode_4BYTE_KIND:
11626 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011627 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 buf2, len2, PY_SSIZE_T_MAX
11629 );
11630 break;
11631 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011632 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 }
11634
11635 result = PyLong_FromSsize_t(iresult);
11636
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011637 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011638 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011639 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641 return result;
11642}
11643
INADA Naoki3ae20562017-01-16 20:41:20 +090011644/*[clinic input]
11645str.encode as unicode_encode
11646
11647 encoding: str(c_default="NULL") = 'utf-8'
11648 The encoding in which to encode the string.
11649 errors: str(c_default="NULL") = 'strict'
11650 The error handling scheme to use for encoding errors.
11651 The default is 'strict' meaning that encoding errors raise a
11652 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11653 'xmlcharrefreplace' as well as any other name registered with
11654 codecs.register_error that can handle UnicodeEncodeErrors.
11655
11656Encode the string using the codec registered for encoding.
11657[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658
11659static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011660unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011661/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011663 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011664}
11665
INADA Naoki3ae20562017-01-16 20:41:20 +090011666/*[clinic input]
11667str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668
INADA Naoki3ae20562017-01-16 20:41:20 +090011669 tabsize: int = 8
11670
11671Return a copy where all tab characters are expanded using spaces.
11672
11673If tabsize is not given, a tab size of 8 characters is assumed.
11674[clinic start generated code]*/
11675
11676static PyObject *
11677unicode_expandtabs_impl(PyObject *self, int tabsize)
11678/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011680 Py_ssize_t i, j, line_pos, src_len, incr;
11681 Py_UCS4 ch;
11682 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011683 const void *src_data;
11684 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011685 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011686 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687
Antoine Pitrou22425222011-10-04 19:10:51 +020011688 if (PyUnicode_READY(self) == -1)
11689 return NULL;
11690
Thomas Wouters7e474022000-07-16 12:04:32 +000011691 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011692 src_len = PyUnicode_GET_LENGTH(self);
11693 i = j = line_pos = 0;
11694 kind = PyUnicode_KIND(self);
11695 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011696 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011697 for (; i < src_len; i++) {
11698 ch = PyUnicode_READ(kind, src_data, i);
11699 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011700 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011701 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011702 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011703 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011704 goto overflow;
11705 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011707 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011710 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011711 goto overflow;
11712 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011714 if (ch == '\n' || ch == '\r')
11715 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011717 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011718 if (!found)
11719 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011720
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011722 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723 if (!u)
11724 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011725 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726
Antoine Pitroue71d5742011-10-04 15:55:09 +020011727 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728
Antoine Pitroue71d5742011-10-04 15:55:09 +020011729 for (; i < src_len; i++) {
11730 ch = PyUnicode_READ(kind, src_data, i);
11731 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011732 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011733 incr = tabsize - (line_pos % tabsize);
11734 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011735 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011736 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011737 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011738 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011740 line_pos++;
11741 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011742 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011743 if (ch == '\n' || ch == '\r')
11744 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011746 }
11747 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011748 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011749
Antoine Pitroue71d5742011-10-04 15:55:09 +020011750 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011751 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11752 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753}
11754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011755PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757\n\
11758Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011759such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760arguments start and end are interpreted as in slice notation.\n\
11761\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011762Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
11764static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011767 /* initialize variables to prevent gcc warning */
11768 PyObject *substring = NULL;
11769 Py_ssize_t start = 0;
11770 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011771 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011773 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011776 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011779 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (result == -2)
11782 return NULL;
11783
Christian Heimes217cfd12007-12-02 14:31:20 +000011784 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785}
11786
11787static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011788unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011790 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011791 enum PyUnicode_Kind kind;
11792 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011793
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011794 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011795 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011797 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011798 if (PyUnicode_READY(self) == -1) {
11799 return NULL;
11800 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011801 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11802 PyErr_SetString(PyExc_IndexError, "string index out of range");
11803 return NULL;
11804 }
11805 kind = PyUnicode_KIND(self);
11806 data = PyUnicode_DATA(self);
11807 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011808 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809}
11810
Guido van Rossumc2504932007-09-18 19:42:40 +000011811/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011812 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011813static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011814unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011816 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011817
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011818#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011819 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011820#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 if (_PyUnicode_HASH(self) != -1)
11822 return _PyUnicode_HASH(self);
11823 if (PyUnicode_READY(self) == -1)
11824 return -1;
animalizea1d14252019-01-02 20:16:06 +080011825
Christian Heimes985ecdc2013-11-20 11:46:18 +010011826 x = _Py_HashBytes(PyUnicode_DATA(self),
11827 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011829 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830}
11831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011832PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834\n\
oldkaa0735f2018-02-02 16:52:55 +080011835Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011836such that sub is contained within S[start:end]. Optional\n\
11837arguments start and end are interpreted as in slice notation.\n\
11838\n\
11839Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840
11841static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011844 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011845 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011846 PyObject *substring = NULL;
11847 Py_ssize_t start = 0;
11848 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011850 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011853 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011856 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 if (result == -2)
11859 return NULL;
11860
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861 if (result < 0) {
11862 PyErr_SetString(PyExc_ValueError, "substring not found");
11863 return NULL;
11864 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011865
Christian Heimes217cfd12007-12-02 14:31:20 +000011866 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867}
11868
INADA Naoki3ae20562017-01-16 20:41:20 +090011869/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011870str.isascii as unicode_isascii
11871
11872Return True if all characters in the string are ASCII, False otherwise.
11873
11874ASCII characters have code points in the range U+0000-U+007F.
11875Empty string is ASCII too.
11876[clinic start generated code]*/
11877
11878static PyObject *
11879unicode_isascii_impl(PyObject *self)
11880/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11881{
11882 if (PyUnicode_READY(self) == -1) {
11883 return NULL;
11884 }
11885 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11886}
11887
11888/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011889str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
INADA Naoki3ae20562017-01-16 20:41:20 +090011891Return True if the string is a lowercase string, False otherwise.
11892
11893A string is lowercase if all cased characters in the string are lowercase and
11894there is at least one cased character in the string.
11895[clinic start generated code]*/
11896
11897static PyObject *
11898unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011899/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 Py_ssize_t i, length;
11902 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011903 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904 int cased;
11905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 if (PyUnicode_READY(self) == -1)
11907 return NULL;
11908 length = PyUnicode_GET_LENGTH(self);
11909 kind = PyUnicode_KIND(self);
11910 data = PyUnicode_DATA(self);
11911
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 if (length == 1)
11914 return PyBool_FromLong(
11915 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011917 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011919 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011920
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 for (i = 0; i < length; i++) {
11923 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011924
Benjamin Peterson29060642009-01-31 22:14:21 +000011925 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011926 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011927 else if (!cased && Py_UNICODE_ISLOWER(ch))
11928 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011930 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931}
11932
INADA Naoki3ae20562017-01-16 20:41:20 +090011933/*[clinic input]
11934str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935
INADA Naoki3ae20562017-01-16 20:41:20 +090011936Return True if the string is an uppercase string, False otherwise.
11937
11938A string is uppercase if all cased characters in the string are uppercase and
11939there is at least one cased character in the string.
11940[clinic start generated code]*/
11941
11942static PyObject *
11943unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011944/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 Py_ssize_t i, length;
11947 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011948 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 int cased;
11950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 if (PyUnicode_READY(self) == -1)
11952 return NULL;
11953 length = PyUnicode_GET_LENGTH(self);
11954 kind = PyUnicode_KIND(self);
11955 data = PyUnicode_DATA(self);
11956
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 if (length == 1)
11959 return PyBool_FromLong(
11960 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011962 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011964 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011965
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 for (i = 0; i < length; i++) {
11968 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011969
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011971 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 else if (!cased && Py_UNICODE_ISUPPER(ch))
11973 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011975 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976}
11977
INADA Naoki3ae20562017-01-16 20:41:20 +090011978/*[clinic input]
11979str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980
INADA Naoki3ae20562017-01-16 20:41:20 +090011981Return True if the string is a title-cased string, False otherwise.
11982
11983In a title-cased string, upper- and title-case characters may only
11984follow uncased characters and lowercase characters only cased ones.
11985[clinic start generated code]*/
11986
11987static PyObject *
11988unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011989/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 Py_ssize_t i, length;
11992 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011993 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994 int cased, previous_is_cased;
11995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 if (PyUnicode_READY(self) == -1)
11997 return NULL;
11998 length = PyUnicode_GET_LENGTH(self);
11999 kind = PyUnicode_KIND(self);
12000 data = PyUnicode_DATA(self);
12001
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 if (length == 1) {
12004 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12005 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12006 (Py_UNICODE_ISUPPER(ch) != 0));
12007 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012009 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012011 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012012
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013 cased = 0;
12014 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 for (i = 0; i < length; i++) {
12016 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012017
Benjamin Peterson29060642009-01-31 22:14:21 +000012018 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12019 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012020 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012021 previous_is_cased = 1;
12022 cased = 1;
12023 }
12024 else if (Py_UNICODE_ISLOWER(ch)) {
12025 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012026 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012027 previous_is_cased = 1;
12028 cased = 1;
12029 }
12030 else
12031 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012033 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034}
12035
INADA Naoki3ae20562017-01-16 20:41:20 +090012036/*[clinic input]
12037str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038
INADA Naoki3ae20562017-01-16 20:41:20 +090012039Return True if the string is a whitespace string, False otherwise.
12040
12041A string is whitespace if all characters in the string are whitespace and there
12042is at least one character in the string.
12043[clinic start generated code]*/
12044
12045static PyObject *
12046unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012047/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 Py_ssize_t i, length;
12050 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012051 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052
12053 if (PyUnicode_READY(self) == -1)
12054 return NULL;
12055 length = PyUnicode_GET_LENGTH(self);
12056 kind = PyUnicode_KIND(self);
12057 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 if (length == 1)
12061 return PyBool_FromLong(
12062 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012064 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012065 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012066 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 for (i = 0; i < length; i++) {
12069 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012070 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012071 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012073 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074}
12075
INADA Naoki3ae20562017-01-16 20:41:20 +090012076/*[clinic input]
12077str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012078
INADA Naoki3ae20562017-01-16 20:41:20 +090012079Return True if the string is an alphabetic string, False otherwise.
12080
12081A string is alphabetic if all characters in the string are alphabetic and there
12082is at least one character in the string.
12083[clinic start generated code]*/
12084
12085static PyObject *
12086unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012087/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012088{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 Py_ssize_t i, length;
12090 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012091 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092
12093 if (PyUnicode_READY(self) == -1)
12094 return NULL;
12095 length = PyUnicode_GET_LENGTH(self);
12096 kind = PyUnicode_KIND(self);
12097 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012098
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012099 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 if (length == 1)
12101 return PyBool_FromLong(
12102 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012103
12104 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012106 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 for (i = 0; i < length; i++) {
12109 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012110 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012111 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012112 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012113}
12114
INADA Naoki3ae20562017-01-16 20:41:20 +090012115/*[clinic input]
12116str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012117
INADA Naoki3ae20562017-01-16 20:41:20 +090012118Return True if the string is an alpha-numeric string, False otherwise.
12119
12120A string is alpha-numeric if all characters in the string are alpha-numeric and
12121there is at least one character in the string.
12122[clinic start generated code]*/
12123
12124static PyObject *
12125unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012126/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012127{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012129 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 Py_ssize_t len, i;
12131
12132 if (PyUnicode_READY(self) == -1)
12133 return NULL;
12134
12135 kind = PyUnicode_KIND(self);
12136 data = PyUnicode_DATA(self);
12137 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012138
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012139 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 if (len == 1) {
12141 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12142 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12143 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012144
12145 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012147 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 for (i = 0; i < len; i++) {
12150 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012151 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012152 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012153 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012154 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012155}
12156
INADA Naoki3ae20562017-01-16 20:41:20 +090012157/*[clinic input]
12158str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
INADA Naoki3ae20562017-01-16 20:41:20 +090012160Return True if the string is a decimal string, False otherwise.
12161
12162A string is a decimal string if all characters in the string are decimal and
12163there is at least one character in the string.
12164[clinic start generated code]*/
12165
12166static PyObject *
12167unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012168/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 Py_ssize_t i, length;
12171 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012172 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173
12174 if (PyUnicode_READY(self) == -1)
12175 return NULL;
12176 length = PyUnicode_GET_LENGTH(self);
12177 kind = PyUnicode_KIND(self);
12178 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 if (length == 1)
12182 return PyBool_FromLong(
12183 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012185 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012187 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 for (i = 0; i < length; i++) {
12190 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012191 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012193 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194}
12195
INADA Naoki3ae20562017-01-16 20:41:20 +090012196/*[clinic input]
12197str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198
INADA Naoki3ae20562017-01-16 20:41:20 +090012199Return True if the string is a digit string, False otherwise.
12200
12201A string is a digit string if all characters in the string are digits and there
12202is at least one character in the string.
12203[clinic start generated code]*/
12204
12205static PyObject *
12206unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012207/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 Py_ssize_t i, length;
12210 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012211 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212
12213 if (PyUnicode_READY(self) == -1)
12214 return NULL;
12215 length = PyUnicode_GET_LENGTH(self);
12216 kind = PyUnicode_KIND(self);
12217 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 if (length == 1) {
12221 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12222 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012225 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012227 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 for (i = 0; i < length; i++) {
12230 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012231 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012233 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234}
12235
INADA Naoki3ae20562017-01-16 20:41:20 +090012236/*[clinic input]
12237str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238
INADA Naoki3ae20562017-01-16 20:41:20 +090012239Return True if the string is a numeric string, False otherwise.
12240
12241A string is numeric if all characters in the string are numeric and there is at
12242least one character in the string.
12243[clinic start generated code]*/
12244
12245static PyObject *
12246unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012247/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 Py_ssize_t i, length;
12250 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012251 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252
12253 if (PyUnicode_READY(self) == -1)
12254 return NULL;
12255 length = PyUnicode_GET_LENGTH(self);
12256 kind = PyUnicode_KIND(self);
12257 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012260 if (length == 1)
12261 return PyBool_FromLong(
12262 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012264 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012266 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 for (i = 0; i < length; i++) {
12269 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012270 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012272 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273}
12274
Martin v. Löwis47383402007-08-15 07:32:56 +000012275int
12276PyUnicode_IsIdentifier(PyObject *self)
12277{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 Py_ssize_t i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012279 int ready = PyUnicode_IS_READY(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012280
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012281 Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
12282 if (len == 0) {
12283 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012284 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 }
12286
Hai Shi3d235f52020-02-17 21:41:15 +080012287 int kind = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012288 const void *data = NULL;
Andy Lester933fc53f2020-02-20 22:51:47 -060012289 const wchar_t *wstr = NULL;
12290 Py_UCS4 ch;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012291 if (ready) {
12292 kind = PyUnicode_KIND(self);
12293 data = PyUnicode_DATA(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012294 ch = PyUnicode_READ(kind, data, 0);
12295 }
12296 else {
Andy Lester933fc53f2020-02-20 22:51:47 -060012297 wstr = _PyUnicode_WSTR(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012298 ch = wstr[0];
12299 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012300 /* PEP 3131 says that the first character must be in
12301 XID_Start and subsequent characters in XID_Continue,
12302 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012303 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012304 letters, digits, underscore). However, given the current
12305 definition of XID_Start and XID_Continue, it is sufficient
12306 to check just for these, except that _ must be allowed
12307 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012308 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012309 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012310 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012311
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012312 for (i = 1; i < len; i++) {
12313 if (ready) {
12314 ch = PyUnicode_READ(kind, data, i);
12315 }
12316 else {
12317 ch = wstr[i];
12318 }
12319 if (!_PyUnicode_IsXidContinue(ch)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012320 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012321 }
12322 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012323 return 1;
12324}
12325
INADA Naoki3ae20562017-01-16 20:41:20 +090012326/*[clinic input]
12327str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012328
INADA Naoki3ae20562017-01-16 20:41:20 +090012329Return True if the string is a valid Python identifier, False otherwise.
12330
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012331Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012332such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012333[clinic start generated code]*/
12334
12335static PyObject *
12336unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012337/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012338{
12339 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12340}
12341
INADA Naoki3ae20562017-01-16 20:41:20 +090012342/*[clinic input]
12343str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012344
INADA Naoki3ae20562017-01-16 20:41:20 +090012345Return True if the string is printable, False otherwise.
12346
12347A string is printable if all of its characters are considered printable in
12348repr() or if it is empty.
12349[clinic start generated code]*/
12350
12351static PyObject *
12352unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012353/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 Py_ssize_t i, length;
12356 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012357 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358
12359 if (PyUnicode_READY(self) == -1)
12360 return NULL;
12361 length = PyUnicode_GET_LENGTH(self);
12362 kind = PyUnicode_KIND(self);
12363 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012364
12365 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 if (length == 1)
12367 return PyBool_FromLong(
12368 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 for (i = 0; i < length; i++) {
12371 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012372 Py_RETURN_FALSE;
12373 }
12374 }
12375 Py_RETURN_TRUE;
12376}
12377
INADA Naoki3ae20562017-01-16 20:41:20 +090012378/*[clinic input]
12379str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380
INADA Naoki3ae20562017-01-16 20:41:20 +090012381 iterable: object
12382 /
12383
12384Concatenate any number of strings.
12385
Martin Panter91a88662017-01-24 00:30:06 +000012386The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012387The result is returned as a new string.
12388
12389Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12390[clinic start generated code]*/
12391
12392static PyObject *
12393unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012394/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012395{
INADA Naoki3ae20562017-01-16 20:41:20 +090012396 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397}
12398
Martin v. Löwis18e16552006-02-15 17:27:45 +000012399static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012400unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012401{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 if (PyUnicode_READY(self) == -1)
12403 return -1;
12404 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012405}
12406
INADA Naoki3ae20562017-01-16 20:41:20 +090012407/*[clinic input]
12408str.ljust as unicode_ljust
12409
12410 width: Py_ssize_t
12411 fillchar: Py_UCS4 = ' '
12412 /
12413
12414Return a left-justified string of length width.
12415
12416Padding is done using the specified fill character (default is a space).
12417[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012418
12419static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012420unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12421/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012423 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012424 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012425
Victor Stinnerc4b49542011-12-11 22:44:26 +010012426 if (PyUnicode_GET_LENGTH(self) >= width)
12427 return unicode_result_unchanged(self);
12428
12429 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012430}
12431
INADA Naoki3ae20562017-01-16 20:41:20 +090012432/*[clinic input]
12433str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434
INADA Naoki3ae20562017-01-16 20:41:20 +090012435Return a copy of the string converted to lowercase.
12436[clinic start generated code]*/
12437
12438static PyObject *
12439unicode_lower_impl(PyObject *self)
12440/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012441{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012442 if (PyUnicode_READY(self) == -1)
12443 return NULL;
12444 if (PyUnicode_IS_ASCII(self))
12445 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012446 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447}
12448
/* Strip modes accepted by the strip helpers: which end(s) of the string
   are trimmed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  The table is never
   modified, so both the strings and the pointers are const. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method corresponding to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012457
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012458/* externally visible for str.strip(unicode) */
12459PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012460_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012461{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012462 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 int kind;
12464 Py_ssize_t i, j, len;
12465 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012466 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12469 return NULL;
12470
12471 kind = PyUnicode_KIND(self);
12472 data = PyUnicode_DATA(self);
12473 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012474 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12476 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012477 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012478
Benjamin Peterson14339b62009-01-31 16:36:08 +000012479 i = 0;
12480 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012481 while (i < len) {
12482 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12483 if (!BLOOM(sepmask, ch))
12484 break;
12485 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12486 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 i++;
12488 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012489 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012490
Benjamin Peterson14339b62009-01-31 16:36:08 +000012491 j = len;
12492 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012493 j--;
12494 while (j >= i) {
12495 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12496 if (!BLOOM(sepmask, ch))
12497 break;
12498 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12499 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012500 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012501 }
12502
Benjamin Peterson29060642009-01-31 22:14:21 +000012503 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012504 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012505
Victor Stinner7931d9a2011-11-04 00:22:48 +010012506 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507}
12508
12509PyObject*
12510PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12511{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012512 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012514 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515
Victor Stinnerde636f32011-10-01 03:55:54 +020012516 if (PyUnicode_READY(self) == -1)
12517 return NULL;
12518
Victor Stinner684d5fd2012-05-03 02:32:34 +020012519 length = PyUnicode_GET_LENGTH(self);
12520 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012521
Victor Stinner684d5fd2012-05-03 02:32:34 +020012522 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012523 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524
Victor Stinnerde636f32011-10-01 03:55:54 +020012525 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012526 PyErr_SetString(PyExc_IndexError, "string index out of range");
12527 return NULL;
12528 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012529 if (start >= length || end < start)
12530 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012531
Victor Stinner684d5fd2012-05-03 02:32:34 +020012532 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012533 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012534 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012535 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012536 }
12537 else {
12538 kind = PyUnicode_KIND(self);
12539 data = PyUnicode_1BYTE_DATA(self);
12540 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012541 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012542 length);
12543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545
12546static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012547do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 Py_ssize_t len, i, j;
12550
12551 if (PyUnicode_READY(self) == -1)
12552 return NULL;
12553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012555
Victor Stinnercc7af722013-04-09 22:39:24 +020012556 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012557 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012558
12559 i = 0;
12560 if (striptype != RIGHTSTRIP) {
12561 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012562 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012563 if (!_Py_ascii_whitespace[ch])
12564 break;
12565 i++;
12566 }
12567 }
12568
12569 j = len;
12570 if (striptype != LEFTSTRIP) {
12571 j--;
12572 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012573 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012574 if (!_Py_ascii_whitespace[ch])
12575 break;
12576 j--;
12577 }
12578 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012579 }
12580 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012581 else {
12582 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012583 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012584
Victor Stinnercc7af722013-04-09 22:39:24 +020012585 i = 0;
12586 if (striptype != RIGHTSTRIP) {
12587 while (i < len) {
12588 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12589 if (!Py_UNICODE_ISSPACE(ch))
12590 break;
12591 i++;
12592 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012593 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012594
12595 j = len;
12596 if (striptype != LEFTSTRIP) {
12597 j--;
12598 while (j >= i) {
12599 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12600 if (!Py_UNICODE_ISSPACE(ch))
12601 break;
12602 j--;
12603 }
12604 j++;
12605 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012606 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012607
Victor Stinner7931d9a2011-11-04 00:22:48 +010012608 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609}
12610
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012611
12612static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012613do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012614{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012615 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012616 if (PyUnicode_Check(sep))
12617 return _PyUnicode_XStrip(self, striptype, sep);
12618 else {
12619 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012620 "%s arg must be None or str",
12621 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012622 return NULL;
12623 }
12624 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012625
Benjamin Peterson14339b62009-01-31 16:36:08 +000012626 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012627}
12628
12629
INADA Naoki3ae20562017-01-16 20:41:20 +090012630/*[clinic input]
12631str.strip as unicode_strip
12632
12633 chars: object = None
12634 /
12635
Zachary Ware09895c22019-10-09 16:09:00 -050012636Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012637
12638If chars is given and not None, remove characters in chars instead.
12639[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012640
12641static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012642unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012643/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012644{
INADA Naoki3ae20562017-01-16 20:41:20 +090012645 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012646}
12647
12648
INADA Naoki3ae20562017-01-16 20:41:20 +090012649/*[clinic input]
12650str.lstrip as unicode_lstrip
12651
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012652 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012653 /
12654
12655Return a copy of the string with leading whitespace removed.
12656
12657If chars is given and not None, remove characters in chars instead.
12658[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012659
12660static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012661unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012662/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012663{
INADA Naoki3ae20562017-01-16 20:41:20 +090012664 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012665}
12666
12667
INADA Naoki3ae20562017-01-16 20:41:20 +090012668/*[clinic input]
12669str.rstrip as unicode_rstrip
12670
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012671 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012672 /
12673
12674Return a copy of the string with trailing whitespace removed.
12675
12676If chars is given and not None, remove characters in chars instead.
12677[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012678
12679static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012680unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012681/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012682{
INADA Naoki3ae20562017-01-16 20:41:20 +090012683 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012684}
12685
12686
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012688unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012690 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692
Serhiy Storchaka05997252013-01-26 12:14:02 +020012693 if (len < 1)
12694 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695
Victor Stinnerc4b49542011-12-11 22:44:26 +010012696 /* no repeat, return original string */
12697 if (len == 1)
12698 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012699
Benjamin Petersonbac79492012-01-14 13:34:47 -050012700 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 return NULL;
12702
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012703 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012704 PyErr_SetString(PyExc_OverflowError,
12705 "repeated string is too long");
12706 return NULL;
12707 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012709
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012710 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711 if (!u)
12712 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012713 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012716 int kind = PyUnicode_KIND(str);
12717 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012718 if (kind == PyUnicode_1BYTE_KIND) {
12719 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012720 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012721 }
12722 else if (kind == PyUnicode_2BYTE_KIND) {
12723 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012724 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012725 ucs2[n] = fill_char;
12726 } else {
12727 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12728 assert(kind == PyUnicode_4BYTE_KIND);
12729 for (n = 0; n < len; ++n)
12730 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012731 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 }
12733 else {
12734 /* number of characters copied this far */
12735 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012736 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012738 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012742 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012743 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745 }
12746
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012747 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012748 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749}
12750
Alexander Belopolsky40018472011-02-26 01:02:56 +000012751PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012752PyUnicode_Replace(PyObject *str,
12753 PyObject *substr,
12754 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012755 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012757 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12758 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012759 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012760 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761}
12762
INADA Naoki3ae20562017-01-16 20:41:20 +090012763/*[clinic input]
12764str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765
INADA Naoki3ae20562017-01-16 20:41:20 +090012766 old: unicode
12767 new: unicode
12768 count: Py_ssize_t = -1
12769 Maximum number of occurrences to replace.
12770 -1 (the default value) means replace all occurrences.
12771 /
12772
12773Return a copy with all occurrences of substring old replaced by new.
12774
12775If the optional argument count is given, only the first count occurrences are
12776replaced.
12777[clinic start generated code]*/
12778
12779static PyObject *
12780unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12781 Py_ssize_t count)
12782/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012783{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012784 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012785 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012786 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787}
12788
Alexander Belopolsky40018472011-02-26 01:02:56 +000012789static PyObject *
12790unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012792 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 Py_ssize_t isize;
12794 Py_ssize_t osize, squote, dquote, i, o;
12795 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012796 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012797 const void *idata;
12798 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012801 return NULL;
12802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803 isize = PyUnicode_GET_LENGTH(unicode);
12804 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012806 /* Compute length of output, quote characters, and
12807 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012808 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809 max = 127;
12810 squote = dquote = 0;
12811 ikind = PyUnicode_KIND(unicode);
12812 for (i = 0; i < isize; i++) {
12813 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012814 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012816 case '\'': squote++; break;
12817 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012819 incr = 2;
12820 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 default:
12822 /* Fast-path ASCII */
12823 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012824 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012826 ;
12827 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012829 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012830 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012832 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012833 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012834 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012836 if (osize > PY_SSIZE_T_MAX - incr) {
12837 PyErr_SetString(PyExc_OverflowError,
12838 "string is too long to generate repr");
12839 return NULL;
12840 }
12841 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012842 }
12843
12844 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012845 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012846 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012847 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012848 if (dquote)
12849 /* Both squote and dquote present. Use squote,
12850 and escape them */
12851 osize += squote;
12852 else
12853 quote = '"';
12854 }
Victor Stinner55c08782013-04-14 18:45:39 +020012855 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856
12857 repr = PyUnicode_New(osize, max);
12858 if (repr == NULL)
12859 return NULL;
12860 okind = PyUnicode_KIND(repr);
12861 odata = PyUnicode_DATA(repr);
12862
12863 PyUnicode_WRITE(okind, odata, 0, quote);
12864 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012865 if (unchanged) {
12866 _PyUnicode_FastCopyCharacters(repr, 1,
12867 unicode, 0,
12868 isize);
12869 }
12870 else {
12871 for (i = 0, o = 1; i < isize; i++) {
12872 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873
Victor Stinner55c08782013-04-14 18:45:39 +020012874 /* Escape quotes and backslashes */
12875 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012876 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012877 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012878 continue;
12879 }
12880
12881 /* Map special whitespace to '\t', \n', '\r' */
12882 if (ch == '\t') {
12883 PyUnicode_WRITE(okind, odata, o++, '\\');
12884 PyUnicode_WRITE(okind, odata, o++, 't');
12885 }
12886 else if (ch == '\n') {
12887 PyUnicode_WRITE(okind, odata, o++, '\\');
12888 PyUnicode_WRITE(okind, odata, o++, 'n');
12889 }
12890 else if (ch == '\r') {
12891 PyUnicode_WRITE(okind, odata, o++, '\\');
12892 PyUnicode_WRITE(okind, odata, o++, 'r');
12893 }
12894
12895 /* Map non-printable US ASCII to '\xhh' */
12896 else if (ch < ' ' || ch == 0x7F) {
12897 PyUnicode_WRITE(okind, odata, o++, '\\');
12898 PyUnicode_WRITE(okind, odata, o++, 'x');
12899 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12900 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12901 }
12902
12903 /* Copy ASCII characters as-is */
12904 else if (ch < 0x7F) {
12905 PyUnicode_WRITE(okind, odata, o++, ch);
12906 }
12907
12908 /* Non-ASCII characters */
12909 else {
12910 /* Map Unicode whitespace and control characters
12911 (categories Z* and C* except ASCII space)
12912 */
12913 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12914 PyUnicode_WRITE(okind, odata, o++, '\\');
12915 /* Map 8-bit characters to '\xhh' */
12916 if (ch <= 0xff) {
12917 PyUnicode_WRITE(okind, odata, o++, 'x');
12918 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12919 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12920 }
12921 /* Map 16-bit characters to '\uxxxx' */
12922 else if (ch <= 0xffff) {
12923 PyUnicode_WRITE(okind, odata, o++, 'u');
12924 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12925 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12926 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12927 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12928 }
12929 /* Map 21-bit characters to '\U00xxxxxx' */
12930 else {
12931 PyUnicode_WRITE(okind, odata, o++, 'U');
12932 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12933 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12934 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12935 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12936 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12937 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12938 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12939 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12940 }
12941 }
12942 /* Copy characters as-is */
12943 else {
12944 PyUnicode_WRITE(okind, odata, o++, ch);
12945 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012946 }
12947 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012950 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012951 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952}
12953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012954PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012955 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012956\n\
12957Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012958such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012959arguments start and end are interpreted as in slice notation.\n\
12960\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012961Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012962
12963static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012966 /* initialize variables to prevent gcc warning */
12967 PyObject *substring = NULL;
12968 Py_ssize_t start = 0;
12969 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012970 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012972 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012973 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012974
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012975 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012978 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 if (result == -2)
12981 return NULL;
12982
Christian Heimes217cfd12007-12-02 14:31:20 +000012983 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984}
12985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012986PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012987 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012988\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012989Return the highest index in S where substring sub is found,\n\
12990such that sub is contained within S[start:end]. Optional\n\
12991arguments start and end are interpreted as in slice notation.\n\
12992\n\
12993Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012994
12995static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012997{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012998 /* initialize variables to prevent gcc warning */
12999 PyObject *substring = NULL;
13000 Py_ssize_t start = 0;
13001 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013002 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013003
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013004 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013005 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013007 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013010 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 if (result == -2)
13013 return NULL;
13014
Guido van Rossumd57fd912000-03-10 22:53:23 +000013015 if (result < 0) {
13016 PyErr_SetString(PyExc_ValueError, "substring not found");
13017 return NULL;
13018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019
Christian Heimes217cfd12007-12-02 14:31:20 +000013020 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013021}
13022
INADA Naoki3ae20562017-01-16 20:41:20 +090013023/*[clinic input]
13024str.rjust as unicode_rjust
13025
13026 width: Py_ssize_t
13027 fillchar: Py_UCS4 = ' '
13028 /
13029
13030Return a right-justified string of length width.
13031
13032Padding is done using the specified fill character (default is a space).
13033[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013034
13035static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013036unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13037/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013038{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013039 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013040 return NULL;
13041
Victor Stinnerc4b49542011-12-11 22:44:26 +010013042 if (PyUnicode_GET_LENGTH(self) >= width)
13043 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013044
Victor Stinnerc4b49542011-12-11 22:44:26 +010013045 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046}
13047
Alexander Belopolsky40018472011-02-26 01:02:56 +000013048PyObject *
13049PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013050{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013051 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013052 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013054 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013055}
13056
INADA Naoki3ae20562017-01-16 20:41:20 +090013057/*[clinic input]
13058str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013059
INADA Naoki3ae20562017-01-16 20:41:20 +090013060 sep: object = None
13061 The delimiter according which to split the string.
13062 None (the default value) means split according to any whitespace,
13063 and discard empty strings from the result.
13064 maxsplit: Py_ssize_t = -1
13065 Maximum number of splits to do.
13066 -1 (the default value) means no limit.
13067
13068Return a list of the words in the string, using sep as the delimiter string.
13069[clinic start generated code]*/
13070
13071static PyObject *
13072unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13073/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074{
INADA Naoki3ae20562017-01-16 20:41:20 +090013075 if (sep == Py_None)
13076 return split(self, NULL, maxsplit);
13077 if (PyUnicode_Check(sep))
13078 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013079
Victor Stinner998b8062018-09-12 00:23:25 +020013080 PyErr_Format(PyExc_TypeError,
13081 "must be str or None, not %.100s",
13082 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013083 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084}
13085
Thomas Wouters477c8d52006-05-27 19:21:47 +000013086PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013087PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013088{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013089 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013090 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013091 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013093
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013094 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013095 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013096
Victor Stinner14f8f022011-10-05 20:58:25 +020013097 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013098 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099 len1 = PyUnicode_GET_LENGTH(str_obj);
13100 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013101 if (kind1 < kind2 || len1 < len2) {
13102 _Py_INCREF_UNICODE_EMPTY();
13103 if (!unicode_empty)
13104 out = NULL;
13105 else {
13106 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13107 Py_DECREF(unicode_empty);
13108 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013109 return out;
13110 }
13111 buf1 = PyUnicode_DATA(str_obj);
13112 buf2 = PyUnicode_DATA(sep_obj);
13113 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013114 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013115 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013116 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013118
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013119 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013120 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013121 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13122 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13123 else
13124 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013125 break;
13126 case PyUnicode_2BYTE_KIND:
13127 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13128 break;
13129 case PyUnicode_4BYTE_KIND:
13130 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13131 break;
13132 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013133 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013135
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013136 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013137 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013138 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013139
13140 return out;
13141}
13142
13143
13144PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013145PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013146{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013147 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013148 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013149 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013150 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013151
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013152 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013153 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013154
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013155 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013157 len1 = PyUnicode_GET_LENGTH(str_obj);
13158 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013159 if (kind1 < kind2 || len1 < len2) {
13160 _Py_INCREF_UNICODE_EMPTY();
13161 if (!unicode_empty)
13162 out = NULL;
13163 else {
13164 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13165 Py_DECREF(unicode_empty);
13166 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013167 return out;
13168 }
13169 buf1 = PyUnicode_DATA(str_obj);
13170 buf2 = PyUnicode_DATA(sep_obj);
13171 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013172 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013173 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013174 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013175 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013176
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013177 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013178 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013179 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13180 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13181 else
13182 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013183 break;
13184 case PyUnicode_2BYTE_KIND:
13185 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13186 break;
13187 case PyUnicode_4BYTE_KIND:
13188 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13189 break;
13190 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013191 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013193
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013194 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013195 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013196 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013197
13198 return out;
13199}
13200
INADA Naoki3ae20562017-01-16 20:41:20 +090013201/*[clinic input]
13202str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013203
INADA Naoki3ae20562017-01-16 20:41:20 +090013204 sep: object
13205 /
13206
13207Partition the string into three parts using the given separator.
13208
13209This will search for the separator in the string. If the separator is found,
13210returns a 3-tuple containing the part before the separator, the separator
13211itself, and the part after it.
13212
13213If the separator is not found, returns a 3-tuple containing the original string
13214and two empty strings.
13215[clinic start generated code]*/
13216
13217static PyObject *
13218unicode_partition(PyObject *self, PyObject *sep)
13219/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013220{
INADA Naoki3ae20562017-01-16 20:41:20 +090013221 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013222}
13223
INADA Naoki3ae20562017-01-16 20:41:20 +090013224/*[clinic input]
13225str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013226
INADA Naoki3ae20562017-01-16 20:41:20 +090013227Partition the string into three parts using the given separator.
13228
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013229This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013230the separator is found, returns a 3-tuple containing the part before the
13231separator, the separator itself, and the part after it.
13232
13233If the separator is not found, returns a 3-tuple containing two empty strings
13234and the original string.
13235[clinic start generated code]*/
13236
13237static PyObject *
13238unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013239/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013240{
INADA Naoki3ae20562017-01-16 20:41:20 +090013241 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013242}
13243
Alexander Belopolsky40018472011-02-26 01:02:56 +000013244PyObject *
13245PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013246{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013247 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013248 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013249
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013250 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013251}
13252
INADA Naoki3ae20562017-01-16 20:41:20 +090013253/*[clinic input]
13254str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013255
INADA Naoki3ae20562017-01-16 20:41:20 +090013256Return a list of the words in the string, using sep as the delimiter string.
13257
13258Splits are done starting at the end of the string and working to the front.
13259[clinic start generated code]*/
13260
13261static PyObject *
13262unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13263/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013264{
INADA Naoki3ae20562017-01-16 20:41:20 +090013265 if (sep == Py_None)
13266 return rsplit(self, NULL, maxsplit);
13267 if (PyUnicode_Check(sep))
13268 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013269
Victor Stinner998b8062018-09-12 00:23:25 +020013270 PyErr_Format(PyExc_TypeError,
13271 "must be str or None, not %.100s",
13272 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013273 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013274}
13275
INADA Naoki3ae20562017-01-16 20:41:20 +090013276/*[clinic input]
13277str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013279 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013280
13281Return a list of the lines in the string, breaking at line boundaries.
13282
13283Line breaks are not included in the resulting list unless keepends is given and
13284true.
13285[clinic start generated code]*/
13286
13287static PyObject *
13288unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013289/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013290{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013291 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292}
13293
13294static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013295PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013297 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298}
13299
INADA Naoki3ae20562017-01-16 20:41:20 +090013300/*[clinic input]
13301str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013302
INADA Naoki3ae20562017-01-16 20:41:20 +090013303Convert uppercase characters to lowercase and lowercase characters to uppercase.
13304[clinic start generated code]*/
13305
13306static PyObject *
13307unicode_swapcase_impl(PyObject *self)
13308/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013310 if (PyUnicode_READY(self) == -1)
13311 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013312 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313}
13314
Larry Hastings61272b72014-01-07 12:41:53 -080013315/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013316
Larry Hastings31826802013-10-19 00:09:25 -070013317@staticmethod
13318str.maketrans as unicode_maketrans
13319
13320 x: object
13321
13322 y: unicode=NULL
13323
13324 z: unicode=NULL
13325
13326 /
13327
13328Return a translation table usable for str.translate().
13329
13330If there is only one argument, it must be a dictionary mapping Unicode
13331ordinals (integers) or characters to Unicode ordinals, strings or None.
13332Character keys will be then converted to ordinals.
13333If there are two arguments, they must be strings of equal length, and
13334in the resulting dictionary, each character in x will be mapped to the
13335character at the same position in y. If there is a third argument, it
13336must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013337[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013338
Larry Hastings31826802013-10-19 00:09:25 -070013339static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013340unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013341/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013342{
Georg Brandlceee0772007-11-27 23:48:05 +000013343 PyObject *new = NULL, *key, *value;
13344 Py_ssize_t i = 0;
13345 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013346
Georg Brandlceee0772007-11-27 23:48:05 +000013347 new = PyDict_New();
13348 if (!new)
13349 return NULL;
13350 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013351 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013352 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013353
Georg Brandlceee0772007-11-27 23:48:05 +000013354 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013355 if (!PyUnicode_Check(x)) {
13356 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13357 "be a string if there is a second argument");
13358 goto err;
13359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013360 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013361 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13362 "arguments must have equal length");
13363 goto err;
13364 }
13365 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013366 x_kind = PyUnicode_KIND(x);
13367 y_kind = PyUnicode_KIND(y);
13368 x_data = PyUnicode_DATA(x);
13369 y_data = PyUnicode_DATA(y);
13370 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13371 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013372 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013373 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013374 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013375 if (!value) {
13376 Py_DECREF(key);
13377 goto err;
13378 }
Georg Brandlceee0772007-11-27 23:48:05 +000013379 res = PyDict_SetItem(new, key, value);
13380 Py_DECREF(key);
13381 Py_DECREF(value);
13382 if (res < 0)
13383 goto err;
13384 }
13385 /* create entries for deleting chars in z */
13386 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013387 z_kind = PyUnicode_KIND(z);
13388 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013389 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013390 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013391 if (!key)
13392 goto err;
13393 res = PyDict_SetItem(new, key, Py_None);
13394 Py_DECREF(key);
13395 if (res < 0)
13396 goto err;
13397 }
13398 }
13399 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013400 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013401 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013402
Georg Brandlceee0772007-11-27 23:48:05 +000013403 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013404 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013405 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13406 "to maketrans it must be a dict");
13407 goto err;
13408 }
13409 /* copy entries into the new dict, converting string keys to int keys */
13410 while (PyDict_Next(x, &i, &key, &value)) {
13411 if (PyUnicode_Check(key)) {
13412 /* convert string keys to integer keys */
13413 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013414 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013415 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13416 "table must be of length 1");
13417 goto err;
13418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013419 kind = PyUnicode_KIND(key);
13420 data = PyUnicode_DATA(key);
13421 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013422 if (!newkey)
13423 goto err;
13424 res = PyDict_SetItem(new, newkey, value);
13425 Py_DECREF(newkey);
13426 if (res < 0)
13427 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013428 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013429 /* just keep integer keys */
13430 if (PyDict_SetItem(new, key, value) < 0)
13431 goto err;
13432 } else {
13433 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13434 "be strings or integers");
13435 goto err;
13436 }
13437 }
13438 }
13439 return new;
13440 err:
13441 Py_DECREF(new);
13442 return NULL;
13443}
13444
INADA Naoki3ae20562017-01-16 20:41:20 +090013445/*[clinic input]
13446str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013447
INADA Naoki3ae20562017-01-16 20:41:20 +090013448 table: object
13449 Translation table, which must be a mapping of Unicode ordinals to
13450 Unicode ordinals, strings, or None.
13451 /
13452
13453Replace each character in the string using the given translation table.
13454
13455The table must implement lookup/indexing via __getitem__, for instance a
13456dictionary or list. If this operation raises LookupError, the character is
13457left untouched. Characters mapped to None are deleted.
13458[clinic start generated code]*/
13459
13460static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013461unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013462/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013463{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013464 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013465}
13466
INADA Naoki3ae20562017-01-16 20:41:20 +090013467/*[clinic input]
13468str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013469
INADA Naoki3ae20562017-01-16 20:41:20 +090013470Return a copy of the string converted to uppercase.
13471[clinic start generated code]*/
13472
13473static PyObject *
13474unicode_upper_impl(PyObject *self)
13475/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013476{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013477 if (PyUnicode_READY(self) == -1)
13478 return NULL;
13479 if (PyUnicode_IS_ASCII(self))
13480 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013481 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013482}
13483
INADA Naoki3ae20562017-01-16 20:41:20 +090013484/*[clinic input]
13485str.zfill as unicode_zfill
13486
13487 width: Py_ssize_t
13488 /
13489
13490Pad a numeric string with zeros on the left, to fill a field of the given width.
13491
13492The string is never truncated.
13493[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013494
13495static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013496unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013497/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013498{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013499 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013500 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013501 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013502 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013503 Py_UCS4 chr;
13504
Benjamin Petersonbac79492012-01-14 13:34:47 -050013505 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013506 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013507
Victor Stinnerc4b49542011-12-11 22:44:26 +010013508 if (PyUnicode_GET_LENGTH(self) >= width)
13509 return unicode_result_unchanged(self);
13510
13511 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013512
13513 u = pad(self, fill, 0, '0');
13514
Walter Dörwald068325e2002-04-15 13:36:47 +000013515 if (u == NULL)
13516 return NULL;
13517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013518 kind = PyUnicode_KIND(u);
13519 data = PyUnicode_DATA(u);
13520 chr = PyUnicode_READ(kind, data, fill);
13521
13522 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013523 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013524 PyUnicode_WRITE(kind, data, 0, chr);
13525 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013526 }
13527
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013528 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013529 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013530}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013531
#if 0
/* Compiled out by the surrounding #if 0.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013540PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013542\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013543Return True if S starts with the specified prefix, False otherwise.\n\
13544With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013545With optional end, stop comparing S at that position.\n\
13546prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013547
13548static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013549unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013550 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013551{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013552 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013553 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013554 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013555 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013556 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013557
Jesus Ceaac451502011-04-20 17:09:23 +020013558 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013559 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013560 if (PyTuple_Check(subobj)) {
13561 Py_ssize_t i;
13562 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013563 substring = PyTuple_GET_ITEM(subobj, i);
13564 if (!PyUnicode_Check(substring)) {
13565 PyErr_Format(PyExc_TypeError,
13566 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013567 "not %.100s",
13568 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013569 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013570 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013571 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013572 if (result == -1)
13573 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013574 if (result) {
13575 Py_RETURN_TRUE;
13576 }
13577 }
13578 /* nothing matched */
13579 Py_RETURN_FALSE;
13580 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013581 if (!PyUnicode_Check(subobj)) {
13582 PyErr_Format(PyExc_TypeError,
13583 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013584 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013585 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013586 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013587 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013588 if (result == -1)
13589 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013590 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013591}
13592
13593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013594PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013595 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013596\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013597Return True if S ends with the specified suffix, False otherwise.\n\
13598With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013599With optional end, stop comparing S at that position.\n\
13600suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013601
13602static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013603unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013604 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013605{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013606 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013607 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013608 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013609 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013610 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013611
Jesus Ceaac451502011-04-20 17:09:23 +020013612 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013613 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013614 if (PyTuple_Check(subobj)) {
13615 Py_ssize_t i;
13616 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013617 substring = PyTuple_GET_ITEM(subobj, i);
13618 if (!PyUnicode_Check(substring)) {
13619 PyErr_Format(PyExc_TypeError,
13620 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013621 "not %.100s",
13622 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013623 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013624 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013625 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013626 if (result == -1)
13627 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013628 if (result) {
13629 Py_RETURN_TRUE;
13630 }
13631 }
13632 Py_RETURN_FALSE;
13633 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013634 if (!PyUnicode_Check(subobj)) {
13635 PyErr_Format(PyExc_TypeError,
13636 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013637 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013638 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013639 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013640 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013641 if (result == -1)
13642 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013643 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013644}
13645
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013646static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013647_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013648{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013649 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13650 writer->data = PyUnicode_DATA(writer->buffer);
13651
13652 if (!writer->readonly) {
13653 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013654 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013655 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013656 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013657 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13658 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13659 writer->kind = PyUnicode_WCHAR_KIND;
13660 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13661
Victor Stinner8f674cc2013-04-17 23:02:17 +020013662 /* Copy-on-write mode: set buffer size to 0 so
13663 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13664 * next write. */
13665 writer->size = 0;
13666 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013667}
13668
Victor Stinnerd3f08822012-05-29 12:57:52 +020013669void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013670_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013671{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013672 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013673
13674 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013675 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013676
13677 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13678 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13679 writer->kind = PyUnicode_WCHAR_KIND;
13680 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013681}
13682
Inada Naoki770847a2019-06-24 12:30:24 +090013683// Initialize _PyUnicodeWriter with initial buffer
13684static inline void
13685_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13686{
13687 memset(writer, 0, sizeof(*writer));
13688 writer->buffer = buffer;
13689 _PyUnicodeWriter_Update(writer);
13690 writer->min_length = writer->size;
13691}
13692
Victor Stinnerd3f08822012-05-29 12:57:52 +020013693int
13694_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13695 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013696{
13697 Py_ssize_t newlen;
13698 PyObject *newbuffer;
13699
Victor Stinner2740e462016-09-06 16:58:36 -070013700 assert(maxchar <= MAX_UNICODE);
13701
Victor Stinnerca9381e2015-09-22 00:58:32 +020013702 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013703 assert((maxchar > writer->maxchar && length >= 0)
13704 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013705
Victor Stinner202fdca2012-05-07 12:47:02 +020013706 if (length > PY_SSIZE_T_MAX - writer->pos) {
13707 PyErr_NoMemory();
13708 return -1;
13709 }
13710 newlen = writer->pos + length;
13711
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013712 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013713
Victor Stinnerd3f08822012-05-29 12:57:52 +020013714 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013715 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013716 if (writer->overallocate
13717 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13718 /* overallocate to limit the number of realloc() */
13719 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013720 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013721 if (newlen < writer->min_length)
13722 newlen = writer->min_length;
13723
Victor Stinnerd3f08822012-05-29 12:57:52 +020013724 writer->buffer = PyUnicode_New(newlen, maxchar);
13725 if (writer->buffer == NULL)
13726 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013727 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013728 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013729 if (writer->overallocate
13730 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13731 /* overallocate to limit the number of realloc() */
13732 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013733 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013734 if (newlen < writer->min_length)
13735 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013736
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013737 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013738 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013739 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013740 newbuffer = PyUnicode_New(newlen, maxchar);
13741 if (newbuffer == NULL)
13742 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013743 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13744 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013745 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013746 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013747 }
13748 else {
13749 newbuffer = resize_compact(writer->buffer, newlen);
13750 if (newbuffer == NULL)
13751 return -1;
13752 }
13753 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013754 }
13755 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013756 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013757 newbuffer = PyUnicode_New(writer->size, maxchar);
13758 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013759 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013760 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13761 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013762 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013763 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013764 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013765 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013766
13767#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013768}
13769
Victor Stinnerca9381e2015-09-22 00:58:32 +020013770int
13771_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13772 enum PyUnicode_Kind kind)
13773{
13774 Py_UCS4 maxchar;
13775
13776 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13777 assert(writer->kind < kind);
13778
13779 switch (kind)
13780 {
13781 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13782 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13783 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13784 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013785 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013786 }
13787
13788 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13789}
13790
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013791static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013792_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013793{
Victor Stinner2740e462016-09-06 16:58:36 -070013794 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013795 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13796 return -1;
13797 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13798 writer->pos++;
13799 return 0;
13800}
13801
13802int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013803_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13804{
13805 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13806}
13807
13808int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013809_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13810{
13811 Py_UCS4 maxchar;
13812 Py_ssize_t len;
13813
13814 if (PyUnicode_READY(str) == -1)
13815 return -1;
13816 len = PyUnicode_GET_LENGTH(str);
13817 if (len == 0)
13818 return 0;
13819 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13820 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013821 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013822 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013823 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013824 Py_INCREF(str);
13825 writer->buffer = str;
13826 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013827 writer->pos += len;
13828 return 0;
13829 }
13830 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13831 return -1;
13832 }
13833 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13834 str, 0, len);
13835 writer->pos += len;
13836 return 0;
13837}
13838
Victor Stinnere215d962012-10-06 23:03:36 +020013839int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013840_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13841 Py_ssize_t start, Py_ssize_t end)
13842{
13843 Py_UCS4 maxchar;
13844 Py_ssize_t len;
13845
13846 if (PyUnicode_READY(str) == -1)
13847 return -1;
13848
13849 assert(0 <= start);
13850 assert(end <= PyUnicode_GET_LENGTH(str));
13851 assert(start <= end);
13852
13853 if (end == 0)
13854 return 0;
13855
13856 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13857 return _PyUnicodeWriter_WriteStr(writer, str);
13858
13859 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13860 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13861 else
13862 maxchar = writer->maxchar;
13863 len = end - start;
13864
13865 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13866 return -1;
13867
13868 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13869 str, start, len);
13870 writer->pos += len;
13871 return 0;
13872}
13873
13874int
Victor Stinner4a587072013-11-19 12:54:53 +010013875_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13876 const char *ascii, Py_ssize_t len)
13877{
13878 if (len == -1)
13879 len = strlen(ascii);
13880
Andy Lestere6be9b52020-02-11 20:28:35 -060013881 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010013882
13883 if (writer->buffer == NULL && !writer->overallocate) {
13884 PyObject *str;
13885
13886 str = _PyUnicode_FromASCII(ascii, len);
13887 if (str == NULL)
13888 return -1;
13889
13890 writer->readonly = 1;
13891 writer->buffer = str;
13892 _PyUnicodeWriter_Update(writer);
13893 writer->pos += len;
13894 return 0;
13895 }
13896
13897 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13898 return -1;
13899
13900 switch (writer->kind)
13901 {
13902 case PyUnicode_1BYTE_KIND:
13903 {
13904 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13905 Py_UCS1 *data = writer->data;
13906
Christian Heimesf051e432016-09-13 20:22:02 +020013907 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013908 break;
13909 }
13910 case PyUnicode_2BYTE_KIND:
13911 {
13912 _PyUnicode_CONVERT_BYTES(
13913 Py_UCS1, Py_UCS2,
13914 ascii, ascii + len,
13915 (Py_UCS2 *)writer->data + writer->pos);
13916 break;
13917 }
13918 case PyUnicode_4BYTE_KIND:
13919 {
13920 _PyUnicode_CONVERT_BYTES(
13921 Py_UCS1, Py_UCS4,
13922 ascii, ascii + len,
13923 (Py_UCS4 *)writer->data + writer->pos);
13924 break;
13925 }
13926 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013927 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013928 }
13929
13930 writer->pos += len;
13931 return 0;
13932}
13933
13934int
13935_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13936 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013937{
13938 Py_UCS4 maxchar;
13939
Andy Lestere6be9b52020-02-11 20:28:35 -060013940 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020013941 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13942 return -1;
13943 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13944 writer->pos += len;
13945 return 0;
13946}
13947
Victor Stinnerd3f08822012-05-29 12:57:52 +020013948PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013949_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013950{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013951 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013952
Victor Stinnerd3f08822012-05-29 12:57:52 +020013953 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013954 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013955 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013956 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013957
13958 str = writer->buffer;
13959 writer->buffer = NULL;
13960
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013961 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013962 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13963 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013964 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013965
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013966 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13967 PyObject *str2;
13968 str2 = resize_compact(str, writer->pos);
13969 if (str2 == NULL) {
13970 Py_DECREF(str);
13971 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013972 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013973 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013974 }
13975
Victor Stinner15a0bd32013-07-08 22:29:55 +020013976 assert(_PyUnicode_CheckConsistency(str, 1));
13977 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013978}
13979
Victor Stinnerd3f08822012-05-29 12:57:52 +020013980void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013981_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013982{
13983 Py_CLEAR(writer->buffer);
13984}
13985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013986#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013987
13988PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013989 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013990\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013991Return a formatted version of S, using substitutions from args and kwargs.\n\
13992The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013993
Eric Smith27bbca62010-11-04 17:06:58 +000013994PyDoc_STRVAR(format_map__doc__,
13995 "S.format_map(mapping) -> str\n\
13996\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013997Return a formatted version of S, using substitutions from mapping.\n\
13998The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013999
INADA Naoki3ae20562017-01-16 20:41:20 +090014000/*[clinic input]
14001str.__format__ as unicode___format__
14002
14003 format_spec: unicode
14004 /
14005
14006Return a formatted version of the string as described by format_spec.
14007[clinic start generated code]*/
14008
Eric Smith4a7d76d2008-05-30 18:10:19 +000014009static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014010unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014011/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014012{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014013 _PyUnicodeWriter writer;
14014 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014015
Victor Stinnerd3f08822012-05-29 12:57:52 +020014016 if (PyUnicode_READY(self) == -1)
14017 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014018 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014019 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14020 self, format_spec, 0,
14021 PyUnicode_GET_LENGTH(format_spec));
14022 if (ret == -1) {
14023 _PyUnicodeWriter_Dealloc(&writer);
14024 return NULL;
14025 }
14026 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014027}
14028
INADA Naoki3ae20562017-01-16 20:41:20 +090014029/*[clinic input]
14030str.__sizeof__ as unicode_sizeof
14031
14032Return the size of the string in memory, in bytes.
14033[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014034
14035static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014036unicode_sizeof_impl(PyObject *self)
14037/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014038{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014039 Py_ssize_t size;
14040
14041 /* If it's a compact object, account for base structure +
14042 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014043 if (PyUnicode_IS_COMPACT_ASCII(self))
14044 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14045 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014046 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014047 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014048 else {
14049 /* If it is a two-block object, account for base object, and
14050 for character block if present. */
14051 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014052 if (_PyUnicode_DATA_ANY(self))
14053 size += (PyUnicode_GET_LENGTH(self) + 1) *
14054 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014055 }
14056 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014057 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014058 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14059 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14060 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14061 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014062
14063 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014064}
14065
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014066static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014067unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014068{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014069 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014070 if (!copy)
14071 return NULL;
14072 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014073}
14074
Guido van Rossumd57fd912000-03-10 22:53:23 +000014075static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014076 UNICODE_ENCODE_METHODDEF
14077 UNICODE_REPLACE_METHODDEF
14078 UNICODE_SPLIT_METHODDEF
14079 UNICODE_RSPLIT_METHODDEF
14080 UNICODE_JOIN_METHODDEF
14081 UNICODE_CAPITALIZE_METHODDEF
14082 UNICODE_CASEFOLD_METHODDEF
14083 UNICODE_TITLE_METHODDEF
14084 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014085 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014086 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014087 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014088 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014089 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014090 UNICODE_LJUST_METHODDEF
14091 UNICODE_LOWER_METHODDEF
14092 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014093 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14094 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014095 UNICODE_RJUST_METHODDEF
14096 UNICODE_RSTRIP_METHODDEF
14097 UNICODE_RPARTITION_METHODDEF
14098 UNICODE_SPLITLINES_METHODDEF
14099 UNICODE_STRIP_METHODDEF
14100 UNICODE_SWAPCASE_METHODDEF
14101 UNICODE_TRANSLATE_METHODDEF
14102 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014103 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14104 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090014105 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014106 UNICODE_ISLOWER_METHODDEF
14107 UNICODE_ISUPPER_METHODDEF
14108 UNICODE_ISTITLE_METHODDEF
14109 UNICODE_ISSPACE_METHODDEF
14110 UNICODE_ISDECIMAL_METHODDEF
14111 UNICODE_ISDIGIT_METHODDEF
14112 UNICODE_ISNUMERIC_METHODDEF
14113 UNICODE_ISALPHA_METHODDEF
14114 UNICODE_ISALNUM_METHODDEF
14115 UNICODE_ISIDENTIFIER_METHODDEF
14116 UNICODE_ISPRINTABLE_METHODDEF
14117 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014118 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014119 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014120 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014121 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014122 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014123#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014124 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014125 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014126#endif
14127
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014128 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014129 {NULL, NULL}
14130};
14131
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014132static PyObject *
14133unicode_mod(PyObject *v, PyObject *w)
14134{
Brian Curtindfc80e32011-08-10 20:28:54 -050014135 if (!PyUnicode_Check(v))
14136 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014137 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014138}
14139
14140static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014141 0, /*nb_add*/
14142 0, /*nb_subtract*/
14143 0, /*nb_multiply*/
14144 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014145};
14146
Guido van Rossumd57fd912000-03-10 22:53:23 +000014147static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014148 (lenfunc) unicode_length, /* sq_length */
14149 PyUnicode_Concat, /* sq_concat */
14150 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14151 (ssizeargfunc) unicode_getitem, /* sq_item */
14152 0, /* sq_slice */
14153 0, /* sq_ass_item */
14154 0, /* sq_ass_slice */
14155 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014156};
14157
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014158static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014159unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014160{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014161 if (PyUnicode_READY(self) == -1)
14162 return NULL;
14163
Victor Stinnera15e2602020-04-08 02:01:56 +020014164 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014165 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014166 if (i == -1 && PyErr_Occurred())
14167 return NULL;
14168 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014169 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014170 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014171 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014172 Py_ssize_t start, stop, step, slicelength, i;
14173 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014174 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014175 const void *src_data;
14176 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014177 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014178 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014179
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014180 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014181 return NULL;
14182 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014183 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14184 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014185
14186 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014187 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014188 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014189 slicelength == PyUnicode_GET_LENGTH(self)) {
14190 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014191 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014192 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014193 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014194 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014195 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014196 src_kind = PyUnicode_KIND(self);
14197 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014198 if (!PyUnicode_IS_ASCII(self)) {
14199 kind_limit = kind_maxchar_limit(src_kind);
14200 max_char = 0;
14201 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14202 ch = PyUnicode_READ(src_kind, src_data, cur);
14203 if (ch > max_char) {
14204 max_char = ch;
14205 if (max_char >= kind_limit)
14206 break;
14207 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014208 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014209 }
Victor Stinner55c99112011-10-13 01:17:06 +020014210 else
14211 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014212 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014213 if (result == NULL)
14214 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014215 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014216 dest_data = PyUnicode_DATA(result);
14217
14218 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014219 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14220 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014221 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014222 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014223 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014224 } else {
14225 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14226 return NULL;
14227 }
14228}
14229
14230static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014231 (lenfunc)unicode_length, /* mp_length */
14232 (binaryfunc)unicode_subscript, /* mp_subscript */
14233 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014234};
14235
Guido van Rossumd57fd912000-03-10 22:53:23 +000014236
Guido van Rossumd57fd912000-03-10 22:53:23 +000014237/* Helpers for PyUnicode_Format() */
14238
Victor Stinnera47082312012-10-04 02:19:54 +020014239struct unicode_formatter_t {
14240 PyObject *args;
14241 int args_owned;
14242 Py_ssize_t arglen, argidx;
14243 PyObject *dict;
14244
14245 enum PyUnicode_Kind fmtkind;
14246 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014247 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014248 PyObject *fmtstr;
14249
14250 _PyUnicodeWriter writer;
14251};
14252
14253struct unicode_format_arg_t {
14254 Py_UCS4 ch;
14255 int flags;
14256 Py_ssize_t width;
14257 int prec;
14258 int sign;
14259};
14260
Guido van Rossumd57fd912000-03-10 22:53:23 +000014261static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014262unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014263{
Victor Stinnera47082312012-10-04 02:19:54 +020014264 Py_ssize_t argidx = ctx->argidx;
14265
14266 if (argidx < ctx->arglen) {
14267 ctx->argidx++;
14268 if (ctx->arglen < 0)
14269 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014270 else
Victor Stinnera47082312012-10-04 02:19:54 +020014271 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014272 }
14273 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014274 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014275 return NULL;
14276}
14277
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014278/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014279
Victor Stinnera47082312012-10-04 02:19:54 +020014280/* Format a float into the writer if the writer is not NULL, or into *p_output
14281 otherwise.
14282
14283 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014284static int
Victor Stinnera47082312012-10-04 02:19:54 +020014285formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14286 PyObject **p_output,
14287 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014288{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014289 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014290 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014291 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014292 int prec;
14293 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014294
Guido van Rossumd57fd912000-03-10 22:53:23 +000014295 x = PyFloat_AsDouble(v);
14296 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014297 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014298
Victor Stinnera47082312012-10-04 02:19:54 +020014299 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014300 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014301 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014302
Victor Stinnera47082312012-10-04 02:19:54 +020014303 if (arg->flags & F_ALT)
14304 dtoa_flags = Py_DTSF_ALT;
14305 else
14306 dtoa_flags = 0;
14307 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014308 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014309 return -1;
14310 len = strlen(p);
14311 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014312 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014313 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014314 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014315 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014316 }
14317 else
14318 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014319 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014320 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014321}
14322
Victor Stinnerd0880d52012-04-27 23:40:13 +020014323/* formatlong() emulates the format codes d, u, o, x and X, and
14324 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14325 * Python's regular ints.
14326 * Return value: a new PyUnicodeObject*, or NULL if error.
14327 * The output string is of the form
14328 * "-"? ("0x" | "0X")? digit+
14329 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14330 * set in flags. The case of hex digits will be correct,
14331 * There will be at least prec digits, zero-filled on the left if
14332 * necessary to get that many.
14333 * val object to be converted
14334 * flags bitmask of format flags; only F_ALT is looked at
14335 * prec minimum number of digits; 0-fill on left if needed
14336 * type a character in [duoxX]; u acts the same as d
14337 *
14338 * CAUTION: o, x and X conversions on regular ints can never
14339 * produce a '-' sign, but can for Python's unbounded ints.
14340 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014341PyObject *
14342_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014343{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014344 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014345 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014346 Py_ssize_t i;
14347 int sign; /* 1 if '-', else 0 */
14348 int len; /* number of characters */
14349 Py_ssize_t llen;
14350 int numdigits; /* len == numnondigits + numdigits */
14351 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014352
Victor Stinnerd0880d52012-04-27 23:40:13 +020014353 /* Avoid exceeding SSIZE_T_MAX */
14354 if (prec > INT_MAX-3) {
14355 PyErr_SetString(PyExc_OverflowError,
14356 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014357 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014358 }
14359
14360 assert(PyLong_Check(val));
14361
14362 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014363 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014364 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014365 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014366 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014367 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014368 /* int and int subclasses should print numerically when a numeric */
14369 /* format code is used (see issue18780) */
14370 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014371 break;
14372 case 'o':
14373 numnondigits = 2;
14374 result = PyNumber_ToBase(val, 8);
14375 break;
14376 case 'x':
14377 case 'X':
14378 numnondigits = 2;
14379 result = PyNumber_ToBase(val, 16);
14380 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014381 }
14382 if (!result)
14383 return NULL;
14384
14385 assert(unicode_modifiable(result));
14386 assert(PyUnicode_IS_READY(result));
14387 assert(PyUnicode_IS_ASCII(result));
14388
14389 /* To modify the string in-place, there can only be one reference. */
14390 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014391 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014392 PyErr_BadInternalCall();
14393 return NULL;
14394 }
14395 buf = PyUnicode_DATA(result);
14396 llen = PyUnicode_GET_LENGTH(result);
14397 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014398 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014399 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014400 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014401 return NULL;
14402 }
14403 len = (int)llen;
14404 sign = buf[0] == '-';
14405 numnondigits += sign;
14406 numdigits = len - numnondigits;
14407 assert(numdigits > 0);
14408
14409 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014410 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014411 (type == 'o' || type == 'x' || type == 'X'))) {
14412 assert(buf[sign] == '0');
14413 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14414 buf[sign+1] == 'o');
14415 numnondigits -= 2;
14416 buf += 2;
14417 len -= 2;
14418 if (sign)
14419 buf[0] = '-';
14420 assert(len == numnondigits + numdigits);
14421 assert(numdigits > 0);
14422 }
14423
14424 /* Fill with leading zeroes to meet minimum width. */
14425 if (prec > numdigits) {
14426 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14427 numnondigits + prec);
14428 char *b1;
14429 if (!r1) {
14430 Py_DECREF(result);
14431 return NULL;
14432 }
14433 b1 = PyBytes_AS_STRING(r1);
14434 for (i = 0; i < numnondigits; ++i)
14435 *b1++ = *buf++;
14436 for (i = 0; i < prec - numdigits; i++)
14437 *b1++ = '0';
14438 for (i = 0; i < numdigits; i++)
14439 *b1++ = *buf++;
14440 *b1 = '\0';
14441 Py_DECREF(result);
14442 result = r1;
14443 buf = PyBytes_AS_STRING(result);
14444 len = numnondigits + prec;
14445 }
14446
14447 /* Fix up case for hex conversions. */
14448 if (type == 'X') {
14449 /* Need to convert all lower case letters to upper case.
14450 and need to convert 0x to 0X (and -0x to -0X). */
14451 for (i = 0; i < len; i++)
14452 if (buf[i] >= 'a' && buf[i] <= 'x')
14453 buf[i] -= 'a'-'A';
14454 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014455 if (!PyUnicode_Check(result)
14456 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014457 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014458 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014459 Py_DECREF(result);
14460 result = unicode;
14461 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014462 else if (len != PyUnicode_GET_LENGTH(result)) {
14463 if (PyUnicode_Resize(&result, len) < 0)
14464 Py_CLEAR(result);
14465 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014466 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014467}
14468
Ethan Furmandf3ed242014-01-05 06:50:30 -080014469/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014470 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014471 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014472 * -1 and raise an exception on error */
14473static int
Victor Stinnera47082312012-10-04 02:19:54 +020014474mainformatlong(PyObject *v,
14475 struct unicode_format_arg_t *arg,
14476 PyObject **p_output,
14477 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014478{
14479 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014480 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014481
14482 if (!PyNumber_Check(v))
14483 goto wrongtype;
14484
Ethan Furman9ab74802014-03-21 06:38:46 -070014485 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014486 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014487 if (type == 'o' || type == 'x' || type == 'X') {
14488 iobj = PyNumber_Index(v);
14489 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014490 if (PyErr_ExceptionMatches(PyExc_TypeError))
14491 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014492 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014493 }
14494 }
14495 else {
14496 iobj = PyNumber_Long(v);
14497 if (iobj == NULL ) {
14498 if (PyErr_ExceptionMatches(PyExc_TypeError))
14499 goto wrongtype;
14500 return -1;
14501 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014502 }
14503 assert(PyLong_Check(iobj));
14504 }
14505 else {
14506 iobj = v;
14507 Py_INCREF(iobj);
14508 }
14509
14510 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014511 && arg->width == -1 && arg->prec == -1
14512 && !(arg->flags & (F_SIGN | F_BLANK))
14513 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014514 {
14515 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014516 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014517 int base;
14518
Victor Stinnera47082312012-10-04 02:19:54 +020014519 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014520 {
14521 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014522 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014523 case 'd':
14524 case 'i':
14525 case 'u':
14526 base = 10;
14527 break;
14528 case 'o':
14529 base = 8;
14530 break;
14531 case 'x':
14532 case 'X':
14533 base = 16;
14534 break;
14535 }
14536
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014537 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14538 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014539 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014540 }
14541 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014542 return 1;
14543 }
14544
Ethan Furmanb95b5612015-01-23 20:05:18 -080014545 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014546 Py_DECREF(iobj);
14547 if (res == NULL)
14548 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014549 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014550 return 0;
14551
14552wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014553 switch(type)
14554 {
14555 case 'o':
14556 case 'x':
14557 case 'X':
14558 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014559 "%%%c format: an integer is required, "
14560 "not %.200s",
14561 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014562 break;
14563 default:
14564 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014565 "%%%c format: a number is required, "
14566 "not %.200s",
14567 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014568 break;
14569 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014570 return -1;
14571}
14572
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014573static Py_UCS4
14574formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014575{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014576 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014577 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014578 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014579 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014580 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014581 goto onError;
14582 }
14583 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014584 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014585 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014586 /* make sure number is a type of integer */
14587 if (!PyLong_Check(v)) {
14588 iobj = PyNumber_Index(v);
14589 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014590 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014591 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014592 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014593 Py_DECREF(iobj);
14594 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014595 else {
14596 x = PyLong_AsLong(v);
14597 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014598 if (x == -1 && PyErr_Occurred())
14599 goto onError;
14600
Victor Stinner8faf8212011-12-08 22:14:11 +010014601 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014602 PyErr_SetString(PyExc_OverflowError,
14603 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014604 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014605 }
14606
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014607 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014608 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014609
Benjamin Peterson29060642009-01-31 22:14:21 +000014610 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014611 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014612 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014613 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014614}
14615
Victor Stinnera47082312012-10-04 02:19:54 +020014616/* Parse options of an argument: flags, width, precision.
14617 Handle also "%(name)" syntax.
14618
14619 Return 0 if the argument has been formatted into arg->str.
14620 Return 1 if the argument has been written into ctx->writer,
14621 Raise an exception and return -1 on error. */
14622static int
14623unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14624 struct unicode_format_arg_t *arg)
14625{
14626#define FORMAT_READ(ctx) \
14627 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14628
14629 PyObject *v;
14630
Victor Stinnera47082312012-10-04 02:19:54 +020014631 if (arg->ch == '(') {
14632 /* Get argument value from a dictionary. Example: "%(name)s". */
14633 Py_ssize_t keystart;
14634 Py_ssize_t keylen;
14635 PyObject *key;
14636 int pcount = 1;
14637
14638 if (ctx->dict == NULL) {
14639 PyErr_SetString(PyExc_TypeError,
14640 "format requires a mapping");
14641 return -1;
14642 }
14643 ++ctx->fmtpos;
14644 --ctx->fmtcnt;
14645 keystart = ctx->fmtpos;
14646 /* Skip over balanced parentheses */
14647 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14648 arg->ch = FORMAT_READ(ctx);
14649 if (arg->ch == ')')
14650 --pcount;
14651 else if (arg->ch == '(')
14652 ++pcount;
14653 ctx->fmtpos++;
14654 }
14655 keylen = ctx->fmtpos - keystart - 1;
14656 if (ctx->fmtcnt < 0 || pcount > 0) {
14657 PyErr_SetString(PyExc_ValueError,
14658 "incomplete format key");
14659 return -1;
14660 }
14661 key = PyUnicode_Substring(ctx->fmtstr,
14662 keystart, keystart + keylen);
14663 if (key == NULL)
14664 return -1;
14665 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014666 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014667 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014668 }
14669 ctx->args = PyObject_GetItem(ctx->dict, key);
14670 Py_DECREF(key);
14671 if (ctx->args == NULL)
14672 return -1;
14673 ctx->args_owned = 1;
14674 ctx->arglen = -1;
14675 ctx->argidx = -2;
14676 }
14677
14678 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014679 while (--ctx->fmtcnt >= 0) {
14680 arg->ch = FORMAT_READ(ctx);
14681 ctx->fmtpos++;
14682 switch (arg->ch) {
14683 case '-': arg->flags |= F_LJUST; continue;
14684 case '+': arg->flags |= F_SIGN; continue;
14685 case ' ': arg->flags |= F_BLANK; continue;
14686 case '#': arg->flags |= F_ALT; continue;
14687 case '0': arg->flags |= F_ZERO; continue;
14688 }
14689 break;
14690 }
14691
14692 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014693 if (arg->ch == '*') {
14694 v = unicode_format_getnextarg(ctx);
14695 if (v == NULL)
14696 return -1;
14697 if (!PyLong_Check(v)) {
14698 PyErr_SetString(PyExc_TypeError,
14699 "* wants int");
14700 return -1;
14701 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014702 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014703 if (arg->width == -1 && PyErr_Occurred())
14704 return -1;
14705 if (arg->width < 0) {
14706 arg->flags |= F_LJUST;
14707 arg->width = -arg->width;
14708 }
14709 if (--ctx->fmtcnt >= 0) {
14710 arg->ch = FORMAT_READ(ctx);
14711 ctx->fmtpos++;
14712 }
14713 }
14714 else if (arg->ch >= '0' && arg->ch <= '9') {
14715 arg->width = arg->ch - '0';
14716 while (--ctx->fmtcnt >= 0) {
14717 arg->ch = FORMAT_READ(ctx);
14718 ctx->fmtpos++;
14719 if (arg->ch < '0' || arg->ch > '9')
14720 break;
14721 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14722 mixing signed and unsigned comparison. Since arg->ch is between
14723 '0' and '9', casting to int is safe. */
14724 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14725 PyErr_SetString(PyExc_ValueError,
14726 "width too big");
14727 return -1;
14728 }
14729 arg->width = arg->width*10 + (arg->ch - '0');
14730 }
14731 }
14732
14733 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014734 if (arg->ch == '.') {
14735 arg->prec = 0;
14736 if (--ctx->fmtcnt >= 0) {
14737 arg->ch = FORMAT_READ(ctx);
14738 ctx->fmtpos++;
14739 }
14740 if (arg->ch == '*') {
14741 v = unicode_format_getnextarg(ctx);
14742 if (v == NULL)
14743 return -1;
14744 if (!PyLong_Check(v)) {
14745 PyErr_SetString(PyExc_TypeError,
14746 "* wants int");
14747 return -1;
14748 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014749 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014750 if (arg->prec == -1 && PyErr_Occurred())
14751 return -1;
14752 if (arg->prec < 0)
14753 arg->prec = 0;
14754 if (--ctx->fmtcnt >= 0) {
14755 arg->ch = FORMAT_READ(ctx);
14756 ctx->fmtpos++;
14757 }
14758 }
14759 else if (arg->ch >= '0' && arg->ch <= '9') {
14760 arg->prec = arg->ch - '0';
14761 while (--ctx->fmtcnt >= 0) {
14762 arg->ch = FORMAT_READ(ctx);
14763 ctx->fmtpos++;
14764 if (arg->ch < '0' || arg->ch > '9')
14765 break;
14766 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14767 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014768 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014769 return -1;
14770 }
14771 arg->prec = arg->prec*10 + (arg->ch - '0');
14772 }
14773 }
14774 }
14775
14776 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14777 if (ctx->fmtcnt >= 0) {
14778 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14779 if (--ctx->fmtcnt >= 0) {
14780 arg->ch = FORMAT_READ(ctx);
14781 ctx->fmtpos++;
14782 }
14783 }
14784 }
14785 if (ctx->fmtcnt < 0) {
14786 PyErr_SetString(PyExc_ValueError,
14787 "incomplete format");
14788 return -1;
14789 }
14790 return 0;
14791
14792#undef FORMAT_READ
14793}
14794
14795/* Format one argument. Supported conversion specifiers:
14796
14797 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014798 - "i", "d", "u": int or float
14799 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014800 - "e", "E", "f", "F", "g", "G": float
14801 - "c": int or str (1 character)
14802
Victor Stinner8dbd4212012-12-04 09:30:24 +010014803 When possible, the output is written directly into the Unicode writer
14804 (ctx->writer). A string is created when padding is required.
14805
Victor Stinnera47082312012-10-04 02:19:54 +020014806 Return 0 if the argument has been formatted into *p_str,
14807 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014808 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014809static int
14810unicode_format_arg_format(struct unicode_formatter_t *ctx,
14811 struct unicode_format_arg_t *arg,
14812 PyObject **p_str)
14813{
14814 PyObject *v;
14815 _PyUnicodeWriter *writer = &ctx->writer;
14816
14817 if (ctx->fmtcnt == 0)
14818 ctx->writer.overallocate = 0;
14819
Victor Stinnera47082312012-10-04 02:19:54 +020014820 v = unicode_format_getnextarg(ctx);
14821 if (v == NULL)
14822 return -1;
14823
Victor Stinnera47082312012-10-04 02:19:54 +020014824
14825 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014826 case 's':
14827 case 'r':
14828 case 'a':
14829 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14830 /* Fast path */
14831 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14832 return -1;
14833 return 1;
14834 }
14835
14836 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14837 *p_str = v;
14838 Py_INCREF(*p_str);
14839 }
14840 else {
14841 if (arg->ch == 's')
14842 *p_str = PyObject_Str(v);
14843 else if (arg->ch == 'r')
14844 *p_str = PyObject_Repr(v);
14845 else
14846 *p_str = PyObject_ASCII(v);
14847 }
14848 break;
14849
14850 case 'i':
14851 case 'd':
14852 case 'u':
14853 case 'o':
14854 case 'x':
14855 case 'X':
14856 {
14857 int ret = mainformatlong(v, arg, p_str, writer);
14858 if (ret != 0)
14859 return ret;
14860 arg->sign = 1;
14861 break;
14862 }
14863
14864 case 'e':
14865 case 'E':
14866 case 'f':
14867 case 'F':
14868 case 'g':
14869 case 'G':
14870 if (arg->width == -1 && arg->prec == -1
14871 && !(arg->flags & (F_SIGN | F_BLANK)))
14872 {
14873 /* Fast path */
14874 if (formatfloat(v, arg, NULL, writer) == -1)
14875 return -1;
14876 return 1;
14877 }
14878
14879 arg->sign = 1;
14880 if (formatfloat(v, arg, p_str, NULL) == -1)
14881 return -1;
14882 break;
14883
14884 case 'c':
14885 {
14886 Py_UCS4 ch = formatchar(v);
14887 if (ch == (Py_UCS4) -1)
14888 return -1;
14889 if (arg->width == -1 && arg->prec == -1) {
14890 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014891 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014892 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014893 return 1;
14894 }
14895 *p_str = PyUnicode_FromOrdinal(ch);
14896 break;
14897 }
14898
14899 default:
14900 PyErr_Format(PyExc_ValueError,
14901 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014902 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014903 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14904 (int)arg->ch,
14905 ctx->fmtpos - 1);
14906 return -1;
14907 }
14908 if (*p_str == NULL)
14909 return -1;
14910 assert (PyUnicode_Check(*p_str));
14911 return 0;
14912}
14913
14914static int
14915unicode_format_arg_output(struct unicode_formatter_t *ctx,
14916 struct unicode_format_arg_t *arg,
14917 PyObject *str)
14918{
14919 Py_ssize_t len;
14920 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014921 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020014922 Py_ssize_t pindex;
14923 Py_UCS4 signchar;
14924 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014925 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014926 Py_ssize_t sublen;
14927 _PyUnicodeWriter *writer = &ctx->writer;
14928 Py_UCS4 fill;
14929
14930 fill = ' ';
14931 if (arg->sign && arg->flags & F_ZERO)
14932 fill = '0';
14933
14934 if (PyUnicode_READY(str) == -1)
14935 return -1;
14936
14937 len = PyUnicode_GET_LENGTH(str);
14938 if ((arg->width == -1 || arg->width <= len)
14939 && (arg->prec == -1 || arg->prec >= len)
14940 && !(arg->flags & (F_SIGN | F_BLANK)))
14941 {
14942 /* Fast path */
14943 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14944 return -1;
14945 return 0;
14946 }
14947
14948 /* Truncate the string for "s", "r" and "a" formats
14949 if the precision is set */
14950 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14951 if (arg->prec >= 0 && len > arg->prec)
14952 len = arg->prec;
14953 }
14954
14955 /* Adjust sign and width */
14956 kind = PyUnicode_KIND(str);
14957 pbuf = PyUnicode_DATA(str);
14958 pindex = 0;
14959 signchar = '\0';
14960 if (arg->sign) {
14961 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14962 if (ch == '-' || ch == '+') {
14963 signchar = ch;
14964 len--;
14965 pindex++;
14966 }
14967 else if (arg->flags & F_SIGN)
14968 signchar = '+';
14969 else if (arg->flags & F_BLANK)
14970 signchar = ' ';
14971 else
14972 arg->sign = 0;
14973 }
14974 if (arg->width < len)
14975 arg->width = len;
14976
14977 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014978 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014979 if (!(arg->flags & F_LJUST)) {
14980 if (arg->sign) {
14981 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014982 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014983 }
14984 else {
14985 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014986 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014987 }
14988 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014989 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14990 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014991 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014992 }
14993
Victor Stinnera47082312012-10-04 02:19:54 +020014994 buflen = arg->width;
14995 if (arg->sign && len == arg->width)
14996 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014997 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014998 return -1;
14999
15000 /* Write the sign if needed */
15001 if (arg->sign) {
15002 if (fill != ' ') {
15003 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15004 writer->pos += 1;
15005 }
15006 if (arg->width > len)
15007 arg->width--;
15008 }
15009
15010 /* Write the numeric prefix for "x", "X" and "o" formats
15011 if the alternate form is used.
15012 For example, write "0x" for the "%#x" format. */
15013 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15014 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15015 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15016 if (fill != ' ') {
15017 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15018 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15019 writer->pos += 2;
15020 pindex += 2;
15021 }
15022 arg->width -= 2;
15023 if (arg->width < 0)
15024 arg->width = 0;
15025 len -= 2;
15026 }
15027
15028 /* Pad left with the fill character if needed */
15029 if (arg->width > len && !(arg->flags & F_LJUST)) {
15030 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015031 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015032 writer->pos += sublen;
15033 arg->width = len;
15034 }
15035
15036 /* If padding with spaces: write sign if needed and/or numeric prefix if
15037 the alternate form is used */
15038 if (fill == ' ') {
15039 if (arg->sign) {
15040 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15041 writer->pos += 1;
15042 }
15043 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15044 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15045 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15046 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15047 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15048 writer->pos += 2;
15049 pindex += 2;
15050 }
15051 }
15052
15053 /* Write characters */
15054 if (len) {
15055 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15056 str, pindex, len);
15057 writer->pos += len;
15058 }
15059
15060 /* Pad right with the fill character if needed */
15061 if (arg->width > len) {
15062 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015063 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015064 writer->pos += sublen;
15065 }
15066 return 0;
15067}
15068
15069/* Helper of PyUnicode_Format(): format one arg.
15070 Return 0 on success, raise an exception and return -1 on error. */
15071static int
15072unicode_format_arg(struct unicode_formatter_t *ctx)
15073{
15074 struct unicode_format_arg_t arg;
15075 PyObject *str;
15076 int ret;
15077
Victor Stinner8dbd4212012-12-04 09:30:24 +010015078 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015079 if (arg.ch == '%') {
15080 ctx->fmtpos++;
15081 ctx->fmtcnt--;
15082 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15083 return -1;
15084 return 0;
15085 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015086 arg.flags = 0;
15087 arg.width = -1;
15088 arg.prec = -1;
15089 arg.sign = 0;
15090 str = NULL;
15091
Victor Stinnera47082312012-10-04 02:19:54 +020015092 ret = unicode_format_arg_parse(ctx, &arg);
15093 if (ret == -1)
15094 return -1;
15095
15096 ret = unicode_format_arg_format(ctx, &arg, &str);
15097 if (ret == -1)
15098 return -1;
15099
15100 if (ret != 1) {
15101 ret = unicode_format_arg_output(ctx, &arg, str);
15102 Py_DECREF(str);
15103 if (ret == -1)
15104 return -1;
15105 }
15106
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015107 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015108 PyErr_SetString(PyExc_TypeError,
15109 "not all arguments converted during string formatting");
15110 return -1;
15111 }
15112 return 0;
15113}
15114
Alexander Belopolsky40018472011-02-26 01:02:56 +000015115PyObject *
15116PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015117{
Victor Stinnera47082312012-10-04 02:19:54 +020015118 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015119
Guido van Rossumd57fd912000-03-10 22:53:23 +000015120 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015121 PyErr_BadInternalCall();
15122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015123 }
Victor Stinnera47082312012-10-04 02:19:54 +020015124
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015125 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015126 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015127
15128 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015129 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15130 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15131 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15132 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015133
Victor Stinner8f674cc2013-04-17 23:02:17 +020015134 _PyUnicodeWriter_Init(&ctx.writer);
15135 ctx.writer.min_length = ctx.fmtcnt + 100;
15136 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015137
Guido van Rossumd57fd912000-03-10 22:53:23 +000015138 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015139 ctx.arglen = PyTuple_Size(args);
15140 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015141 }
15142 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015143 ctx.arglen = -1;
15144 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015145 }
Victor Stinnera47082312012-10-04 02:19:54 +020015146 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015147 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015148 ctx.dict = args;
15149 else
15150 ctx.dict = NULL;
15151 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015152
Victor Stinnera47082312012-10-04 02:19:54 +020015153 while (--ctx.fmtcnt >= 0) {
15154 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015155 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015156
15157 nonfmtpos = ctx.fmtpos++;
15158 while (ctx.fmtcnt >= 0 &&
15159 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15160 ctx.fmtpos++;
15161 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015162 }
Victor Stinnera47082312012-10-04 02:19:54 +020015163 if (ctx.fmtcnt < 0) {
15164 ctx.fmtpos--;
15165 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015166 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015167
Victor Stinnercfc4c132013-04-03 01:48:39 +020015168 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15169 nonfmtpos, ctx.fmtpos) < 0)
15170 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015171 }
15172 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015173 ctx.fmtpos++;
15174 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015175 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015176 }
15177 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015178
Victor Stinnera47082312012-10-04 02:19:54 +020015179 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015180 PyErr_SetString(PyExc_TypeError,
15181 "not all arguments converted during string formatting");
15182 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015183 }
15184
Victor Stinnera47082312012-10-04 02:19:54 +020015185 if (ctx.args_owned) {
15186 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015187 }
Victor Stinnera47082312012-10-04 02:19:54 +020015188 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015189
Benjamin Peterson29060642009-01-31 22:14:21 +000015190 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015191 _PyUnicodeWriter_Dealloc(&ctx.writer);
15192 if (ctx.args_owned) {
15193 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015194 }
15195 return NULL;
15196}
15197
Jeremy Hylton938ace62002-07-17 16:30:39 +000015198static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015199unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15200
Tim Peters6d6c1a32001-08-02 04:15:00 +000015201static PyObject *
15202unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15203{
Benjamin Peterson29060642009-01-31 22:14:21 +000015204 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015205 static char *kwlist[] = {"object", "encoding", "errors", 0};
15206 char *encoding = NULL;
15207 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015208
Benjamin Peterson14339b62009-01-31 16:36:08 +000015209 if (type != &PyUnicode_Type)
15210 return unicode_subtype_new(type, args, kwds);
15211 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015212 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015213 return NULL;
15214 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015215 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015216 if (encoding == NULL && errors == NULL)
15217 return PyObject_Str(x);
15218 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015219 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015220}
15221
Guido van Rossume023fe02001-08-30 03:12:59 +000015222static PyObject *
15223unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15224{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015225 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015226 Py_ssize_t length, char_size;
15227 int share_wstr, share_utf8;
15228 unsigned int kind;
15229 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015230
Benjamin Peterson14339b62009-01-31 16:36:08 +000015231 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015232
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015233 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015234 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015235 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015236 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015237 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015238 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015239 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015240 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015241
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015242 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015243 if (self == NULL) {
15244 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015245 return NULL;
15246 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015247 kind = PyUnicode_KIND(unicode);
15248 length = PyUnicode_GET_LENGTH(unicode);
15249
15250 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015251#ifdef Py_DEBUG
15252 _PyUnicode_HASH(self) = -1;
15253#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015254 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015255#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015256 _PyUnicode_STATE(self).interned = 0;
15257 _PyUnicode_STATE(self).kind = kind;
15258 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015259 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015260 _PyUnicode_STATE(self).ready = 1;
15261 _PyUnicode_WSTR(self) = NULL;
15262 _PyUnicode_UTF8_LENGTH(self) = 0;
15263 _PyUnicode_UTF8(self) = NULL;
15264 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015265 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015266
15267 share_utf8 = 0;
15268 share_wstr = 0;
15269 if (kind == PyUnicode_1BYTE_KIND) {
15270 char_size = 1;
15271 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15272 share_utf8 = 1;
15273 }
15274 else if (kind == PyUnicode_2BYTE_KIND) {
15275 char_size = 2;
15276 if (sizeof(wchar_t) == 2)
15277 share_wstr = 1;
15278 }
15279 else {
15280 assert(kind == PyUnicode_4BYTE_KIND);
15281 char_size = 4;
15282 if (sizeof(wchar_t) == 4)
15283 share_wstr = 1;
15284 }
15285
15286 /* Ensure we won't overflow the length. */
15287 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15288 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015289 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015290 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015291 data = PyObject_MALLOC((length + 1) * char_size);
15292 if (data == NULL) {
15293 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015294 goto onError;
15295 }
15296
Victor Stinnerc3c74152011-10-02 20:39:55 +020015297 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015298 if (share_utf8) {
15299 _PyUnicode_UTF8_LENGTH(self) = length;
15300 _PyUnicode_UTF8(self) = data;
15301 }
15302 if (share_wstr) {
15303 _PyUnicode_WSTR_LENGTH(self) = length;
15304 _PyUnicode_WSTR(self) = (wchar_t *)data;
15305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015306
Christian Heimesf051e432016-09-13 20:22:02 +020015307 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015308 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015309 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015310#ifdef Py_DEBUG
15311 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15312#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015313 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015314 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015315
15316onError:
15317 Py_DECREF(unicode);
15318 Py_DECREF(self);
15319 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015320}
15321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015322PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015323"str(object='') -> str\n\
15324str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015325\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015326Create a new string object from the given object. If encoding or\n\
15327errors is specified, then the object must expose a data buffer\n\
15328that will be decoded using the given encoding and error handler.\n\
15329Otherwise, returns the result of object.__str__() (if defined)\n\
15330or repr(object).\n\
15331encoding defaults to sys.getdefaultencoding().\n\
15332errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015333
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015334static PyObject *unicode_iter(PyObject *seq);
15335
Guido van Rossumd57fd912000-03-10 22:53:23 +000015336PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015337 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015338 "str", /* tp_name */
15339 sizeof(PyUnicodeObject), /* tp_basicsize */
15340 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015341 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015342 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015343 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015344 0, /* tp_getattr */
15345 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015346 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015347 unicode_repr, /* tp_repr */
15348 &unicode_as_number, /* tp_as_number */
15349 &unicode_as_sequence, /* tp_as_sequence */
15350 &unicode_as_mapping, /* tp_as_mapping */
15351 (hashfunc) unicode_hash, /* tp_hash*/
15352 0, /* tp_call*/
15353 (reprfunc) unicode_str, /* tp_str */
15354 PyObject_GenericGetAttr, /* tp_getattro */
15355 0, /* tp_setattro */
15356 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015357 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015358 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15359 unicode_doc, /* tp_doc */
15360 0, /* tp_traverse */
15361 0, /* tp_clear */
15362 PyUnicode_RichCompare, /* tp_richcompare */
15363 0, /* tp_weaklistoffset */
15364 unicode_iter, /* tp_iter */
15365 0, /* tp_iternext */
15366 unicode_methods, /* tp_methods */
15367 0, /* tp_members */
15368 0, /* tp_getset */
15369 &PyBaseObject_Type, /* tp_base */
15370 0, /* tp_dict */
15371 0, /* tp_descr_get */
15372 0, /* tp_descr_set */
15373 0, /* tp_dictoffset */
15374 0, /* tp_init */
15375 0, /* tp_alloc */
15376 unicode_new, /* tp_new */
15377 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015378};
15379
15380/* Initialize the Unicode implementation */
15381
Victor Stinner331a6a52019-05-27 16:39:22 +020015382PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015383_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015384{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015385 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015386 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015387 0x000A, /* LINE FEED */
15388 0x000D, /* CARRIAGE RETURN */
15389 0x001C, /* FILE SEPARATOR */
15390 0x001D, /* GROUP SEPARATOR */
15391 0x001E, /* RECORD SEPARATOR */
15392 0x0085, /* NEXT LINE */
15393 0x2028, /* LINE SEPARATOR */
15394 0x2029, /* PARAGRAPH SEPARATOR */
15395 };
15396
Fred Drakee4315f52000-05-09 19:53:39 +000015397 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015398 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015399 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015400 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015401 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015402 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015403
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015404 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015405 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015406 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015407
15408 /* initialize the linebreak bloom filter */
15409 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015410 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015411 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015412
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015413 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015414 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015415 }
15416 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015417 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015418 }
15419 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015420 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015421 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015422 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015423}
15424
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015425
Walter Dörwald16807132007-05-25 13:52:07 +000015426void
15427PyUnicode_InternInPlace(PyObject **p)
15428{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015429 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015430 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015431#ifdef Py_DEBUG
15432 assert(s != NULL);
15433 assert(_PyUnicode_CHECK(s));
15434#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015435 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015436 return;
15437#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015438 /* If it's a subclass, we don't really know what putting
15439 it in the interned dict might do. */
15440 if (!PyUnicode_CheckExact(s))
15441 return;
15442 if (PyUnicode_CHECK_INTERNED(s))
15443 return;
15444 if (interned == NULL) {
15445 interned = PyDict_New();
15446 if (interned == NULL) {
15447 PyErr_Clear(); /* Don't leave an exception */
15448 return;
15449 }
15450 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015451 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015452 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015453 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015454 if (t == NULL) {
15455 PyErr_Clear();
15456 return;
15457 }
15458 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015459 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015460 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015461 return;
15462 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015463 /* The two references in interned are not counted by refcnt.
15464 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015465 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015466 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015467}
15468
15469void
15470PyUnicode_InternImmortal(PyObject **p)
15471{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015472 PyUnicode_InternInPlace(p);
15473 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015474 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015475 Py_INCREF(*p);
15476 }
Walter Dörwald16807132007-05-25 13:52:07 +000015477}
15478
15479PyObject *
15480PyUnicode_InternFromString(const char *cp)
15481{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015482 PyObject *s = PyUnicode_FromString(cp);
15483 if (s == NULL)
15484 return NULL;
15485 PyUnicode_InternInPlace(&s);
15486 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015487}
15488
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015489
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo interning for every string in the interned dict, then clear and
   release the dict.  Only built for Valgrind/Insure++ builds.

   Does nothing if the interned dict does not exist.  If the key list
   cannot be obtained the function returns silently with the exception
   cleared. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        /* Do not leak a non-list result */
        Py_XDECREF(keys);
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        /* Use Py_SET_REFCNT() rather than assigning through Py_REFCNT(),
           like PyUnicode_InternInPlace() does: Py_REFCNT() is not
           guaranteed to be usable as an l-value. */
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            /* give back one reference */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            /* give back the two references removed when interning */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015550
15551
15552/********************* Unicode Iterator **************************/
15553
15554typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015555 PyObject_HEAD
15556 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015557 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015558} unicodeiterobject;
15559
15560static void
15561unicodeiter_dealloc(unicodeiterobject *it)
15562{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015563 _PyObject_GC_UNTRACK(it);
15564 Py_XDECREF(it->it_seq);
15565 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015566}
15567
15568static int
15569unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15570{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015571 Py_VISIT(it->it_seq);
15572 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015573}
15574
15575static PyObject *
15576unicodeiter_next(unicodeiterobject *it)
15577{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015578 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015579
Benjamin Peterson14339b62009-01-31 16:36:08 +000015580 assert(it != NULL);
15581 seq = it->it_seq;
15582 if (seq == NULL)
15583 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015584 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015586 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15587 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015588 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015589 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15590 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015591 if (item != NULL)
15592 ++it->it_index;
15593 return item;
15594 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015595
Benjamin Peterson14339b62009-01-31 16:36:08 +000015596 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015597 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015598 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015599}
15600
15601static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015602unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015603{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015604 Py_ssize_t len = 0;
15605 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015606 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015607 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015608}
15609
15610PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15611
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015612static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015613unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015614{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015615 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015616 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015617 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015618 it->it_seq, it->it_index);
15619 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015620 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015621 if (u == NULL)
15622 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015623 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015624 }
15625}
15626
15627PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15628
15629static PyObject *
15630unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15631{
15632 Py_ssize_t index = PyLong_AsSsize_t(state);
15633 if (index == -1 && PyErr_Occurred())
15634 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015635 if (it->it_seq != NULL) {
15636 if (index < 0)
15637 index = 0;
15638 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15639 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15640 it->it_index = index;
15641 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015642 Py_RETURN_NONE;
15643}
15644
15645PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15646
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015647static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015648 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015649 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015650 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15651 reduce_doc},
15652 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15653 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015654 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015655};
15656
15657PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015658 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15659 "str_iterator", /* tp_name */
15660 sizeof(unicodeiterobject), /* tp_basicsize */
15661 0, /* tp_itemsize */
15662 /* methods */
15663 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015664 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015665 0, /* tp_getattr */
15666 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015667 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015668 0, /* tp_repr */
15669 0, /* tp_as_number */
15670 0, /* tp_as_sequence */
15671 0, /* tp_as_mapping */
15672 0, /* tp_hash */
15673 0, /* tp_call */
15674 0, /* tp_str */
15675 PyObject_GenericGetAttr, /* tp_getattro */
15676 0, /* tp_setattro */
15677 0, /* tp_as_buffer */
15678 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15679 0, /* tp_doc */
15680 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15681 0, /* tp_clear */
15682 0, /* tp_richcompare */
15683 0, /* tp_weaklistoffset */
15684 PyObject_SelfIter, /* tp_iter */
15685 (iternextfunc)unicodeiter_next, /* tp_iternext */
15686 unicodeiter_methods, /* tp_methods */
15687 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015688};
15689
15690static PyObject *
15691unicode_iter(PyObject *seq)
15692{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015693 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015694
Benjamin Peterson14339b62009-01-31 16:36:08 +000015695 if (!PyUnicode_Check(seq)) {
15696 PyErr_BadInternalCall();
15697 return NULL;
15698 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015699 if (PyUnicode_READY(seq) == -1)
15700 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015701 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15702 if (it == NULL)
15703 return NULL;
15704 it->it_index = 0;
15705 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015706 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015707 _PyObject_GC_TRACK(it);
15708 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015709}
15710
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015711
15712size_t
15713Py_UNICODE_strlen(const Py_UNICODE *u)
15714{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015715 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015716}
15717
15718Py_UNICODE*
15719Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15720{
15721 Py_UNICODE *u = s1;
15722 while ((*u++ = *s2++));
15723 return s1;
15724}
15725
15726Py_UNICODE*
15727Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15728{
15729 Py_UNICODE *u = s1;
15730 while ((*u++ = *s2++))
15731 if (n-- == 0)
15732 break;
15733 return s1;
15734}
15735
15736Py_UNICODE*
15737Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15738{
15739 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015740 u1 += wcslen(u1);
15741 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015742 return s1;
15743}
15744
15745int
15746Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15747{
15748 while (*s1 && *s2 && *s1 == *s2)
15749 s1++, s2++;
15750 if (*s1 && *s2)
15751 return (*s1 < *s2) ? -1 : +1;
15752 if (*s1)
15753 return 1;
15754 if (*s2)
15755 return -1;
15756 return 0;
15757}
15758
15759int
15760Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15761{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015762 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015763 for (; n != 0; n--) {
15764 u1 = *s1;
15765 u2 = *s2;
15766 if (u1 != u2)
15767 return (u1 < u2) ? -1 : +1;
15768 if (u1 == '\0')
15769 return 0;
15770 s1++;
15771 s2++;
15772 }
15773 return 0;
15774}
15775
15776Py_UNICODE*
15777Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15778{
15779 const Py_UNICODE *p;
15780 for (p = s; *p; p++)
15781 if (*p == c)
15782 return (Py_UNICODE*)p;
15783 return NULL;
15784}
15785
15786Py_UNICODE*
15787Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15788{
15789 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015790 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015791 while (p != s) {
15792 p--;
15793 if (*p == c)
15794 return (Py_UNICODE*)p;
15795 }
15796 return NULL;
15797}
Victor Stinner331ea922010-08-10 16:37:20 +000015798
Victor Stinner71133ff2010-09-01 23:43:53 +000015799Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015800PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015801{
Victor Stinner577db2c2011-10-11 22:12:48 +020015802 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015803 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015805 if (!PyUnicode_Check(unicode)) {
15806 PyErr_BadArgument();
15807 return NULL;
15808 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015809 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015810 if (u == NULL)
15811 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015812 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015813 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015814 PyErr_NoMemory();
15815 return NULL;
15816 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015817 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015818 size *= sizeof(Py_UNICODE);
15819 copy = PyMem_Malloc(size);
15820 if (copy == NULL) {
15821 PyErr_NoMemory();
15822 return NULL;
15823 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015824 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015825 return copy;
15826}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015827
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015828
Victor Stinner709d23d2019-05-02 14:56:30 -040015829static int
15830encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015831{
Victor Stinner709d23d2019-05-02 14:56:30 -040015832 int res;
15833 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15834 if (res == -2) {
15835 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15836 return -1;
15837 }
15838 if (res < 0) {
15839 PyErr_NoMemory();
15840 return -1;
15841 }
15842 return 0;
15843}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015844
Victor Stinner709d23d2019-05-02 14:56:30 -040015845
15846static int
15847config_get_codec_name(wchar_t **config_encoding)
15848{
15849 char *encoding;
15850 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15851 return -1;
15852 }
15853
15854 PyObject *name_obj = NULL;
15855 PyObject *codec = _PyCodec_Lookup(encoding);
15856 PyMem_RawFree(encoding);
15857
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015858 if (!codec)
15859 goto error;
15860
15861 name_obj = PyObject_GetAttrString(codec, "name");
15862 Py_CLEAR(codec);
15863 if (!name_obj) {
15864 goto error;
15865 }
15866
Victor Stinner709d23d2019-05-02 14:56:30 -040015867 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15868 Py_DECREF(name_obj);
15869 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015870 goto error;
15871 }
15872
Victor Stinner709d23d2019-05-02 14:56:30 -040015873 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15874 if (raw_wname == NULL) {
15875 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015876 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015877 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015878 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015879
15880 PyMem_RawFree(*config_encoding);
15881 *config_encoding = raw_wname;
15882
15883 PyMem_Free(wname);
15884 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015885
15886error:
15887 Py_XDECREF(codec);
15888 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015889 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015890}
15891
15892
Victor Stinner331a6a52019-05-27 16:39:22 +020015893static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015894init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015895{
Victor Stinner709d23d2019-05-02 14:56:30 -040015896 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015897 PyConfig *config = &tstate->interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015898 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015899 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015900 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015901 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015902 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015903}
15904
15905
Victor Stinner709d23d2019-05-02 14:56:30 -040015906static int
15907init_fs_codec(PyInterpreterState *interp)
15908{
Victor Stinner331a6a52019-05-27 16:39:22 +020015909 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015910
15911 _Py_error_handler error_handler;
15912 error_handler = get_error_handler_wide(config->filesystem_errors);
15913 if (error_handler == _Py_ERROR_UNKNOWN) {
15914 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15915 return -1;
15916 }
15917
15918 char *encoding, *errors;
15919 if (encode_wstr_utf8(config->filesystem_encoding,
15920 &encoding,
15921 "filesystem_encoding") < 0) {
15922 return -1;
15923 }
15924
15925 if (encode_wstr_utf8(config->filesystem_errors,
15926 &errors,
15927 "filesystem_errors") < 0) {
15928 PyMem_RawFree(encoding);
15929 return -1;
15930 }
15931
15932 PyMem_RawFree(interp->fs_codec.encoding);
15933 interp->fs_codec.encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015934 /* encoding has been normalized by init_fs_encoding() */
15935 interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0);
Victor Stinner709d23d2019-05-02 14:56:30 -040015936 PyMem_RawFree(interp->fs_codec.errors);
15937 interp->fs_codec.errors = errors;
15938 interp->fs_codec.error_handler = error_handler;
15939
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015940#ifdef _Py_FORCE_UTF8_FS_ENCODING
15941 assert(interp->fs_codec.utf8 == 1);
15942#endif
15943
Victor Stinner709d23d2019-05-02 14:56:30 -040015944 /* At this point, PyUnicode_EncodeFSDefault() and
15945 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15946 the C implementation of the filesystem encoding. */
15947
15948 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15949 global configuration variables. */
15950 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15951 interp->fs_codec.errors) < 0) {
15952 PyErr_NoMemory();
15953 return -1;
15954 }
15955 return 0;
15956}
15957
15958
Victor Stinner331a6a52019-05-27 16:39:22 +020015959static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015960init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015961{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015962 PyInterpreterState *interp = tstate->interp;
15963
Victor Stinner709d23d2019-05-02 14:56:30 -040015964 /* Update the filesystem encoding to the normalized Python codec name.
15965 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15966 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015967 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015968 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015969 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015970 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015971 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015972 }
15973
Victor Stinner709d23d2019-05-02 14:56:30 -040015974 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015975 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015976 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015977 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015978}
15979
15980
Victor Stinner331a6a52019-05-27 16:39:22 +020015981PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015982_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015983{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015984 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015985 if (_PyStatus_EXCEPTION(status)) {
15986 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015987 }
15988
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015989 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015990}
15991
15992
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015993static void
15994_PyUnicode_FiniEncodings(PyThreadState *tstate)
15995{
15996 PyInterpreterState *interp = tstate->interp;
15997 PyMem_RawFree(interp->fs_codec.encoding);
15998 interp->fs_codec.encoding = NULL;
15999 interp->fs_codec.utf8 = 0;
16000 PyMem_RawFree(interp->fs_codec.errors);
16001 interp->fs_codec.errors = NULL;
16002 interp->fs_codec.error_handler = _Py_ERROR_UNKNOWN;
16003}
16004
16005
Victor Stinner709d23d2019-05-02 14:56:30 -040016006#ifdef MS_WINDOWS
16007int
16008_PyUnicode_EnableLegacyWindowsFSEncoding(void)
16009{
16010 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinner331a6a52019-05-27 16:39:22 +020016011 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040016012
16013 /* Set the filesystem encoding to mbcs/replace (PEP 529) */
16014 wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
16015 wchar_t *errors = _PyMem_RawWcsdup(L"replace");
16016 if (encoding == NULL || errors == NULL) {
16017 PyMem_RawFree(encoding);
16018 PyMem_RawFree(errors);
16019 PyErr_NoMemory();
16020 return -1;
16021 }
16022
16023 PyMem_RawFree(config->filesystem_encoding);
16024 config->filesystem_encoding = encoding;
16025 PyMem_RawFree(config->filesystem_errors);
16026 config->filesystem_errors = errors;
16027
16028 return init_fs_codec(interp);
16029}
16030#endif
16031
16032
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016033void
Victor Stinner3d483342019-11-22 12:27:50 +010016034_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016035{
Victor Stinner3d483342019-11-22 12:27:50 +010016036 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016037#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016038 /* Insure++ is a memory analysis tool that aids in discovering
16039 * memory leaks and other memory problems. On Python exit, the
16040 * interned string dictionaries are flagged as being in use at exit
16041 * (which it is). Under normal circumstances, this is fine because
16042 * the memory will be automatically reclaimed by the system. Under
16043 * memory debugging, it's a huge source of useless noise, so we
16044 * trade off slower shutdown for less distraction in the memory
16045 * reports. -baw
16046 */
16047 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016048#endif /* __INSURE__ */
16049
Victor Stinner3d483342019-11-22 12:27:50 +010016050 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016051
Victor Stinner3d483342019-11-22 12:27:50 +010016052 for (Py_ssize_t i = 0; i < 256; i++) {
16053 Py_CLEAR(unicode_latin1[i]);
16054 }
16055 _PyUnicode_ClearStaticStrings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016056 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016057
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016058 _PyUnicode_FiniEncodings(tstate);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016059}
16060
16061
Georg Brandl66c221e2010-10-14 07:04:07 +000016062/* A _string module, to export formatter_parser and formatter_field_name_split
16063 to the string.Formatter class implemented in Python. */
16064
16065static PyMethodDef _string_methods[] = {
16066 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16067 METH_O, PyDoc_STR("split the argument as a field name")},
16068 {"formatter_parser", (PyCFunction) formatter_parser,
16069 METH_O, PyDoc_STR("parse the argument as a format string")},
16070 {NULL, NULL}
16071};
16072
16073static struct PyModuleDef _string_module = {
16074 PyModuleDef_HEAD_INIT,
16075 "_string",
16076 PyDoc_STR("string helper module"),
16077 0,
16078 _string_methods,
16079 NULL,
16080 NULL,
16081 NULL,
16082 NULL
16083};
16084
16085PyMODINIT_FUNC
16086PyInit__string(void)
16087{
16088 return PyModule_Create(&_string_module);
16089}
16090
16091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016092#ifdef __cplusplus
16093}
16094#endif