blob: da17bfe01f3100d0e55a4790896dc92eda985e4b [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner45876a92020-02-12 22:32:34 +010043#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020045#include "pycore_initconfig.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010046#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020047#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040048#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010049#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000050#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070051#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000053#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000054#include <windows.h>
55#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000056
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
/* #define INTERNED_STATS 1 */
60
61
Larry Hastings61272b72014-01-07 12:41:53 -080062/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090063class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080064[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090065/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
66
67/*[python input]
68class Py_UCS4_converter(CConverter):
69 type = 'Py_UCS4'
70 converter = 'convert_uc'
71
72 def converter_init(self):
73 if self.default is not unspecified:
74 self.c_default = ascii(self.default)
75 if len(self.c_default) > 4 or self.c_default[0] != "'":
76 self.c_default = hex(ord(self.default))
77
78[python start generated code]*/
79/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080080
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000089
90#ifdef __cplusplus
91extern "C" {
92#endif
93
/* Largest code point defined by Unicode: U+10FFFF (1,114,111 decimal). */
#define MAX_UNICODE 0x10FFFF
96
/* Object check used by the assertions in this file: debug builds run the
   consistency checker with content checking switched off (second argument
   0), other builds only test the object type. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200102
/* Field accessors for the string object structures.  The "_PyUnicode_XXX"
   variants without assertions expand to plain lvalues; the others assert
   the object first and expand to values. */

/* utf8 pointer of the compact header (lvalue, no checks). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer of a ready string: for a compact ASCII string this is the
   memory immediately following the PyASCIIObject header, otherwise the
   utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((char *)((PyASCIIObject *)(op) + 1)) \
         : _PyUnicode_UTF8(op))

/* utf8_length field of the compact header (lvalue, no checks). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* UTF-8 length of a ready string: the character length for a compact
   ASCII string, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((PyASCIIObject *)(op))->length \
         : _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer and wstr_length field (lvalues, no checks). */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)

/* length, state and hash fields of the base header (lvalues, no checks). */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked readers for state.kind and length. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of a PyUnicodeObject (lvalue, no checks). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200137
/* File-local replacement for PyUnicode_READY(): asserts the object with
   _PyUnicode_CHECK(), then evaluates to 0 when the string is already
   ready and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200144
/* True when the utf8 pointer of `op` is the same address as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True when the wstr pointer of `op` is the same address as its character
   data (the buffer is shared rather than separately allocated).
   The expansion refers only to the macro argument, so it works for any
   expression and not just for a variable that happens to be named
   "unicode". */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* True if the Unicode object has its own UTF-8 memory block: the string is
   not compact ASCII, the utf8 pointer is set, and that pointer is not
   shared with the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (!PyUnicode_IS_COMPACT_ASCII(op) \
     && _PyUnicode_UTF8(op) != NULL \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))
159
/* True if the Unicode object has its own wstr memory block: the wstr
   pointer is set and either the string is not ready yet or the pointer is
   not shared with the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (_PyUnicode_WSTR(op) != NULL \
     && (!PyUnicode_IS_READY(op) \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
166
/* Generic helper that copies characters from one storage type to another.

   from_type / to_type   valid type names of the source and target
                         characters
   begin, end            bounds of the source range, of type "from_type *"
   to                    destination buffer, of type "to_type *"

   Each character is converted with a plain cast.  The bulk of the range is
   handled four characters per iteration; the remaining 0-3 characters are
   copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                              \
        to_type *_to = (to_type *)(to);                               \
        const from_type *_iter = (const from_type *)(begin);          \
        const from_type *_end = (const from_type *)(end);             \
        Py_ssize_t _count = _end - _iter;                             \
        const from_type *_unrolled_end =                              \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                   \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {         \
            _to[0] = (to_type) _iter[0];                              \
            _to[1] = (to_type) _iter[1];                              \
            _to[2] = (to_type) _iter[2];                              \
            _to[3] = (to_type) _iter[3];                              \
        }                                                             \
        for (; _iter < _end; ++_iter, ++_to) {                        \
            *_to = (to_type) *_iter;                                  \
        }                                                             \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200190
/* Over-allocation factor, selected per platform: 2 stands for 50% of
   over-allocation (the best factor on Windows), 4 stands for 25% (the best
   factor on Linux, used for every non-Windows build). */
#if defined(MS_WINDOWS)
#  define OVERALLOCATE_FACTOR 2
#else
#  define OVERALLOCATE_FACTOR 4
#endif
198
Walter Dörwald16807132007-05-25 13:52:07 +0000199/* This dictionary holds all interned unicode strings. Note that references
200 to strings in this dictionary are *not* counted in the string's ob_refcnt.
201 When the interned string reaches a refcnt of 0 the string deallocation
202 function will delete the reference from this dictionary.
203
204 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000205 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000206*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200207static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000208
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000209/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211
Serhiy Storchaka678db842013-01-26 12:16:36 +0200212#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 do { \
214 if (unicode_empty != NULL) \
215 Py_INCREF(unicode_empty); \
216 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200217 unicode_empty = PyUnicode_New(0, 0); \
218 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200219 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200220 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
221 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200222 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200223 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000224
Serhiy Storchaka678db842013-01-26 12:16:36 +0200225#define _Py_RETURN_UNICODE_EMPTY() \
226 do { \
227 _Py_INCREF_UNICODE_EMPTY(); \
228 return unicode_empty; \
229 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
Victor Stinner59423e32018-11-26 13:40:01 +0100231static inline void
232unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
233 Py_ssize_t start, Py_ssize_t length)
234{
235 assert(0 <= start);
236 assert(kind != PyUnicode_WCHAR_KIND);
237 switch (kind) {
238 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100239 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100240 Py_UCS1 ch = (unsigned char)value;
241 Py_UCS1 *to = (Py_UCS1 *)data + start;
242 memset(to, ch, length);
243 break;
244 }
245 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100246 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100247 Py_UCS2 ch = (Py_UCS2)value;
248 Py_UCS2 *to = (Py_UCS2 *)data + start;
249 const Py_UCS2 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100254 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100255 Py_UCS4 ch = value;
256 Py_UCS4 * to = (Py_UCS4 *)data + start;
257 const Py_UCS4 *end = to + length;
258 for (; to < end; ++to) *to = ch;
259 break;
260 }
261 default: Py_UNREACHABLE();
262 }
263}
264
265
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200266/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700267static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200268_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900269static inline void
270_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400271static PyObject *
272unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
273 const char *errors);
274static PyObject *
275unicode_decode_utf8(const char *s, Py_ssize_t size,
276 _Py_error_handler error_handler, const char *errors,
277 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200278
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200279/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200280static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200281
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000282/* Single character Unicode strings in the Latin-1 range are being
283 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200284static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, holding 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
316
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200317/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200318static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200319static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100320static int unicode_modifiable(PyObject *unicode);
321
Victor Stinnerfe226c02011-10-03 03:52:20 +0200322
Alexander Belopolsky40018472011-02-26 01:02:56 +0000323static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100324_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200325static PyObject *
326_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
327static PyObject *
328_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
329
330static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000331unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000332 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100333 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000334 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
335
Alexander Belopolsky40018472011-02-26 01:02:56 +0000336static void
337raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300338 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100339 PyObject *unicode,
340 Py_ssize_t startpos, Py_ssize_t endpos,
341 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000342
/* Fast detection of line breaks: a 128-entry table indexed by ASCII code,
   holding 1 for line break characters and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
370
INADA Naoki3ae20562017-01-16 20:41:20 +0900371static int convert_uc(PyObject *obj, void *addr);
372
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300373#include "clinic/unicodeobject.c.h"
374
Victor Stinner3d4226a2018-08-29 22:21:32 +0200375_Py_error_handler
376_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200377{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200379 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200385 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
387 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200388 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200391 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200394 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
396 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200397 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200398 }
Victor Stinner50149202015-09-22 00:26:54 +0200399 return _Py_ERROR_OTHER;
400}
401
Victor Stinner709d23d2019-05-02 14:56:30 -0400402
403static _Py_error_handler
404get_error_handler_wide(const wchar_t *errors)
405{
406 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
407 return _Py_ERROR_STRICT;
408 }
409 if (wcscmp(errors, L"surrogateescape") == 0) {
410 return _Py_ERROR_SURROGATEESCAPE;
411 }
412 if (wcscmp(errors, L"replace") == 0) {
413 return _Py_ERROR_REPLACE;
414 }
415 if (wcscmp(errors, L"ignore") == 0) {
416 return _Py_ERROR_IGNORE;
417 }
418 if (wcscmp(errors, L"backslashreplace") == 0) {
419 return _Py_ERROR_BACKSLASHREPLACE;
420 }
421 if (wcscmp(errors, L"surrogatepass") == 0) {
422 return _Py_ERROR_SURROGATEPASS;
423 }
424 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
425 return _Py_ERROR_XMLCHARREFREPLACE;
426 }
427 return _Py_ERROR_OTHER;
428}
429
430
Victor Stinner22eb6892019-06-26 00:51:05 +0200431static inline int
432unicode_check_encoding_errors(const char *encoding, const char *errors)
433{
434 if (encoding == NULL && errors == NULL) {
435 return 0;
436 }
437
438 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
439#ifndef Py_DEBUG
440 /* In release mode, only check in development mode (-X dev) */
441 if (!interp->config.dev_mode) {
442 return 0;
443 }
444#else
445 /* Always check in debug mode */
446#endif
447
448 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
449 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
450 if (!interp->fs_codec.encoding) {
451 return 0;
452 }
453
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200454 /* Disable checks during Python finalization. For example, it allows to
455 call _PyObject_Dump() during finalization for debugging purpose. */
456 if (interp->finalizing) {
457 return 0;
458 }
459
Victor Stinner22eb6892019-06-26 00:51:05 +0200460 if (encoding != NULL) {
461 PyObject *handler = _PyCodec_Lookup(encoding);
462 if (handler == NULL) {
463 return -1;
464 }
465 Py_DECREF(handler);
466 }
467
468 if (errors != NULL) {
469 PyObject *handler = PyCodec_LookupError(errors);
470 if (handler == NULL) {
471 return -1;
472 }
473 Py_DECREF(handler);
474 }
475 return 0;
476}
477
478
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300479/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
480 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000481Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000482PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000483{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000484#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000485 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000486#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000487 /* This is actually an illegal character, so it should
488 not be passed to unichr. */
489 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000490#endif
491}
492
/* Check the internal invariants of the string object `op`.

   op             object to check; must not be NULL
   check_content  if non-zero, additionally scan every character to verify
                  that the narrowest possible kind is used (O(n))

   Returns 1 when every check passes.  A failed check is reported through
   _PyObject_ASSERT_FAILED_MSG() with the text of the failed expression
   (presumably fatal -- see that macro's definition). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
/* The expression is stringified into the failure message, so the spelling
   of every CHECK() argument below is part of the diagnostic output. */
#define CHECK(expr) \
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)

    PyASCIIObject *ascii;
    unsigned int kind;

    assert(op != NULL);
    CHECK(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII string: one byte per character, always ready. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        CHECK(kind == PyUnicode_1BYTE_KIND);
        CHECK(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII string: the character data is expected
               right after the compact header and must not double as the
               UTF-8 buffer. */
            data = compact + 1;
            CHECK(kind == PyUnicode_1BYTE_KIND
                  || kind == PyUnicode_2BYTE_KIND
                  || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.ascii == 0);
            CHECK(ascii->state.ready == 1);
            CHECK(compact->utf8 != data);
        }
        else {
            /* Non-compact string: the data pointer lives in the object. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer may be set. */
                CHECK(ascii->length == 0);
                CHECK(ascii->hash == -1);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ascii == 0);
                CHECK(ascii->state.ready == 0);
                CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
                CHECK(ascii->wstr != NULL);
                CHECK(data == NULL);
                CHECK(compact->utf8 == NULL);
            }
            else {
                /* Ready: data is set; an ASCII string shares its data
                   with the UTF-8 buffer, any other string must not. */
                CHECK(kind == PyUnicode_1BYTE_KIND
                      || kind == PyUnicode_2BYTE_KIND
                      || kind == PyUnicode_4BYTE_KIND);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ready == 1);
                CHECK(data != NULL);
                if (ascii->state.ascii) {
                    CHECK(compact->utf8 == data);
                    CHECK(compact->utf8_length == ascii->length);
                }
                else
                    CHECK(compact->utf8 != data);
            }
        }
        /* wstr must alias the data exactly when the kind has the same
           width as wchar_t (2 or 4 bytes, selected at compile time). */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                CHECK(ascii->wstr == data);
                CHECK(compact->wstr_length == ascii->length);
            } else
                CHECK(ascii->wstr != data);
        }

        /* An unset buffer must come with a zero length. */
        if (compact->utf8 == NULL)
            CHECK(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            CHECK(compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* Find the largest code point stored in the string. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest code point must fall into the range that requires
           exactly this kind (and this ASCII flag for 1-byte strings). */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                CHECK(maxchar >= 128);
                CHECK(maxchar <= 255);
            }
            else
                CHECK(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            CHECK(maxchar >= 0x100);
            CHECK(maxchar <= 0xFFFF);
        }
        else {
            CHECK(maxchar >= 0x10000);
            CHECK(maxchar <= MAX_UNICODE);
        }
        /* The character at index `length` must be a null terminator. */
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef CHECK
}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200612
Victor Stinner910337b2011-10-03 03:20:16 +0200613
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100614static PyObject*
615unicode_result_wchar(PyObject *unicode)
616{
617#ifndef Py_DEBUG
618 Py_ssize_t len;
619
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100620 len = _PyUnicode_WSTR_LENGTH(unicode);
621 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100622 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200623 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100624 }
625
626 if (len == 1) {
627 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100628 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100629 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
630 Py_DECREF(unicode);
631 return latin1_char;
632 }
633 }
634
635 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200636 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100637 return NULL;
638 }
639#else
Victor Stinneraa771272012-10-04 02:32:58 +0200640 assert(Py_REFCNT(unicode) == 1);
641
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100642 /* don't make the result ready in debug mode to ensure that the caller
643 makes the string ready before using it */
644 assert(_PyUnicode_CheckConsistency(unicode, 1));
645#endif
646 return unicode;
647}
648
649static PyObject*
650unicode_result_ready(PyObject *unicode)
651{
652 Py_ssize_t length;
653
654 length = PyUnicode_GET_LENGTH(unicode);
655 if (length == 0) {
656 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100657 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200658 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100659 }
660 return unicode_empty;
661 }
662
663 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200664 void *data = PyUnicode_DATA(unicode);
665 int kind = PyUnicode_KIND(unicode);
666 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100667 if (ch < 256) {
668 PyObject *latin1_char = unicode_latin1[ch];
669 if (latin1_char != NULL) {
670 if (unicode != latin1_char) {
671 Py_INCREF(latin1_char);
672 Py_DECREF(unicode);
673 }
674 return latin1_char;
675 }
676 else {
677 assert(_PyUnicode_CheckConsistency(unicode, 1));
678 Py_INCREF(unicode);
679 unicode_latin1[ch] = unicode;
680 return unicode;
681 }
682 }
683 }
684
685 assert(_PyUnicode_CheckConsistency(unicode, 1));
686 return unicode;
687}
688
689static PyObject*
690unicode_result(PyObject *unicode)
691{
692 assert(_PyUnicode_CHECK(unicode));
693 if (PyUnicode_IS_READY(unicode))
694 return unicode_result_ready(unicode);
695 else
696 return unicode_result_wchar(unicode);
697}
698
Victor Stinnerc4b49542011-12-11 22:44:26 +0100699static PyObject*
700unicode_result_unchanged(PyObject *unicode)
701{
702 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500703 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100704 return NULL;
705 Py_INCREF(unicode);
706 return unicode;
707 }
708 else
709 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100710 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100711}
712
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200713/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
714 ASCII, Latin1, UTF-8, etc. */
715static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200716backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200717 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
718{
Victor Stinnerad771582015-10-09 12:38:53 +0200719 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200720 Py_UCS4 ch;
721 enum PyUnicode_Kind kind;
722 void *data;
723
724 assert(PyUnicode_IS_READY(unicode));
725 kind = PyUnicode_KIND(unicode);
726 data = PyUnicode_DATA(unicode);
727
728 size = 0;
729 /* determine replacement size */
730 for (i = collstart; i < collend; ++i) {
731 Py_ssize_t incr;
732
733 ch = PyUnicode_READ(kind, data, i);
734 if (ch < 0x100)
735 incr = 2+2;
736 else if (ch < 0x10000)
737 incr = 2+4;
738 else {
739 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200740 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200741 }
742 if (size > PY_SSIZE_T_MAX - incr) {
743 PyErr_SetString(PyExc_OverflowError,
744 "encoded result is too long for a Python string");
745 return NULL;
746 }
747 size += incr;
748 }
749
Victor Stinnerad771582015-10-09 12:38:53 +0200750 str = _PyBytesWriter_Prepare(writer, str, size);
751 if (str == NULL)
752 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200753
754 /* generate replacement */
755 for (i = collstart; i < collend; ++i) {
756 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200757 *str++ = '\\';
758 if (ch >= 0x00010000) {
759 *str++ = 'U';
760 *str++ = Py_hexdigits[(ch>>28)&0xf];
761 *str++ = Py_hexdigits[(ch>>24)&0xf];
762 *str++ = Py_hexdigits[(ch>>20)&0xf];
763 *str++ = Py_hexdigits[(ch>>16)&0xf];
764 *str++ = Py_hexdigits[(ch>>12)&0xf];
765 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200766 }
Victor Stinner797485e2015-10-09 03:17:30 +0200767 else if (ch >= 0x100) {
768 *str++ = 'u';
769 *str++ = Py_hexdigits[(ch>>12)&0xf];
770 *str++ = Py_hexdigits[(ch>>8)&0xf];
771 }
772 else
773 *str++ = 'x';
774 *str++ = Py_hexdigits[(ch>>4)&0xf];
775 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200776 }
777 return str;
778}
779
780/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
781 ASCII, Latin1, UTF-8, etc. */
782static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200783xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200784 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
785{
Victor Stinnerad771582015-10-09 12:38:53 +0200786 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200787 Py_UCS4 ch;
788 enum PyUnicode_Kind kind;
789 void *data;
790
791 assert(PyUnicode_IS_READY(unicode));
792 kind = PyUnicode_KIND(unicode);
793 data = PyUnicode_DATA(unicode);
794
795 size = 0;
796 /* determine replacement size */
797 for (i = collstart; i < collend; ++i) {
798 Py_ssize_t incr;
799
800 ch = PyUnicode_READ(kind, data, i);
801 if (ch < 10)
802 incr = 2+1+1;
803 else if (ch < 100)
804 incr = 2+2+1;
805 else if (ch < 1000)
806 incr = 2+3+1;
807 else if (ch < 10000)
808 incr = 2+4+1;
809 else if (ch < 100000)
810 incr = 2+5+1;
811 else if (ch < 1000000)
812 incr = 2+6+1;
813 else {
814 assert(ch <= MAX_UNICODE);
815 incr = 2+7+1;
816 }
817 if (size > PY_SSIZE_T_MAX - incr) {
818 PyErr_SetString(PyExc_OverflowError,
819 "encoded result is too long for a Python string");
820 return NULL;
821 }
822 size += incr;
823 }
824
Victor Stinnerad771582015-10-09 12:38:53 +0200825 str = _PyBytesWriter_Prepare(writer, str, size);
826 if (str == NULL)
827 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200828
829 /* generate replacement */
830 for (i = collstart; i < collend; ++i) {
831 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
832 }
833 return str;
834}
835
Thomas Wouters477c8d52006-05-27 19:21:47 +0000836/* --- Bloom Filters ----------------------------------------------------- */
837
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask of BLOOM_WIDTH bits, using
   the low bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the
   bit index. */
841
842/* the linebreak mask is set up by Unicode_Init below */
843
/* Number of bits used in a bloom mask: the largest of 128, 64 and 32 that
   does not exceed LONG_BIT, since the mask is stored in an unsigned long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Mask for line break characters; starts with every bit set. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that an expression such as `a | b`
   can safely be passed as the mask. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up in ascii_linebreak[];
   for the others the bloom mask is checked first and
   Py_UNICODE_ISLINEBREAK() is only called when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000863
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700864static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000866{
Victor Stinnera85af502013-04-09 21:53:54 +0200867#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
868 do { \
869 TYPE *data = (TYPE *)PTR; \
870 TYPE *end = data + LEN; \
871 Py_UCS4 ch; \
872 for (; data != end; data++) { \
873 ch = *data; \
874 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
875 } \
876 break; \
877 } while (0)
878
Thomas Wouters477c8d52006-05-27 19:21:47 +0000879 /* calculate simple bloom-style bitmask for a given unicode string */
880
Antoine Pitrouf068f942010-01-13 14:19:12 +0000881 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000882
883 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200884 switch (kind) {
885 case PyUnicode_1BYTE_KIND:
886 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
887 break;
888 case PyUnicode_2BYTE_KIND:
889 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
890 break;
891 case PyUnicode_4BYTE_KIND:
892 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
893 break;
894 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700895 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200896 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000897 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200898
899#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000900}
901
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300902static int
903ensure_unicode(PyObject *obj)
904{
905 if (!PyUnicode_Check(obj)) {
906 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200907 "must be str, not %.100s",
908 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300909 return -1;
910 }
911 return PyUnicode_READY(obj);
912}
913
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200914/* Compilation of templated routines */
915
916#include "stringlib/asciilib.h"
917#include "stringlib/fastsearch.h"
918#include "stringlib/partition.h"
919#include "stringlib/split.h"
920#include "stringlib/count.h"
921#include "stringlib/find.h"
922#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200923#include "stringlib/undef.h"
924
925#include "stringlib/ucs1lib.h"
926#include "stringlib/fastsearch.h"
927#include "stringlib/partition.h"
928#include "stringlib/split.h"
929#include "stringlib/count.h"
930#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300931#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200932#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200933#include "stringlib/undef.h"
934
935#include "stringlib/ucs2lib.h"
936#include "stringlib/fastsearch.h"
937#include "stringlib/partition.h"
938#include "stringlib/split.h"
939#include "stringlib/count.h"
940#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300941#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200942#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200943#include "stringlib/undef.h"
944
945#include "stringlib/ucs4lib.h"
946#include "stringlib/fastsearch.h"
947#include "stringlib/partition.h"
948#include "stringlib/split.h"
949#include "stringlib/count.h"
950#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300951#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200952#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200953#include "stringlib/undef.h"
954
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200955#include "stringlib/unicodedefs.h"
956#include "stringlib/fastsearch.h"
957#include "stringlib/count.h"
958#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100959#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200960
Guido van Rossumd57fd912000-03-10 22:53:23 +0000961/* --- Unicode Object ----------------------------------------------------- */
962
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700963static inline Py_ssize_t
964findchar(const void *s, int kind,
965 Py_ssize_t size, Py_UCS4 ch,
966 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200968 switch (kind) {
969 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200970 if ((Py_UCS1) ch != ch)
971 return -1;
972 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600973 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200974 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600975 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200976 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200977 if ((Py_UCS2) ch != ch)
978 return -1;
979 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600980 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200981 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600982 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200983 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200984 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600985 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200986 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600987 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200988 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700989 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200990 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200991}
992
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten (every byte is set to 0xff); nothing happens when the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1011
Victor Stinnerfe226c02011-10-03 03:52:20 +02001012static PyObject*
1013resize_compact(PyObject *unicode, Py_ssize_t length)
1014{
1015 Py_ssize_t char_size;
1016 Py_ssize_t struct_size;
1017 Py_ssize_t new_size;
1018 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001019 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001020#ifdef Py_DEBUG
1021 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1022#endif
1023
Victor Stinner79891572012-05-03 13:43:07 +02001024 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001026 assert(PyUnicode_IS_COMPACT(unicode));
1027
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001028 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001029 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 struct_size = sizeof(PyASCIIObject);
1031 else
1032 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001033 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034
Victor Stinnerfe226c02011-10-03 03:52:20 +02001035 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1036 PyErr_NoMemory();
1037 return NULL;
1038 }
1039 new_size = (struct_size + (length + 1) * char_size);
1040
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001041 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1042 PyObject_DEL(_PyUnicode_UTF8(unicode));
1043 _PyUnicode_UTF8(unicode) = NULL;
1044 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1045 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001046#ifdef Py_REF_DEBUG
1047 _Py_RefTotal--;
1048#endif
1049#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001050 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001051#endif
Victor Stinner84def372011-12-11 20:04:56 +01001052
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001053 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001054 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001055 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 PyErr_NoMemory();
1057 return NULL;
1058 }
Victor Stinner84def372011-12-11 20:04:56 +01001059 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001060 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001061
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001063 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001064 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001065 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001066 _PyUnicode_WSTR_LENGTH(unicode) = length;
1067 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001068 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1069 PyObject_DEL(_PyUnicode_WSTR(unicode));
1070 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001071 if (!PyUnicode_IS_ASCII(unicode))
1072 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001073 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001074#ifdef Py_DEBUG
1075 unicode_fill_invalid(unicode, old_length);
1076#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001077 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1078 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001079 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 return unicode;
1081}
1082
Alexander Belopolsky40018472011-02-26 01:02:56 +00001083static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001084resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001085{
Victor Stinner95663112011-10-04 01:03:50 +02001086 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001087 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001089 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001090
Victor Stinnerfe226c02011-10-03 03:52:20 +02001091 if (PyUnicode_IS_READY(unicode)) {
1092 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001093 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001094 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001095#ifdef Py_DEBUG
1096 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1097#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001098
1099 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001100 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001101 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1102 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001103
1104 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1105 PyErr_NoMemory();
1106 return -1;
1107 }
1108 new_size = (length + 1) * char_size;
1109
Victor Stinner7a9105a2011-12-12 00:13:42 +01001110 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1111 {
1112 PyObject_DEL(_PyUnicode_UTF8(unicode));
1113 _PyUnicode_UTF8(unicode) = NULL;
1114 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1115 }
1116
Victor Stinnerfe226c02011-10-03 03:52:20 +02001117 data = (PyObject *)PyObject_REALLOC(data, new_size);
1118 if (data == NULL) {
1119 PyErr_NoMemory();
1120 return -1;
1121 }
1122 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001123 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001124 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001125 _PyUnicode_WSTR_LENGTH(unicode) = length;
1126 }
1127 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001128 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001129 _PyUnicode_UTF8_LENGTH(unicode) = length;
1130 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001131 _PyUnicode_LENGTH(unicode) = length;
1132 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001133#ifdef Py_DEBUG
1134 unicode_fill_invalid(unicode, old_length);
1135#endif
Victor Stinner95663112011-10-04 01:03:50 +02001136 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001137 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001138 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001139 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001140 }
Victor Stinner95663112011-10-04 01:03:50 +02001141 assert(_PyUnicode_WSTR(unicode) != NULL);
1142
1143 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001144 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001145 PyErr_NoMemory();
1146 return -1;
1147 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001148 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001149 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001150 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001151 if (!wstr) {
1152 PyErr_NoMemory();
1153 return -1;
1154 }
1155 _PyUnicode_WSTR(unicode) = wstr;
1156 _PyUnicode_WSTR(unicode)[length] = 0;
1157 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001158 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 return 0;
1160}
1161
Victor Stinnerfe226c02011-10-03 03:52:20 +02001162static PyObject*
1163resize_copy(PyObject *unicode, Py_ssize_t length)
1164{
1165 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001166 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001167 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001168
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001169 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001170
1171 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1172 if (copy == NULL)
1173 return NULL;
1174
1175 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001176 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001177 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001178 }
1179 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001180 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001181
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001182 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001183 if (w == NULL)
1184 return NULL;
1185 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1186 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001187 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001188 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001189 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001190 }
1191}
1192
/* We allocate one more character (not just one more byte) to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
1201
Alexander Belopolsky40018472011-02-26 01:02:56 +00001202static PyUnicodeObject *
1203_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001205 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001206 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001207
Thomas Wouters477c8d52006-05-27 19:21:47 +00001208 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 if (length == 0 && unicode_empty != NULL) {
1210 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001211 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001212 }
1213
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001214 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001215 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001216 return (PyUnicodeObject *)PyErr_NoMemory();
1217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 if (length < 0) {
1219 PyErr_SetString(PyExc_SystemError,
1220 "Negative size passed to _PyUnicode_New");
1221 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222 }
1223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1225 if (unicode == NULL)
1226 return NULL;
1227 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001228
1229 _PyUnicode_WSTR_LENGTH(unicode) = length;
1230 _PyUnicode_HASH(unicode) = -1;
1231 _PyUnicode_STATE(unicode).interned = 0;
1232 _PyUnicode_STATE(unicode).kind = 0;
1233 _PyUnicode_STATE(unicode).compact = 0;
1234 _PyUnicode_STATE(unicode).ready = 0;
1235 _PyUnicode_STATE(unicode).ascii = 0;
1236 _PyUnicode_DATA_ANY(unicode) = NULL;
1237 _PyUnicode_LENGTH(unicode) = 0;
1238 _PyUnicode_UTF8(unicode) = NULL;
1239 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1242 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001243 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001245 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247
Jeremy Hyltond8082792003-09-16 19:41:39 +00001248 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001249 * the caller fails before initializing str -- unicode_resize()
1250 * reads str[0], and the Keep-Alive optimization can keep memory
1251 * allocated for str alive across a call to unicode_dealloc(unicode).
1252 * We don't want unicode_resize to read uninitialized memory in
1253 * that case.
1254 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001255 _PyUnicode_WSTR(unicode)[0] = 0;
1256 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001257
Victor Stinner7931d9a2011-11-04 00:22:48 +01001258 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259 return unicode;
1260}
1261
Victor Stinnerf42dc442011-10-02 23:33:16 +02001262static const char*
1263unicode_kind_name(PyObject *unicode)
1264{
Victor Stinner42dfd712011-10-03 14:41:45 +02001265 /* don't check consistency: unicode_kind_name() is called from
1266 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001267 if (!PyUnicode_IS_COMPACT(unicode))
1268 {
1269 if (!PyUnicode_IS_READY(unicode))
1270 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001271 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001272 {
1273 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001274 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001275 return "legacy ascii";
1276 else
1277 return "legacy latin1";
1278 case PyUnicode_2BYTE_KIND:
1279 return "legacy UCS2";
1280 case PyUnicode_4BYTE_KIND:
1281 return "legacy UCS4";
1282 default:
1283 return "<legacy invalid kind>";
1284 }
1285 }
1286 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001287 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001288 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001289 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001290 return "ascii";
1291 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001292 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001293 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001294 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001295 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001296 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001297 default:
1298 return "<invalid compact kind>";
1299 }
1300}
1301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001302#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001304char *_PyUnicode_utf8(void *unicode_raw){
1305 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001306 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307}
1308
Victor Stinnera42de742018-11-22 10:25:22 +01001309void *_PyUnicode_compact_data(void *unicode_raw) {
1310 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 return _PyUnicode_COMPACT_DATA(unicode);
1312}
Victor Stinnera42de742018-11-22 10:25:22 +01001313void *_PyUnicode_data(void *unicode_raw) {
1314 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001315 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1317 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1318 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1319 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1320 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1321 return PyUnicode_DATA(unicode);
1322}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001323
1324void
1325_PyUnicode_Dump(PyObject *op)
1326{
1327 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001328 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1329 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1330 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001331
Victor Stinnera849a4b2011-10-03 12:12:11 +02001332 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001333 {
1334 if (ascii->state.ascii)
1335 data = (ascii + 1);
1336 else
1337 data = (compact + 1);
1338 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001339 else
1340 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001341 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1342 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001343
Victor Stinnera849a4b2011-10-03 12:12:11 +02001344 if (ascii->wstr == data)
1345 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001346 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001347
Victor Stinnera3b334d2011-10-03 13:53:37 +02001348 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001349 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001350 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1351 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001352 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001353 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001354 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001355 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001356}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357#endif
1358
1359PyObject *
1360PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1361{
1362 PyObject *obj;
1363 PyCompactUnicodeObject *unicode;
1364 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001365 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001366 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 Py_ssize_t char_size;
1368 Py_ssize_t struct_size;
1369
1370 /* Optimization for empty strings */
1371 if (size == 0 && unicode_empty != NULL) {
1372 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001373 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374 }
1375
Victor Stinner9e9d6892011-10-04 01:02:02 +02001376 is_ascii = 0;
1377 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378 struct_size = sizeof(PyCompactUnicodeObject);
1379 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001380 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 char_size = 1;
1382 is_ascii = 1;
1383 struct_size = sizeof(PyASCIIObject);
1384 }
1385 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001386 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 char_size = 1;
1388 }
1389 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001390 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 char_size = 2;
1392 if (sizeof(wchar_t) == 2)
1393 is_sharing = 1;
1394 }
1395 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001396 if (maxchar > MAX_UNICODE) {
1397 PyErr_SetString(PyExc_SystemError,
1398 "invalid maximum character passed to PyUnicode_New");
1399 return NULL;
1400 }
Victor Stinner8f825062012-04-27 13:55:39 +02001401 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402 char_size = 4;
1403 if (sizeof(wchar_t) == 4)
1404 is_sharing = 1;
1405 }
1406
1407 /* Ensure we won't overflow the size. */
1408 if (size < 0) {
1409 PyErr_SetString(PyExc_SystemError,
1410 "Negative size passed to PyUnicode_New");
1411 return NULL;
1412 }
1413 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1414 return PyErr_NoMemory();
1415
1416 /* Duplicated allocation code from _PyObject_New() instead of a call to
1417 * PyObject_New() so we are able to allocate space for the object and
1418 * it's data buffer.
1419 */
1420 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1421 if (obj == NULL)
1422 return PyErr_NoMemory();
1423 obj = PyObject_INIT(obj, &PyUnicode_Type);
1424 if (obj == NULL)
1425 return NULL;
1426
1427 unicode = (PyCompactUnicodeObject *)obj;
1428 if (is_ascii)
1429 data = ((PyASCIIObject*)obj) + 1;
1430 else
1431 data = unicode + 1;
1432 _PyUnicode_LENGTH(unicode) = size;
1433 _PyUnicode_HASH(unicode) = -1;
1434 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001435 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436 _PyUnicode_STATE(unicode).compact = 1;
1437 _PyUnicode_STATE(unicode).ready = 1;
1438 _PyUnicode_STATE(unicode).ascii = is_ascii;
1439 if (is_ascii) {
1440 ((char*)data)[size] = 0;
1441 _PyUnicode_WSTR(unicode) = NULL;
1442 }
Victor Stinner8f825062012-04-27 13:55:39 +02001443 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 ((char*)data)[size] = 0;
1445 _PyUnicode_WSTR(unicode) = NULL;
1446 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001448 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 else {
1451 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001452 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001453 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001454 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001455 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 ((Py_UCS4*)data)[size] = 0;
1457 if (is_sharing) {
1458 _PyUnicode_WSTR_LENGTH(unicode) = size;
1459 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1460 }
1461 else {
1462 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1463 _PyUnicode_WSTR(unicode) = NULL;
1464 }
1465 }
Victor Stinner8f825062012-04-27 13:55:39 +02001466#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001467 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001468#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001469 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 return obj;
1471}
1472
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into one code point; every other
   unit (including lone surrogates) is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            || (src+1) >= end
            || !Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Not a complete surrogate pair: copy the unit as is. */
            *dst++ = *src++;
        }
        else {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1512
Victor Stinnercd9950f2011-10-02 00:34:53 +02001513static int
Victor Stinner488fa492011-12-12 00:01:39 +01001514unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001515{
Victor Stinner488fa492011-12-12 00:01:39 +01001516 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001517 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001518 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001519 return -1;
1520 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001521 return 0;
1522}
1523
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524static int
1525_copy_characters(PyObject *to, Py_ssize_t to_start,
1526 PyObject *from, Py_ssize_t from_start,
1527 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001528{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001529 unsigned int from_kind, to_kind;
1530 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531
Victor Stinneree4544c2012-05-09 22:24:08 +02001532 assert(0 <= how_many);
1533 assert(0 <= from_start);
1534 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001535 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001536 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001537 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538
Victor Stinnerd3f08822012-05-29 12:57:52 +02001539 assert(PyUnicode_Check(to));
1540 assert(PyUnicode_IS_READY(to));
1541 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1542
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001543 if (how_many == 0)
1544 return 0;
1545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001546 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001547 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001549 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550
Victor Stinnerf1852262012-06-16 16:38:26 +02001551#ifdef Py_DEBUG
1552 if (!check_maxchar
1553 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1554 {
1555 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1556 Py_UCS4 ch;
1557 Py_ssize_t i;
1558 for (i=0; i < how_many; i++) {
1559 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1560 assert(ch <= to_maxchar);
1561 }
1562 }
1563#endif
1564
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001565 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001566 if (check_maxchar
1567 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1568 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001569 /* Writing Latin-1 characters into an ASCII string requires to
1570 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001571 Py_UCS4 max_char;
1572 max_char = ucs1lib_find_max_char(from_data,
1573 (Py_UCS1*)from_data + how_many);
1574 if (max_char >= 128)
1575 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001576 }
Christian Heimesf051e432016-09-13 20:22:02 +02001577 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001578 (char*)from_data + from_kind * from_start,
1579 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001581 else if (from_kind == PyUnicode_1BYTE_KIND
1582 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001583 {
1584 _PyUnicode_CONVERT_BYTES(
1585 Py_UCS1, Py_UCS2,
1586 PyUnicode_1BYTE_DATA(from) + from_start,
1587 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1588 PyUnicode_2BYTE_DATA(to) + to_start
1589 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001590 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001591 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001592 && to_kind == PyUnicode_4BYTE_KIND)
1593 {
1594 _PyUnicode_CONVERT_BYTES(
1595 Py_UCS1, Py_UCS4,
1596 PyUnicode_1BYTE_DATA(from) + from_start,
1597 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1598 PyUnicode_4BYTE_DATA(to) + to_start
1599 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001600 }
1601 else if (from_kind == PyUnicode_2BYTE_KIND
1602 && to_kind == PyUnicode_4BYTE_KIND)
1603 {
1604 _PyUnicode_CONVERT_BYTES(
1605 Py_UCS2, Py_UCS4,
1606 PyUnicode_2BYTE_DATA(from) + from_start,
1607 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1608 PyUnicode_4BYTE_DATA(to) + to_start
1609 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001610 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001611 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001612 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1613
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001614 if (!check_maxchar) {
1615 if (from_kind == PyUnicode_2BYTE_KIND
1616 && to_kind == PyUnicode_1BYTE_KIND)
1617 {
1618 _PyUnicode_CONVERT_BYTES(
1619 Py_UCS2, Py_UCS1,
1620 PyUnicode_2BYTE_DATA(from) + from_start,
1621 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1622 PyUnicode_1BYTE_DATA(to) + to_start
1623 );
1624 }
1625 else if (from_kind == PyUnicode_4BYTE_KIND
1626 && to_kind == PyUnicode_1BYTE_KIND)
1627 {
1628 _PyUnicode_CONVERT_BYTES(
1629 Py_UCS4, Py_UCS1,
1630 PyUnicode_4BYTE_DATA(from) + from_start,
1631 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1632 PyUnicode_1BYTE_DATA(to) + to_start
1633 );
1634 }
1635 else if (from_kind == PyUnicode_4BYTE_KIND
1636 && to_kind == PyUnicode_2BYTE_KIND)
1637 {
1638 _PyUnicode_CONVERT_BYTES(
1639 Py_UCS4, Py_UCS2,
1640 PyUnicode_4BYTE_DATA(from) + from_start,
1641 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1642 PyUnicode_2BYTE_DATA(to) + to_start
1643 );
1644 }
1645 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001646 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001647 }
1648 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001649 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001650 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001651 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001652 Py_ssize_t i;
1653
Victor Stinnera0702ab2011-09-29 14:14:38 +02001654 for (i=0; i < how_many; i++) {
1655 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001656 if (ch > to_maxchar)
1657 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001658 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1659 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001660 }
1661 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001662 return 0;
1663}
1664
Victor Stinnerd3f08822012-05-29 12:57:52 +02001665void
1666_PyUnicode_FastCopyCharacters(
1667 PyObject *to, Py_ssize_t to_start,
1668 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001669{
1670 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1671}
1672
1673Py_ssize_t
1674PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1675 PyObject *from, Py_ssize_t from_start,
1676 Py_ssize_t how_many)
1677{
1678 int err;
1679
1680 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1681 PyErr_BadInternalCall();
1682 return -1;
1683 }
1684
Benjamin Petersonbac79492012-01-14 13:34:47 -05001685 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001686 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001687 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001688 return -1;
1689
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001690 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001691 PyErr_SetString(PyExc_IndexError, "string index out of range");
1692 return -1;
1693 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001694 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001695 PyErr_SetString(PyExc_IndexError, "string index out of range");
1696 return -1;
1697 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001698 if (how_many < 0) {
1699 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1700 return -1;
1701 }
1702 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001703 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1704 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001705 "Cannot write %zi characters at %zi "
1706 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001707 how_many, to_start, PyUnicode_GET_LENGTH(to));
1708 return -1;
1709 }
1710
1711 if (how_many == 0)
1712 return 0;
1713
Victor Stinner488fa492011-12-12 00:01:39 +01001714 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001715 return -1;
1716
1717 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1718 if (err) {
1719 PyErr_Format(PyExc_SystemError,
1720 "Cannot copy %s characters "
1721 "into a string of %s characters",
1722 unicode_kind_name(from),
1723 unicode_kind_name(to));
1724 return -1;
1725 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001726 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001727}
1728
Victor Stinner17222162011-09-28 22:15:37 +02001729/* Find the maximum code point and count the number of surrogate pairs so a
1730 correct string length can be computed before converting a string to UCS4.
1731 This function counts single surrogates as a character and not as a pair.
1732
1733 Return 0 on success, or -1 on error. */
1734static int
1735find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1736 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737{
1738 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001739 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740
Victor Stinnerc53be962011-10-02 21:33:54 +02001741 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001742 *num_surrogates = 0;
1743 *maxchar = 0;
1744
1745 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001747 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1748 && (iter+1) < end
1749 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1750 {
1751 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1752 ++(*num_surrogates);
1753 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 }
1755 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001757 {
1758 ch = *iter;
1759 iter++;
1760 }
1761 if (ch > *maxchar) {
1762 *maxchar = ch;
1763 if (*maxchar > MAX_UNICODE) {
1764 PyErr_Format(PyExc_ValueError,
1765 "character U+%x is not in range [U+0000; U+10ffff]",
1766 ch);
1767 return -1;
1768 }
1769 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 }
1771 return 0;
1772}
1773
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001774int
1775_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776{
1777 wchar_t *end;
1778 Py_UCS4 maxchar = 0;
1779 Py_ssize_t num_surrogates;
1780#if SIZEOF_WCHAR_T == 2
1781 Py_ssize_t length_wo_surrogates;
1782#endif
1783
Georg Brandl7597add2011-10-05 16:36:47 +02001784 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001785 strings were created using _PyObject_New() and where no canonical
1786 representation (the str field) has been set yet aka strings
1787 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001788 assert(_PyUnicode_CHECK(unicode));
1789 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001791 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001792 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001793 /* Actually, it should neither be interned nor be anything else: */
1794 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001797 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001798 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800
1801 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001802 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1803 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001804 PyErr_NoMemory();
1805 return -1;
1806 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001807 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808 _PyUnicode_WSTR(unicode), end,
1809 PyUnicode_1BYTE_DATA(unicode));
1810 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1811 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1812 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1813 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001814 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001815 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001816 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 }
1818 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001819 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001820 _PyUnicode_UTF8(unicode) = NULL;
1821 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 }
1823 PyObject_FREE(_PyUnicode_WSTR(unicode));
1824 _PyUnicode_WSTR(unicode) = NULL;
1825 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1826 }
1827 /* In this case we might have to convert down from 4-byte native
1828 wchar_t to 2-byte unicode. */
1829 else if (maxchar < 65536) {
1830 assert(num_surrogates == 0 &&
1831 "FindMaxCharAndNumSurrogatePairs() messed up");
1832
Victor Stinner506f5922011-09-28 22:34:18 +02001833#if SIZEOF_WCHAR_T == 2
1834 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001835 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001836 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1837 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1838 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001839 _PyUnicode_UTF8(unicode) = NULL;
1840 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001841#else
1842 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001843 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001844 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001845 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001846 PyErr_NoMemory();
1847 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 }
Victor Stinner506f5922011-09-28 22:34:18 +02001849 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1850 _PyUnicode_WSTR(unicode), end,
1851 PyUnicode_2BYTE_DATA(unicode));
1852 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1853 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1854 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001855 _PyUnicode_UTF8(unicode) = NULL;
1856 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001857 PyObject_FREE(_PyUnicode_WSTR(unicode));
1858 _PyUnicode_WSTR(unicode) = NULL;
1859 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1860#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001861 }
1862 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1863 else {
1864#if SIZEOF_WCHAR_T == 2
1865 /* in case the native representation is 2-bytes, we need to allocate a
1866 new normalized 4-byte version. */
1867 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001868 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1869 PyErr_NoMemory();
1870 return -1;
1871 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001872 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1873 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 PyErr_NoMemory();
1875 return -1;
1876 }
1877 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1878 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001879 _PyUnicode_UTF8(unicode) = NULL;
1880 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001881 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1882 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001883 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884 PyObject_FREE(_PyUnicode_WSTR(unicode));
1885 _PyUnicode_WSTR(unicode) = NULL;
1886 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1887#else
1888 assert(num_surrogates == 0);
1889
Victor Stinnerc3c74152011-10-02 20:39:55 +02001890 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001892 _PyUnicode_UTF8(unicode) = NULL;
1893 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1895#endif
1896 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1897 }
1898 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001899 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 return 0;
1901}
1902
Alexander Belopolsky40018472011-02-26 01:02:56 +00001903static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001904unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001905{
Walter Dörwald16807132007-05-25 13:52:07 +00001906 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001907 case SSTATE_NOT_INTERNED:
1908 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001909
Benjamin Peterson29060642009-01-31 22:14:21 +00001910 case SSTATE_INTERNED_MORTAL:
1911 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001912 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001913 if (PyDict_DelItem(interned, unicode) != 0) {
1914 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1915 NULL);
1916 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001917 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001918
Benjamin Peterson29060642009-01-31 22:14:21 +00001919 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001920 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1921 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001922
Benjamin Peterson29060642009-01-31 22:14:21 +00001923 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001924 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001925 }
1926
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001927 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001928 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001929 }
1930 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001931 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001932 }
1933 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001934 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001935 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001937 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938}
1939
#ifdef Py_DEBUG
/* Return 1 if the string is the empty-string singleton or the cached
   single-character string stored in unicode_latin1[] for its character,
   0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* Only ready strings of length 1 can be a cached character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }
    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1956
Alexander Belopolsky40018472011-02-26 01:02:56 +00001957static int
Victor Stinner488fa492011-12-12 00:01:39 +01001958unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001959{
Victor Stinner488fa492011-12-12 00:01:39 +01001960 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001961 if (Py_REFCNT(unicode) != 1)
1962 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001963 if (_PyUnicode_HASH(unicode) != -1)
1964 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001965 if (PyUnicode_CHECK_INTERNED(unicode))
1966 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001967 if (!PyUnicode_CheckExact(unicode))
1968 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001969#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001970 /* singleton refcount is greater than 1 */
1971 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001972#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001973 return 1;
1974}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001975
Victor Stinnerfe226c02011-10-03 03:52:20 +02001976static int
1977unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1978{
1979 PyObject *unicode;
1980 Py_ssize_t old_length;
1981
1982 assert(p_unicode != NULL);
1983 unicode = *p_unicode;
1984
1985 assert(unicode != NULL);
1986 assert(PyUnicode_Check(unicode));
1987 assert(0 <= length);
1988
Victor Stinner910337b2011-10-03 03:20:16 +02001989 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001990 old_length = PyUnicode_WSTR_LENGTH(unicode);
1991 else
1992 old_length = PyUnicode_GET_LENGTH(unicode);
1993 if (old_length == length)
1994 return 0;
1995
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001996 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001997 _Py_INCREF_UNICODE_EMPTY();
1998 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001999 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002000 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002001 return 0;
2002 }
2003
Victor Stinner488fa492011-12-12 00:01:39 +01002004 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002005 PyObject *copy = resize_copy(unicode, length);
2006 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002007 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002008 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002009 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002010 }
2011
Victor Stinnerfe226c02011-10-03 03:52:20 +02002012 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002013 PyObject *new_unicode = resize_compact(unicode, length);
2014 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002015 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002016 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002017 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002018 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002019 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002020}
2021
Alexander Belopolsky40018472011-02-26 01:02:56 +00002022int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002023PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002024{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002025 PyObject *unicode;
2026 if (p_unicode == NULL) {
2027 PyErr_BadInternalCall();
2028 return -1;
2029 }
2030 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002031 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002032 {
2033 PyErr_BadInternalCall();
2034 return -1;
2035 }
2036 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002037}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002038
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002039/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002040
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002041 WARNING: The function doesn't copy the terminating null character and
2042 doesn't check the maximum character (may write a latin1 character in an
2043 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002044static void
2045unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2046 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002047{
2048 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2049 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002050 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002051
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002052 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002053 switch (kind) {
2054 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002055#ifdef Py_DEBUG
2056 if (PyUnicode_IS_ASCII(unicode)) {
2057 Py_UCS4 maxchar = ucs1lib_find_max_char(
2058 (const Py_UCS1*)str,
2059 (const Py_UCS1*)str + len);
2060 assert(maxchar < 128);
2061 }
2062#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002063 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002064 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002065 }
2066 case PyUnicode_2BYTE_KIND: {
2067 Py_UCS2 *start = (Py_UCS2 *)data + index;
2068 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002069
Victor Stinner184252a2012-06-16 02:57:41 +02002070 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002071 *ucs2 = (Py_UCS2)*str;
2072
2073 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002074 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002075 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002076 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002077 Py_UCS4 *start = (Py_UCS4 *)data + index;
2078 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002079
Victor Stinner184252a2012-06-16 02:57:41 +02002080 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002081 *ucs4 = (Py_UCS4)*str;
2082
2083 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002084 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002085 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002086 default:
2087 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002088 }
2089}
2090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091static PyObject*
2092get_latin1_char(unsigned char ch)
2093{
Victor Stinnera464fc12011-10-02 20:39:30 +02002094 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002096 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002097 if (!unicode)
2098 return NULL;
2099 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002100 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002101 unicode_latin1[ch] = unicode;
2102 }
2103 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002104 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002105}
2106
Victor Stinner985a82a2014-01-03 12:53:47 +01002107static PyObject*
2108unicode_char(Py_UCS4 ch)
2109{
2110 PyObject *unicode;
2111
2112 assert(ch <= MAX_UNICODE);
2113
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002114 if (ch < 256)
2115 return get_latin1_char(ch);
2116
Victor Stinner985a82a2014-01-03 12:53:47 +01002117 unicode = PyUnicode_New(1, ch);
2118 if (unicode == NULL)
2119 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002120
2121 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2122 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002123 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002124 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002125 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2126 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2127 }
2128 assert(_PyUnicode_CheckConsistency(unicode, 1));
2129 return unicode;
2130}
2131
Alexander Belopolsky40018472011-02-26 01:02:56 +00002132PyObject *
2133PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002135 if (u == NULL)
2136 return (PyObject*)_PyUnicode_New(size);
2137
2138 if (size < 0) {
2139 PyErr_BadInternalCall();
2140 return NULL;
2141 }
2142
2143 return PyUnicode_FromWideChar(u, size);
2144}
2145
2146PyObject *
2147PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2148{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002149 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002150 Py_UCS4 maxchar = 0;
2151 Py_ssize_t num_surrogates;
2152
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002153 if (u == NULL && size != 0) {
2154 PyErr_BadInternalCall();
2155 return NULL;
2156 }
2157
2158 if (size == -1) {
2159 size = wcslen(u);
2160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002162 /* If the Unicode data is known at construction time, we can apply
2163 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002166 if (size == 0)
2167 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 /* Single character Unicode objects in the Latin-1 range are
2170 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002171 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 return get_latin1_char((unsigned char)*u);
2173
2174 /* If not empty and not single character, copy the Unicode data
2175 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002176 if (find_maxchar_surrogates(u, u + size,
2177 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 return NULL;
2179
Victor Stinner8faf8212011-12-08 22:14:11 +01002180 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 if (!unicode)
2182 return NULL;
2183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 switch (PyUnicode_KIND(unicode)) {
2185 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002186 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2188 break;
2189 case PyUnicode_2BYTE_KIND:
2190#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002191 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002193 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2195#endif
2196 break;
2197 case PyUnicode_4BYTE_KIND:
2198#if SIZEOF_WCHAR_T == 2
2199 /* This is the only case which has to process surrogates, thus
2200 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002201 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002202#else
2203 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002204 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205#endif
2206 break;
2207 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002208 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002210
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002211 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212}
2213
Alexander Belopolsky40018472011-02-26 01:02:56 +00002214PyObject *
2215PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002216{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 if (size < 0) {
2218 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002219 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002220 return NULL;
2221 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002222 if (u != NULL)
2223 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2224 else
2225 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002226}
2227
Alexander Belopolsky40018472011-02-26 01:02:56 +00002228PyObject *
2229PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002230{
2231 size_t size = strlen(u);
2232 if (size > PY_SSIZE_T_MAX) {
2233 PyErr_SetString(PyExc_OverflowError, "input too long");
2234 return NULL;
2235 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002236 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002237}
2238
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002239PyObject *
2240_PyUnicode_FromId(_Py_Identifier *id)
2241{
2242 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002243 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2244 strlen(id->string),
2245 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002246 if (!id->object)
2247 return NULL;
2248 PyUnicode_InternInPlace(&id->object);
2249 assert(!id->next);
2250 id->next = static_strings;
2251 static_strings = id;
2252 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002253 return id->object;
2254}
2255
2256void
2257_PyUnicode_ClearStaticStrings()
2258{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002259 _Py_Identifier *tmp, *s = static_strings;
2260 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002261 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002262 tmp = s->next;
2263 s->next = NULL;
2264 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002265 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002266 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002267}
2268
Benjamin Peterson0df54292012-03-26 14:50:32 -04002269/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002270
Victor Stinnerd3f08822012-05-29 12:57:52 +02002271PyObject*
2272_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002273{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002274 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002275 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002276 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002277#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002278 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002279#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002280 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002281 }
Victor Stinner785938e2011-12-11 20:09:03 +01002282 unicode = PyUnicode_New(size, 127);
2283 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002284 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002285 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2286 assert(_PyUnicode_CheckConsistency(unicode, 1));
2287 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002288}
2289
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002290static Py_UCS4
2291kind_maxchar_limit(unsigned int kind)
2292{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002293 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002294 case PyUnicode_1BYTE_KIND:
2295 return 0x80;
2296 case PyUnicode_2BYTE_KIND:
2297 return 0x100;
2298 case PyUnicode_4BYTE_KIND:
2299 return 0x10000;
2300 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002301 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002302 }
2303}
2304
Victor Stinner702c7342011-10-05 13:50:52 +02002305static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002306_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002307{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002308 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002309 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002310
Serhiy Storchaka678db842013-01-26 12:16:36 +02002311 if (size == 0)
2312 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002313 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002314 if (size == 1)
2315 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002316
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002317 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002318 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002319 if (!res)
2320 return NULL;
2321 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002322 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002323 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002324}
2325
Victor Stinnere57b1c02011-09-28 22:20:48 +02002326static PyObject*
2327_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002328{
2329 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002330 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002331
Serhiy Storchaka678db842013-01-26 12:16:36 +02002332 if (size == 0)
2333 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002334 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002335 if (size == 1)
2336 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002337
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002338 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002339 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002340 if (!res)
2341 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002342 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002344 else {
2345 _PyUnicode_CONVERT_BYTES(
2346 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2347 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002348 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002349 return res;
2350}
2351
Victor Stinnere57b1c02011-09-28 22:20:48 +02002352static PyObject*
2353_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354{
2355 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002356 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002357
Serhiy Storchaka678db842013-01-26 12:16:36 +02002358 if (size == 0)
2359 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002360 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002361 if (size == 1)
2362 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002363
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002364 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002365 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002366 if (!res)
2367 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002368 if (max_char < 256)
2369 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2370 PyUnicode_1BYTE_DATA(res));
2371 else if (max_char < 0x10000)
2372 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2373 PyUnicode_2BYTE_DATA(res));
2374 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002376 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002377 return res;
2378}
2379
2380PyObject*
2381PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2382{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002383 if (size < 0) {
2384 PyErr_SetString(PyExc_ValueError, "size must be positive");
2385 return NULL;
2386 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002387 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002388 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002389 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002391 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002393 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002394 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002395 PyErr_SetString(PyExc_SystemError, "invalid kind");
2396 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002397 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398}
2399
Victor Stinnerece58de2012-04-23 23:36:38 +02002400Py_UCS4
2401_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2402{
2403 enum PyUnicode_Kind kind;
2404 void *startptr, *endptr;
2405
2406 assert(PyUnicode_IS_READY(unicode));
2407 assert(0 <= start);
2408 assert(end <= PyUnicode_GET_LENGTH(unicode));
2409 assert(start <= end);
2410
2411 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2412 return PyUnicode_MAX_CHAR_VALUE(unicode);
2413
2414 if (start == end)
2415 return 127;
2416
Victor Stinner94d558b2012-04-27 22:26:58 +02002417 if (PyUnicode_IS_ASCII(unicode))
2418 return 127;
2419
Victor Stinnerece58de2012-04-23 23:36:38 +02002420 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002421 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002422 endptr = (char *)startptr + end * kind;
2423 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002424 switch(kind) {
2425 case PyUnicode_1BYTE_KIND:
2426 return ucs1lib_find_max_char(startptr, endptr);
2427 case PyUnicode_2BYTE_KIND:
2428 return ucs2lib_find_max_char(startptr, endptr);
2429 case PyUnicode_4BYTE_KIND:
2430 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002431 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002432 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002433 }
2434}
2435
Victor Stinner25a4b292011-10-06 12:31:55 +02002436/* Ensure that a string uses the most efficient storage, if it is not the
2437 case: create a new string with of the right kind. Write NULL into *p_unicode
2438 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002439static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002440unicode_adjust_maxchar(PyObject **p_unicode)
2441{
2442 PyObject *unicode, *copy;
2443 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002444 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002445 unsigned int kind;
2446
2447 assert(p_unicode != NULL);
2448 unicode = *p_unicode;
2449 assert(PyUnicode_IS_READY(unicode));
2450 if (PyUnicode_IS_ASCII(unicode))
2451 return;
2452
2453 len = PyUnicode_GET_LENGTH(unicode);
2454 kind = PyUnicode_KIND(unicode);
2455 if (kind == PyUnicode_1BYTE_KIND) {
2456 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002457 max_char = ucs1lib_find_max_char(u, u + len);
2458 if (max_char >= 128)
2459 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002460 }
2461 else if (kind == PyUnicode_2BYTE_KIND) {
2462 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002463 max_char = ucs2lib_find_max_char(u, u + len);
2464 if (max_char >= 256)
2465 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002466 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002467 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002468 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002469 max_char = ucs4lib_find_max_char(u, u + len);
2470 if (max_char >= 0x10000)
2471 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002472 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002473 else
2474 Py_UNREACHABLE();
2475
Victor Stinner25a4b292011-10-06 12:31:55 +02002476 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002477 if (copy != NULL)
2478 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002479 Py_DECREF(unicode);
2480 *p_unicode = copy;
2481}
2482
Victor Stinner034f6cf2011-09-30 02:26:44 +02002483PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002484_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002485{
Victor Stinner87af4f22011-11-21 23:03:47 +01002486 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002487 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002488
Victor Stinner034f6cf2011-09-30 02:26:44 +02002489 if (!PyUnicode_Check(unicode)) {
2490 PyErr_BadInternalCall();
2491 return NULL;
2492 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002493 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002494 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002495
Victor Stinner87af4f22011-11-21 23:03:47 +01002496 length = PyUnicode_GET_LENGTH(unicode);
2497 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002498 if (!copy)
2499 return NULL;
2500 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2501
Christian Heimesf051e432016-09-13 20:22:02 +02002502 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002503 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002504 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002505 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002506}
2507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508
Victor Stinnerbc603d12011-10-02 01:00:40 +02002509/* Widen Unicode objects to larger buffers. Don't write terminating null
2510 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002511
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002512static void*
2513unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002514{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002515 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002516
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002517 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002518 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002519 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002520 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002521 if (!result)
2522 return PyErr_NoMemory();
2523 assert(skind == PyUnicode_1BYTE_KIND);
2524 _PyUnicode_CONVERT_BYTES(
2525 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002526 (const Py_UCS1 *)data,
2527 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002528 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002529 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002530 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002531 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002532 if (!result)
2533 return PyErr_NoMemory();
2534 if (skind == PyUnicode_2BYTE_KIND) {
2535 _PyUnicode_CONVERT_BYTES(
2536 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002537 (const Py_UCS2 *)data,
2538 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002539 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002540 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002541 else {
2542 assert(skind == PyUnicode_1BYTE_KIND);
2543 _PyUnicode_CONVERT_BYTES(
2544 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002545 (const Py_UCS1 *)data,
2546 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002547 result);
2548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002549 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002550 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002551 Py_UNREACHABLE();
2552 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554}
2555
2556static Py_UCS4*
2557as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2558 int copy_null)
2559{
2560 int kind;
2561 void *data;
2562 Py_ssize_t len, targetlen;
2563 if (PyUnicode_READY(string) == -1)
2564 return NULL;
2565 kind = PyUnicode_KIND(string);
2566 data = PyUnicode_DATA(string);
2567 len = PyUnicode_GET_LENGTH(string);
2568 targetlen = len;
2569 if (copy_null)
2570 targetlen++;
2571 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002572 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002573 if (!target) {
2574 PyErr_NoMemory();
2575 return NULL;
2576 }
2577 }
2578 else {
2579 if (targetsize < targetlen) {
2580 PyErr_Format(PyExc_SystemError,
2581 "string is longer than the buffer");
2582 if (copy_null && 0 < targetsize)
2583 target[0] = 0;
2584 return NULL;
2585 }
2586 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002587 if (kind == PyUnicode_1BYTE_KIND) {
2588 Py_UCS1 *start = (Py_UCS1 *) data;
2589 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002591 else if (kind == PyUnicode_2BYTE_KIND) {
2592 Py_UCS2 *start = (Py_UCS2 *) data;
2593 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2594 }
2595 else {
2596 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002597 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002599 if (copy_null)
2600 target[len] = 0;
2601 return target;
2602}
2603
2604Py_UCS4*
2605PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2606 int copy_null)
2607{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002608 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 PyErr_BadInternalCall();
2610 return NULL;
2611 }
2612 return as_ucs4(string, target, targetsize, copy_null);
2613}
2614
2615Py_UCS4*
2616PyUnicode_AsUCS4Copy(PyObject *string)
2617{
2618 return as_ucs4(string, NULL, 0, 1);
2619}
2620
Victor Stinner15a11362012-10-06 23:48:20 +02002621/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002622 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2623 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2624#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002625
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002626static int
2627unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2628 Py_ssize_t width, Py_ssize_t precision)
2629{
2630 Py_ssize_t length, fill, arglen;
2631 Py_UCS4 maxchar;
2632
2633 if (PyUnicode_READY(str) == -1)
2634 return -1;
2635
2636 length = PyUnicode_GET_LENGTH(str);
2637 if ((precision == -1 || precision >= length)
2638 && width <= length)
2639 return _PyUnicodeWriter_WriteStr(writer, str);
2640
2641 if (precision != -1)
2642 length = Py_MIN(precision, length);
2643
2644 arglen = Py_MAX(length, width);
2645 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2646 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2647 else
2648 maxchar = writer->maxchar;
2649
2650 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2651 return -1;
2652
2653 if (width > length) {
2654 fill = width - length;
2655 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2656 return -1;
2657 writer->pos += fill;
2658 }
2659
2660 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2661 str, 0, length);
2662 writer->pos += length;
2663 return 0;
2664}
2665
2666static int
Victor Stinner998b8062018-09-12 00:23:25 +02002667unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002668 Py_ssize_t width, Py_ssize_t precision)
2669{
2670 /* UTF-8 */
2671 Py_ssize_t length;
2672 PyObject *unicode;
2673 int res;
2674
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002675 if (precision == -1) {
2676 length = strlen(str);
2677 }
2678 else {
2679 length = 0;
2680 while (length < precision && str[length]) {
2681 length++;
2682 }
2683 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002684 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2685 if (unicode == NULL)
2686 return -1;
2687
2688 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2689 Py_DECREF(unicode);
2690 return res;
2691}
2692
Victor Stinner96865452011-03-01 23:44:09 +00002693static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002694unicode_fromformat_arg(_PyUnicodeWriter *writer,
2695 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002696{
Victor Stinnere215d962012-10-06 23:03:36 +02002697 const char *p;
2698 Py_ssize_t len;
2699 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002700 Py_ssize_t width;
2701 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002702 int longflag;
2703 int longlongflag;
2704 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002705 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002706
2707 p = f;
2708 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002709 zeropad = 0;
2710 if (*f == '0') {
2711 zeropad = 1;
2712 f++;
2713 }
Victor Stinner96865452011-03-01 23:44:09 +00002714
2715 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002716 width = -1;
2717 if (Py_ISDIGIT((unsigned)*f)) {
2718 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002719 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002720 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002721 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002722 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002723 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002724 return NULL;
2725 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002726 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002727 f++;
2728 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002729 }
2730 precision = -1;
2731 if (*f == '.') {
2732 f++;
2733 if (Py_ISDIGIT((unsigned)*f)) {
2734 precision = (*f - '0');
2735 f++;
2736 while (Py_ISDIGIT((unsigned)*f)) {
2737 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2738 PyErr_SetString(PyExc_ValueError,
2739 "precision too big");
2740 return NULL;
2741 }
2742 precision = (precision * 10) + (*f - '0');
2743 f++;
2744 }
2745 }
Victor Stinner96865452011-03-01 23:44:09 +00002746 if (*f == '%') {
2747 /* "%.3%s" => f points to "3" */
2748 f--;
2749 }
2750 }
2751 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002752 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002753 f--;
2754 }
Victor Stinner96865452011-03-01 23:44:09 +00002755
2756 /* Handle %ld, %lu, %lld and %llu. */
2757 longflag = 0;
2758 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002759 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002760 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002761 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002762 longflag = 1;
2763 ++f;
2764 }
Victor Stinner96865452011-03-01 23:44:09 +00002765 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002766 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002767 longlongflag = 1;
2768 f += 2;
2769 }
Victor Stinner96865452011-03-01 23:44:09 +00002770 }
2771 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002772 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002773 size_tflag = 1;
2774 ++f;
2775 }
Victor Stinnere215d962012-10-06 23:03:36 +02002776
2777 if (f[1] == '\0')
2778 writer->overallocate = 0;
2779
2780 switch (*f) {
2781 case 'c':
2782 {
2783 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002784 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002785 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002786 "character argument not in range(0x110000)");
2787 return NULL;
2788 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002789 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002790 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002791 break;
2792 }
2793
2794 case 'i':
2795 case 'd':
2796 case 'u':
2797 case 'x':
2798 {
2799 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002800 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002802
2803 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002804 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002805 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002806 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002807 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002808 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002809 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002810 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002811 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002812 va_arg(*vargs, size_t));
2813 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002814 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002815 va_arg(*vargs, unsigned int));
2816 }
2817 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002818 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002819 }
2820 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002821 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002822 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002823 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002824 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002825 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002826 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002827 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002828 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002829 va_arg(*vargs, Py_ssize_t));
2830 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002831 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002832 va_arg(*vargs, int));
2833 }
2834 assert(len >= 0);
2835
Victor Stinnere215d962012-10-06 23:03:36 +02002836 if (precision < len)
2837 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002838
2839 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002840 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2841 return NULL;
2842
Victor Stinnere215d962012-10-06 23:03:36 +02002843 if (width > precision) {
2844 Py_UCS4 fillchar;
2845 fill = width - precision;
2846 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002847 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2848 return NULL;
2849 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002850 }
Victor Stinner15a11362012-10-06 23:48:20 +02002851 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002852 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002853 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2854 return NULL;
2855 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002856 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002857
Victor Stinner4a587072013-11-19 12:54:53 +01002858 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2859 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002860 break;
2861 }
2862
2863 case 'p':
2864 {
2865 char number[MAX_LONG_LONG_CHARS];
2866
2867 len = sprintf(number, "%p", va_arg(*vargs, void*));
2868 assert(len >= 0);
2869
2870 /* %p is ill-defined: ensure leading 0x. */
2871 if (number[1] == 'X')
2872 number[1] = 'x';
2873 else if (number[1] != 'x') {
2874 memmove(number + 2, number,
2875 strlen(number) + 1);
2876 number[0] = '0';
2877 number[1] = 'x';
2878 len += 2;
2879 }
2880
Victor Stinner4a587072013-11-19 12:54:53 +01002881 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002882 return NULL;
2883 break;
2884 }
2885
2886 case 's':
2887 {
2888 /* UTF-8 */
2889 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002890 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002891 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002892 break;
2893 }
2894
2895 case 'U':
2896 {
2897 PyObject *obj = va_arg(*vargs, PyObject *);
2898 assert(obj && _PyUnicode_CHECK(obj));
2899
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002900 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002901 return NULL;
2902 break;
2903 }
2904
2905 case 'V':
2906 {
2907 PyObject *obj = va_arg(*vargs, PyObject *);
2908 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002909 if (obj) {
2910 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002911 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002912 return NULL;
2913 }
2914 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002915 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002916 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002917 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002918 }
2919 break;
2920 }
2921
2922 case 'S':
2923 {
2924 PyObject *obj = va_arg(*vargs, PyObject *);
2925 PyObject *str;
2926 assert(obj);
2927 str = PyObject_Str(obj);
2928 if (!str)
2929 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002930 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002931 Py_DECREF(str);
2932 return NULL;
2933 }
2934 Py_DECREF(str);
2935 break;
2936 }
2937
2938 case 'R':
2939 {
2940 PyObject *obj = va_arg(*vargs, PyObject *);
2941 PyObject *repr;
2942 assert(obj);
2943 repr = PyObject_Repr(obj);
2944 if (!repr)
2945 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002946 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002947 Py_DECREF(repr);
2948 return NULL;
2949 }
2950 Py_DECREF(repr);
2951 break;
2952 }
2953
2954 case 'A':
2955 {
2956 PyObject *obj = va_arg(*vargs, PyObject *);
2957 PyObject *ascii;
2958 assert(obj);
2959 ascii = PyObject_ASCII(obj);
2960 if (!ascii)
2961 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002962 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002963 Py_DECREF(ascii);
2964 return NULL;
2965 }
2966 Py_DECREF(ascii);
2967 break;
2968 }
2969
2970 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002971 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002972 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002973 break;
2974
2975 default:
2976 /* if we stumble upon an unknown formatting code, copy the rest
2977 of the format string to the output string. (we cannot just
2978 skip the code, since there's no way to know what's in the
2979 argument list) */
2980 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002981 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002982 return NULL;
2983 f = p+len;
2984 return f;
2985 }
2986
2987 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002988 return f;
2989}
2990
Walter Dörwaldd2034312007-05-18 16:29:38 +00002991PyObject *
2992PyUnicode_FromFormatV(const char *format, va_list vargs)
2993{
Victor Stinnere215d962012-10-06 23:03:36 +02002994 va_list vargs2;
2995 const char *f;
2996 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002997
Victor Stinner8f674cc2013-04-17 23:02:17 +02002998 _PyUnicodeWriter_Init(&writer);
2999 writer.min_length = strlen(format) + 100;
3000 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003001
Benjamin Peterson0c212142016-09-20 20:39:33 -07003002 // Copy varags to be able to pass a reference to a subfunction.
3003 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003004
3005 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003006 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003007 f = unicode_fromformat_arg(&writer, f, &vargs2);
3008 if (f == NULL)
3009 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003011 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003012 const char *p;
3013 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003014
Victor Stinnere215d962012-10-06 23:03:36 +02003015 p = f;
3016 do
3017 {
3018 if ((unsigned char)*p > 127) {
3019 PyErr_Format(PyExc_ValueError,
3020 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3021 "string, got a non-ASCII byte: 0x%02x",
3022 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003023 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003024 }
3025 p++;
3026 }
3027 while (*p != '\0' && *p != '%');
3028 len = p - f;
3029
3030 if (*p == '\0')
3031 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003032
3033 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003034 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003035
3036 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003037 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003038 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003039 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003040 return _PyUnicodeWriter_Finish(&writer);
3041
3042 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003043 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003044 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003045 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003046}
3047
Walter Dörwaldd2034312007-05-18 16:29:38 +00003048PyObject *
3049PyUnicode_FromFormat(const char *format, ...)
3050{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003051 PyObject* ret;
3052 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003053
3054#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003055 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003056#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003057 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003058#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003059 ret = PyUnicode_FromFormatV(format, vargs);
3060 va_end(vargs);
3061 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003062}
3063
Serhiy Storchakac46db922018-10-23 22:58:24 +03003064static Py_ssize_t
3065unicode_get_widechar_size(PyObject *unicode)
3066{
3067 Py_ssize_t res;
3068
3069 assert(unicode != NULL);
3070 assert(_PyUnicode_CHECK(unicode));
3071
3072 if (_PyUnicode_WSTR(unicode) != NULL) {
3073 return PyUnicode_WSTR_LENGTH(unicode);
3074 }
3075 assert(PyUnicode_IS_READY(unicode));
3076
3077 res = _PyUnicode_LENGTH(unicode);
3078#if SIZEOF_WCHAR_T == 2
3079 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3080 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3081 const Py_UCS4 *end = s + res;
3082 for (; s < end; ++s) {
3083 if (*s > 0xFFFF) {
3084 ++res;
3085 }
3086 }
3087 }
3088#endif
3089 return res;
3090}
3091
3092static void
3093unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3094{
3095 const wchar_t *wstr;
3096
3097 assert(unicode != NULL);
3098 assert(_PyUnicode_CHECK(unicode));
3099
3100 wstr = _PyUnicode_WSTR(unicode);
3101 if (wstr != NULL) {
3102 memcpy(w, wstr, size * sizeof(wchar_t));
3103 return;
3104 }
3105 assert(PyUnicode_IS_READY(unicode));
3106
3107 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3108 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3109 for (; size--; ++s, ++w) {
3110 *w = *s;
3111 }
3112 }
3113 else {
3114#if SIZEOF_WCHAR_T == 4
3115 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3116 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3117 for (; size--; ++s, ++w) {
3118 *w = *s;
3119 }
3120#else
3121 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3122 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3123 for (; size--; ++s, ++w) {
3124 Py_UCS4 ch = *s;
3125 if (ch > 0xFFFF) {
3126 assert(ch <= MAX_UNICODE);
3127 /* encode surrogate pair in this case */
3128 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3129 if (!size--)
3130 break;
3131 *w = Py_UNICODE_LOW_SURROGATE(ch);
3132 }
3133 else {
3134 *w = ch;
3135 }
3136 }
3137#endif
3138 }
3139}
3140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003141#ifdef HAVE_WCHAR_H
3142
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003143/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003144
Victor Stinnerd88d9832011-09-06 02:00:05 +02003145 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003146 character) required to convert the unicode object. Ignore size argument.
3147
Victor Stinnerd88d9832011-09-06 02:00:05 +02003148 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003149 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003150 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003151Py_ssize_t
3152PyUnicode_AsWideChar(PyObject *unicode,
3153 wchar_t *w,
3154 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003155{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003156 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003157
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003158 if (unicode == NULL) {
3159 PyErr_BadInternalCall();
3160 return -1;
3161 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003162 if (!PyUnicode_Check(unicode)) {
3163 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003164 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003165 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003166
3167 res = unicode_get_widechar_size(unicode);
3168 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003169 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003170 }
3171
3172 if (size > res) {
3173 size = res + 1;
3174 }
3175 else {
3176 res = size;
3177 }
3178 unicode_copy_as_widechar(unicode, w, size);
3179 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003180}
3181
Victor Stinner137c34c2010-09-29 10:25:54 +00003182wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003183PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003184 Py_ssize_t *size)
3185{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003186 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003187 Py_ssize_t buflen;
3188
3189 if (unicode == NULL) {
3190 PyErr_BadInternalCall();
3191 return NULL;
3192 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003193 if (!PyUnicode_Check(unicode)) {
3194 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003195 return NULL;
3196 }
3197
Serhiy Storchakac46db922018-10-23 22:58:24 +03003198 buflen = unicode_get_widechar_size(unicode);
3199 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003200 if (buffer == NULL) {
3201 PyErr_NoMemory();
3202 return NULL;
3203 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003204 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3205 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003206 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003207 }
3208 else if (wcslen(buffer) != (size_t)buflen) {
3209 PyMem_FREE(buffer);
3210 PyErr_SetString(PyExc_ValueError,
3211 "embedded null character");
3212 return NULL;
3213 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003214 return buffer;
3215}
3216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003217#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218
Alexander Belopolsky40018472011-02-26 01:02:56 +00003219PyObject *
3220PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003221{
Victor Stinner8faf8212011-12-08 22:14:11 +01003222 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 PyErr_SetString(PyExc_ValueError,
3224 "chr() arg not in range(0x110000)");
3225 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003226 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003227
Victor Stinner985a82a2014-01-03 12:53:47 +01003228 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003229}
3230
Alexander Belopolsky40018472011-02-26 01:02:56 +00003231PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003232PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003234 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003236 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003237 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003238 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 Py_INCREF(obj);
3240 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003241 }
3242 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 /* For a Unicode subtype that's not a Unicode object,
3244 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003245 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003246 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003247 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003248 "Can't convert '%.100s' object to str implicitly",
3249 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003250 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003251}
3252
Alexander Belopolsky40018472011-02-26 01:02:56 +00003253PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003254PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003255 const char *encoding,
3256 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003257{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003258 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003259 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003260
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003262 PyErr_BadInternalCall();
3263 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003265
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003266 /* Decoding bytes objects is the most common case and should be fast */
3267 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003268 if (PyBytes_GET_SIZE(obj) == 0) {
3269 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3270 return NULL;
3271 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003272 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003273 }
3274 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003275 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3276 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003277 }
3278
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003279 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003280 PyErr_SetString(PyExc_TypeError,
3281 "decoding str is not supported");
3282 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003283 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003284
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003285 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3286 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3287 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003288 "decoding to str: need a bytes-like object, %.80s found",
3289 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003290 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003291 }
Tim Petersced69f82003-09-16 20:30:58 +00003292
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003293 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003294 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003295 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3296 return NULL;
3297 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003298 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003300
Serhiy Storchaka05997252013-01-26 12:14:02 +02003301 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003302 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003303 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304}
3305
/* Normalize an encoding name into the buffer `lower` of `lower_len` bytes:
   similar to encodings.normalize_encoding(), but also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters is replaced by a single '_', except at the very start of
   the name (where it is dropped) and at the end (where it is dropped too,
   since the '_' is only emitted in front of a following kept character).

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out;
    char *out_end;
    int pending_sep;

    assert(encoding != NULL);

    out = lower;
    /* Last usable slot: reserved for the terminating null character. */
    out_end = &lower[lower_len - 1];
    pending_sep = 0;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Separator: emitted lazily, and only once per run. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3354
Alexander Belopolsky40018472011-02-26 01:02:56 +00003355PyObject *
3356PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003357 Py_ssize_t size,
3358 const char *encoding,
3359 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003360{
3361 PyObject *buffer = NULL, *unicode;
3362 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003363 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3364
Victor Stinner22eb6892019-06-26 00:51:05 +02003365 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3366 return NULL;
3367 }
3368
Victor Stinnered076ed2019-06-26 01:49:32 +02003369 if (size == 0) {
3370 _Py_RETURN_UNICODE_EMPTY();
3371 }
3372
Victor Stinner942889a2016-09-05 15:40:10 -07003373 if (encoding == NULL) {
3374 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3375 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003376
Fred Drakee4315f52000-05-09 19:53:39 +00003377 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003378 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3379 char *lower = buflower;
3380
3381 /* Fast paths */
3382 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3383 lower += 3;
3384 if (*lower == '_') {
3385 /* Match "utf8" and "utf_8" */
3386 lower++;
3387 }
3388
3389 if (lower[0] == '8' && lower[1] == 0) {
3390 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3391 }
3392 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3393 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3394 }
3395 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3396 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3397 }
3398 }
3399 else {
3400 if (strcmp(lower, "ascii") == 0
3401 || strcmp(lower, "us_ascii") == 0) {
3402 return PyUnicode_DecodeASCII(s, size, errors);
3403 }
Steve Dowercc16be82016-09-08 10:35:16 -07003404 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003405 else if (strcmp(lower, "mbcs") == 0) {
3406 return PyUnicode_DecodeMBCS(s, size, errors);
3407 }
3408 #endif
3409 else if (strcmp(lower, "latin1") == 0
3410 || strcmp(lower, "latin_1") == 0
3411 || strcmp(lower, "iso_8859_1") == 0
3412 || strcmp(lower, "iso8859_1") == 0) {
3413 return PyUnicode_DecodeLatin1(s, size, errors);
3414 }
3415 }
Victor Stinner37296e82010-06-10 13:36:23 +00003416 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417
3418 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003419 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003420 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003421 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003422 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423 if (buffer == NULL)
3424 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003425 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426 if (unicode == NULL)
3427 goto onError;
3428 if (!PyUnicode_Check(unicode)) {
3429 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003430 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003431 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003432 encoding,
3433 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434 Py_DECREF(unicode);
3435 goto onError;
3436 }
3437 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003438 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003439
Benjamin Peterson29060642009-01-31 22:14:21 +00003440 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441 Py_XDECREF(buffer);
3442 return NULL;
3443}
3444
Alexander Belopolsky40018472011-02-26 01:02:56 +00003445PyObject *
3446PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003447 const char *encoding,
3448 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003449{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003450 if (!PyUnicode_Check(unicode)) {
3451 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003452 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003453 }
3454
Serhiy Storchaka00939072016-10-27 21:05:49 +03003455 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3456 "PyUnicode_AsDecodedObject() is deprecated; "
3457 "use PyCodec_Decode() to decode from str", 1) < 0)
3458 return NULL;
3459
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003460 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003461 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003462
3463 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003464 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003465}
3466
Alexander Belopolsky40018472011-02-26 01:02:56 +00003467PyObject *
3468PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003469 const char *encoding,
3470 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003471{
3472 PyObject *v;
3473
3474 if (!PyUnicode_Check(unicode)) {
3475 PyErr_BadArgument();
3476 goto onError;
3477 }
3478
Serhiy Storchaka00939072016-10-27 21:05:49 +03003479 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3480 "PyUnicode_AsDecodedUnicode() is deprecated; "
3481 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3482 return NULL;
3483
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003484 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003485 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003486
3487 /* Decode via the codec registry */
3488 v = PyCodec_Decode(unicode, encoding, errors);
3489 if (v == NULL)
3490 goto onError;
3491 if (!PyUnicode_Check(v)) {
3492 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003493 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003494 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003495 encoding,
3496 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003497 Py_DECREF(v);
3498 goto onError;
3499 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003500 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003501
Benjamin Peterson29060642009-01-31 22:14:21 +00003502 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003503 return NULL;
3504}
3505
Alexander Belopolsky40018472011-02-26 01:02:56 +00003506PyObject *
3507PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003508 Py_ssize_t size,
3509 const char *encoding,
3510 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511{
3512 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003513
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003514 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003517 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3518 Py_DECREF(unicode);
3519 return v;
3520}
3521
Alexander Belopolsky40018472011-02-26 01:02:56 +00003522PyObject *
3523PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003524 const char *encoding,
3525 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003526{
3527 PyObject *v;
3528
3529 if (!PyUnicode_Check(unicode)) {
3530 PyErr_BadArgument();
3531 goto onError;
3532 }
3533
Serhiy Storchaka00939072016-10-27 21:05:49 +03003534 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3535 "PyUnicode_AsEncodedObject() is deprecated; "
3536 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3537 "or PyCodec_Encode() for generic encoding", 1) < 0)
3538 return NULL;
3539
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003540 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003541 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003542
3543 /* Encode via the codec registry */
3544 v = PyCodec_Encode(unicode, encoding, errors);
3545 if (v == NULL)
3546 goto onError;
3547 return v;
3548
Benjamin Peterson29060642009-01-31 22:14:21 +00003549 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003550 return NULL;
3551}
3552
Victor Stinner1b579672011-12-17 05:47:23 +01003553
Victor Stinner2cba6b82018-01-10 22:46:15 +01003554static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003555unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003556 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003557{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003558 Py_ssize_t wlen;
3559 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3560 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003561 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003562 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003563
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003564 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003565 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003566 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003567 return NULL;
3568 }
3569
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003570 char *str;
3571 size_t error_pos;
3572 const char *reason;
3573 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003574 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003575 PyMem_Free(wstr);
3576
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003577 if (res != 0) {
3578 if (res == -2) {
3579 PyObject *exc;
3580 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3581 "locale", unicode,
3582 (Py_ssize_t)error_pos,
3583 (Py_ssize_t)(error_pos+1),
3584 reason);
3585 if (exc != NULL) {
3586 PyCodec_StrictErrors(exc);
3587 Py_DECREF(exc);
3588 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003589 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003590 else if (res == -3) {
3591 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3592 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003593 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003594 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003595 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003596 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003597 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003598
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003599 PyObject *bytes = PyBytes_FromString(str);
3600 PyMem_RawFree(str);
3601 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003602}
3603
Victor Stinnerad158722010-10-27 00:25:46 +00003604PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003605PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3606{
Victor Stinner709d23d2019-05-02 14:56:30 -04003607 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3608 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003609}
3610
3611PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003612PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003613{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003614 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003615 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003616 return unicode_encode_utf8(unicode,
3617 interp->fs_codec.error_handler,
3618 interp->fs_codec.errors);
3619 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003620#ifndef _Py_FORCE_UTF8_FS_ENCODING
3621 else if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003622 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003623 interp->fs_codec.encoding,
3624 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003625 }
Victor Stinnerad158722010-10-27 00:25:46 +00003626#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003627 else {
3628 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3629 machinery is not ready and so cannot be used:
3630 use wcstombs() in this case. */
3631 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3632 assert(filesystem_errors != NULL);
3633 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3634 assert(errors != _Py_ERROR_UNKNOWN);
3635#ifdef _Py_FORCE_UTF8_FS_ENCODING
3636 return unicode_encode_utf8(unicode, errors, NULL);
3637#else
3638 return unicode_encode_locale(unicode, errors, 0);
3639#endif
3640 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003641}
3642
Alexander Belopolsky40018472011-02-26 01:02:56 +00003643PyObject *
3644PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003645 const char *encoding,
3646 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647{
3648 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003649 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003650
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 if (!PyUnicode_Check(unicode)) {
3652 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 }
Fred Drakee4315f52000-05-09 19:53:39 +00003655
Victor Stinner22eb6892019-06-26 00:51:05 +02003656 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3657 return NULL;
3658 }
3659
Victor Stinner942889a2016-09-05 15:40:10 -07003660 if (encoding == NULL) {
3661 return _PyUnicode_AsUTF8String(unicode, errors);
3662 }
3663
Fred Drakee4315f52000-05-09 19:53:39 +00003664 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003665 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3666 char *lower = buflower;
3667
3668 /* Fast paths */
3669 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3670 lower += 3;
3671 if (*lower == '_') {
3672 /* Match "utf8" and "utf_8" */
3673 lower++;
3674 }
3675
3676 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003677 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003678 }
3679 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3680 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3681 }
3682 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3683 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3684 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003685 }
Victor Stinner942889a2016-09-05 15:40:10 -07003686 else {
3687 if (strcmp(lower, "ascii") == 0
3688 || strcmp(lower, "us_ascii") == 0) {
3689 return _PyUnicode_AsASCIIString(unicode, errors);
3690 }
Steve Dowercc16be82016-09-08 10:35:16 -07003691#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003692 else if (strcmp(lower, "mbcs") == 0) {
3693 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3694 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003695#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003696 else if (strcmp(lower, "latin1") == 0 ||
3697 strcmp(lower, "latin_1") == 0 ||
3698 strcmp(lower, "iso_8859_1") == 0 ||
3699 strcmp(lower, "iso8859_1") == 0) {
3700 return _PyUnicode_AsLatin1String(unicode, errors);
3701 }
3702 }
Victor Stinner37296e82010-06-10 13:36:23 +00003703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704
3705 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003706 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003708 return NULL;
3709
3710 /* The normal path */
3711 if (PyBytes_Check(v))
3712 return v;
3713
3714 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003715 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003716 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003717 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003718
3719 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003720 "encoder %s returned bytearray instead of bytes; "
3721 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003722 encoding);
3723 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003724 Py_DECREF(v);
3725 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003726 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003727
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003728 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3729 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003730 Py_DECREF(v);
3731 return b;
3732 }
3733
3734 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003735 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003736 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003737 encoding,
3738 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003739 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003740 return NULL;
3741}
3742
Alexander Belopolsky40018472011-02-26 01:02:56 +00003743PyObject *
3744PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003745 const char *encoding,
3746 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003747{
3748 PyObject *v;
3749
3750 if (!PyUnicode_Check(unicode)) {
3751 PyErr_BadArgument();
3752 goto onError;
3753 }
3754
Serhiy Storchaka00939072016-10-27 21:05:49 +03003755 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3756 "PyUnicode_AsEncodedUnicode() is deprecated; "
3757 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3758 return NULL;
3759
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003760 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003761 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003762
3763 /* Encode via the codec registry */
3764 v = PyCodec_Encode(unicode, encoding, errors);
3765 if (v == NULL)
3766 goto onError;
3767 if (!PyUnicode_Check(v)) {
3768 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003769 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003770 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003771 encoding,
3772 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003773 Py_DECREF(v);
3774 goto onError;
3775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003777
Benjamin Peterson29060642009-01-31 22:14:21 +00003778 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 return NULL;
3780}
3781
Victor Stinner2cba6b82018-01-10 22:46:15 +01003782static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003783unicode_decode_locale(const char *str, Py_ssize_t len,
3784 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003785{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003786 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3787 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003788 return NULL;
3789 }
3790
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003791 wchar_t *wstr;
3792 size_t wlen;
3793 const char *reason;
3794 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003795 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003796 if (res != 0) {
3797 if (res == -2) {
3798 PyObject *exc;
3799 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3800 "locale", str, len,
3801 (Py_ssize_t)wlen,
3802 (Py_ssize_t)(wlen + 1),
3803 reason);
3804 if (exc != NULL) {
3805 PyCodec_StrictErrors(exc);
3806 Py_DECREF(exc);
3807 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003808 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003809 else if (res == -3) {
3810 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3811 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003812 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003813 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003814 }
Victor Stinner2f197072011-12-17 07:08:30 +01003815 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003816 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003817
3818 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3819 PyMem_RawFree(wstr);
3820 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003821}
3822
3823PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003824PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3825 const char *errors)
3826{
Victor Stinner709d23d2019-05-02 14:56:30 -04003827 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3828 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003829}
3830
3831PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003832PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003833{
3834 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003835 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3836 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003837}
3838
3839
3840PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003841PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003842 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003843 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3844}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003845
Christian Heimes5894ba72007-11-04 11:43:14 +00003846PyObject*
3847PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3848{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003849 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003850 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003851 return unicode_decode_utf8(s, size,
3852 interp->fs_codec.error_handler,
3853 interp->fs_codec.errors,
3854 NULL);
3855 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003856#ifndef _Py_FORCE_UTF8_FS_ENCODING
3857 else if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003858 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003859 interp->fs_codec.encoding,
3860 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003861 }
Victor Stinnerad158722010-10-27 00:25:46 +00003862#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003863 else {
3864 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3865 machinery is not ready and so cannot be used:
3866 use mbstowcs() in this case. */
3867 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3868 assert(filesystem_errors != NULL);
3869 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3870 assert(errors != _Py_ERROR_UNKNOWN);
3871#ifdef _Py_FORCE_UTF8_FS_ENCODING
3872 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3873#else
3874 return unicode_decode_locale(s, size, errors, 0);
3875#endif
3876 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003877}
3878
Martin v. Löwis011e8422009-05-05 04:43:17 +00003879
3880int
3881PyUnicode_FSConverter(PyObject* arg, void* addr)
3882{
Brett Cannonec6ce872016-09-06 15:50:29 -07003883 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003884 PyObject *output = NULL;
3885 Py_ssize_t size;
3886 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003887 if (arg == NULL) {
3888 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003889 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003890 return 1;
3891 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003892 path = PyOS_FSPath(arg);
3893 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003894 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003895 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003896 if (PyBytes_Check(path)) {
3897 output = path;
3898 }
3899 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3900 output = PyUnicode_EncodeFSDefault(path);
3901 Py_DECREF(path);
3902 if (!output) {
3903 return 0;
3904 }
3905 assert(PyBytes_Check(output));
3906 }
3907
Victor Stinner0ea2a462010-04-30 00:22:08 +00003908 size = PyBytes_GET_SIZE(output);
3909 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003910 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003911 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003912 Py_DECREF(output);
3913 return 0;
3914 }
3915 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003916 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003917}
3918
3919
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003920int
3921PyUnicode_FSDecoder(PyObject* arg, void* addr)
3922{
Brett Cannona5711202016-09-06 19:36:01 -07003923 int is_buffer = 0;
3924 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003925 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003926 if (arg == NULL) {
3927 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003928 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003929 return 1;
3930 }
Brett Cannona5711202016-09-06 19:36:01 -07003931
3932 is_buffer = PyObject_CheckBuffer(arg);
3933 if (!is_buffer) {
3934 path = PyOS_FSPath(arg);
3935 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003936 return 0;
3937 }
Brett Cannona5711202016-09-06 19:36:01 -07003938 }
3939 else {
3940 path = arg;
3941 Py_INCREF(arg);
3942 }
3943
3944 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003945 output = path;
3946 }
3947 else if (PyBytes_Check(path) || is_buffer) {
3948 PyObject *path_bytes = NULL;
3949
3950 if (!PyBytes_Check(path) &&
3951 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003952 "path should be string, bytes, or os.PathLike, not %.200s",
3953 Py_TYPE(arg)->tp_name)) {
3954 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003955 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003956 }
3957 path_bytes = PyBytes_FromObject(path);
3958 Py_DECREF(path);
3959 if (!path_bytes) {
3960 return 0;
3961 }
3962 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3963 PyBytes_GET_SIZE(path_bytes));
3964 Py_DECREF(path_bytes);
3965 if (!output) {
3966 return 0;
3967 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003968 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003969 else {
3970 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003971 "path should be string, bytes, or os.PathLike, not %.200s",
3972 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003973 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003974 return 0;
3975 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003976 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003977 Py_DECREF(output);
3978 return 0;
3979 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003980 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003981 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003982 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003983 Py_DECREF(output);
3984 return 0;
3985 }
3986 *(PyObject**)addr = output;
3987 return Py_CLEANUP_SUPPORTED;
3988}
3989
3990
Inada Naoki02a4d572020-02-27 13:48:59 +09003991static int unicode_fill_utf8(PyObject *unicode);
3992
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003993const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003994PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003995{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003996 if (!PyUnicode_Check(unicode)) {
3997 PyErr_BadArgument();
3998 return NULL;
3999 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004000 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004001 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004002
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004003 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004004 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005 return NULL;
4006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 }
4008
4009 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004010 *psize = PyUnicode_UTF8_LENGTH(unicode);
4011 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004012}
4013
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004014const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004016{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4018}
4019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004020Py_UNICODE *
4021PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4022{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023 if (!PyUnicode_Check(unicode)) {
4024 PyErr_BadArgument();
4025 return NULL;
4026 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004027 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4028 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004029 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004030 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004031 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032
Serhiy Storchakac46db922018-10-23 22:58:24 +03004033 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4034 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4035 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004038 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4039 if (w == NULL) {
4040 PyErr_NoMemory();
4041 return NULL;
4042 }
4043 unicode_copy_as_widechar(unicode, w, wlen + 1);
4044 _PyUnicode_WSTR(unicode) = w;
4045 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4046 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047 }
4048 }
4049 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004050 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004051 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004052}
4053
Alexander Belopolsky40018472011-02-26 01:02:56 +00004054Py_UNICODE *
4055PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058}
4059
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004060const Py_UNICODE *
4061_PyUnicode_AsUnicode(PyObject *unicode)
4062{
4063 Py_ssize_t size;
4064 const Py_UNICODE *wstr;
4065
4066 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4067 if (wstr && wcslen(wstr) != (size_t)size) {
4068 PyErr_SetString(PyExc_ValueError, "embedded null character");
4069 return NULL;
4070 }
4071 return wstr;
4072}
4073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004074
Alexander Belopolsky40018472011-02-26 01:02:56 +00004075Py_ssize_t
4076PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077{
4078 if (!PyUnicode_Check(unicode)) {
4079 PyErr_BadArgument();
4080 goto onError;
4081 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004082 if (_PyUnicode_WSTR(unicode) == NULL) {
4083 if (PyUnicode_AsUnicode(unicode) == NULL)
4084 goto onError;
4085 }
4086 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 return -1;
4090}
4091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092Py_ssize_t
4093PyUnicode_GetLength(PyObject *unicode)
4094{
Victor Stinner07621332012-06-16 04:53:46 +02004095 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004096 PyErr_BadArgument();
4097 return -1;
4098 }
Victor Stinner07621332012-06-16 04:53:46 +02004099 if (PyUnicode_READY(unicode) == -1)
4100 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004101 return PyUnicode_GET_LENGTH(unicode);
4102}
4103
4104Py_UCS4
4105PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4106{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004107 void *data;
4108 int kind;
4109
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004110 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004111 PyErr_BadArgument();
4112 return (Py_UCS4)-1;
4113 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004114 if (PyUnicode_READY(unicode) == -1) {
4115 return (Py_UCS4)-1;
4116 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004117 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004118 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004119 return (Py_UCS4)-1;
4120 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004121 data = PyUnicode_DATA(unicode);
4122 kind = PyUnicode_KIND(unicode);
4123 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004124}
4125
4126int
4127PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4128{
4129 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004130 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131 return -1;
4132 }
Victor Stinner488fa492011-12-12 00:01:39 +01004133 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004134 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004135 PyErr_SetString(PyExc_IndexError, "string index out of range");
4136 return -1;
4137 }
Victor Stinner488fa492011-12-12 00:01:39 +01004138 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004139 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004140 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4141 PyErr_SetString(PyExc_ValueError, "character out of range");
4142 return -1;
4143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4145 index, ch);
4146 return 0;
4147}
4148
/* Return the name of the default encoding: always the string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4154
Victor Stinner554f3f02010-06-16 23:33:54 +00004155/* create or adjust a UnicodeDecodeError */
4156static void
4157make_decode_exception(PyObject **exceptionObject,
4158 const char *encoding,
4159 const char *input, Py_ssize_t length,
4160 Py_ssize_t startpos, Py_ssize_t endpos,
4161 const char *reason)
4162{
4163 if (*exceptionObject == NULL) {
4164 *exceptionObject = PyUnicodeDecodeError_Create(
4165 encoding, input, length, startpos, endpos, reason);
4166 }
4167 else {
4168 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4169 goto onError;
4170 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4171 goto onError;
4172 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4173 goto onError;
4174 }
4175 return;
4176
4177onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004178 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004179}
4180
Steve Dowercc16be82016-09-08 10:35:16 -07004181#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004182static int
4183widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4184{
4185 if (newsize > *size) {
4186 wchar_t *newbuf = *buf;
4187 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4188 PyErr_NoMemory();
4189 return -1;
4190 }
4191 *buf = newbuf;
4192 }
4193 *size = newsize;
4194 return 0;
4195}
4196
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197/* error handling callback helper:
4198 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004199 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 and adjust various state variables.
4201 return 0 on success, -1 on error
4202*/
4203
Alexander Belopolsky40018472011-02-26 01:02:56 +00004204static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004205unicode_decode_call_errorhandler_wchar(
4206 const char *errors, PyObject **errorHandler,
4207 const char *encoding, const char *reason,
4208 const char **input, const char **inend, Py_ssize_t *startinpos,
4209 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004210 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004211{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004212 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213
4214 PyObject *restuple = NULL;
4215 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004216 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004217 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004218 Py_ssize_t requiredsize;
4219 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004220 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004221 wchar_t *repwstr;
4222 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223
4224 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 *errorHandler = PyCodec_LookupError(errors);
4226 if (*errorHandler == NULL)
4227 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 }
4229
Victor Stinner554f3f02010-06-16 23:33:54 +00004230 make_decode_exception(exceptionObject,
4231 encoding,
4232 *input, *inend - *input,
4233 *startinpos, *endinpos,
4234 reason);
4235 if (*exceptionObject == NULL)
4236 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004237
Petr Viktorinffd97532020-02-11 17:46:57 +01004238 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004242 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004245 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004247
4248 /* Copy back the bytes variables, which might have been modified by the
4249 callback */
4250 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4251 if (!inputobj)
4252 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004253 *input = PyBytes_AS_STRING(inputobj);
4254 insize = PyBytes_GET_SIZE(inputobj);
4255 *inend = *input + insize;
4256 /* we can DECREF safely, as the exception has another reference,
4257 so the object won't go away. */
4258 Py_DECREF(inputobj);
4259
4260 if (newpos<0)
4261 newpos = insize+newpos;
4262 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004263 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004264 goto onError;
4265 }
4266
4267 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4268 if (repwstr == NULL)
4269 goto onError;
4270 /* need more space? (at least enough for what we
4271 have+the replacement+the rest of the string (starting
4272 at the new input position), so we won't have to check space
4273 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004274 requiredsize = *outpos;
4275 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4276 goto overflow;
4277 requiredsize += repwlen;
4278 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4279 goto overflow;
4280 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004281 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004282 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004283 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004284 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004285 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004286 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004287 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004288 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004289 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004290 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004291 *endinpos = newpos;
4292 *inptr = *input + newpos;
4293
4294 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004295 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004296 return 0;
4297
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004298 overflow:
4299 PyErr_SetString(PyExc_OverflowError,
4300 "decoded result is too long for a Python string");
4301
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004302 onError:
4303 Py_XDECREF(restuple);
4304 return -1;
4305}
Steve Dowercc16be82016-09-08 10:35:16 -07004306#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004307
4308static int
4309unicode_decode_call_errorhandler_writer(
4310 const char *errors, PyObject **errorHandler,
4311 const char *encoding, const char *reason,
4312 const char **input, const char **inend, Py_ssize_t *startinpos,
4313 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4314 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4315{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004316 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004317
4318 PyObject *restuple = NULL;
4319 PyObject *repunicode = NULL;
4320 Py_ssize_t insize;
4321 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004322 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004323 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004324 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004325 int need_to_grow = 0;
4326 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004327
4328 if (*errorHandler == NULL) {
4329 *errorHandler = PyCodec_LookupError(errors);
4330 if (*errorHandler == NULL)
4331 goto onError;
4332 }
4333
4334 make_decode_exception(exceptionObject,
4335 encoding,
4336 *input, *inend - *input,
4337 *startinpos, *endinpos,
4338 reason);
4339 if (*exceptionObject == NULL)
4340 goto onError;
4341
Petr Viktorinffd97532020-02-11 17:46:57 +01004342 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004343 if (restuple == NULL)
4344 goto onError;
4345 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004346 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004347 goto onError;
4348 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004349 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004350 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004351
4352 /* Copy back the bytes variables, which might have been modified by the
4353 callback */
4354 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4355 if (!inputobj)
4356 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004357 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004358 *input = PyBytes_AS_STRING(inputobj);
4359 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004360 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004361 /* we can DECREF safely, as the exception has another reference,
4362 so the object won't go away. */
4363 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004364
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004365 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004366 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004367 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004368 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004370 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371
Victor Stinner170ca6f2013-04-18 00:25:28 +02004372 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004373 if (replen > 1) {
4374 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004375 need_to_grow = 1;
4376 }
4377 new_inptr = *input + newpos;
4378 if (*inend - new_inptr > remain) {
4379 /* We don't know the decoding algorithm here so we make the worst
4380 assumption that one byte decodes to one unicode character.
4381 If unfortunately one byte could decode to more unicode characters,
4382 the decoder may write out-of-bound then. Is it possible for the
4383 algorithms using this function? */
4384 writer->min_length += *inend - new_inptr - remain;
4385 need_to_grow = 1;
4386 }
4387 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004388 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004389 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004390 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4391 goto onError;
4392 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004393 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004394 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004395
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004397 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004398
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004400 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004401 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402
Benjamin Peterson29060642009-01-31 22:14:21 +00004403 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004404 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004405 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004406}
4407
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004408/* --- UTF-7 Codec -------------------------------------------------------- */
4409
Antoine Pitrou244651a2009-05-04 18:56:13 +00004410/* See RFC2152 for details. We encode conservatively and decode liberally. */
4411
4412/* Three simple macros defining base-64. */
4413
/* True if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  Note that c is evaluated several times. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     '+' == (c) || '/' == (c))
4421
/* Map a base-64 character to its 6-bit value (0..63).  The result is only
   meaningful when IS_BASE64(c) holds: any character that is not a letter,
   digit or '+' is mapped to 63.  Note that c is evaluated several times. */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     '+' == (c) ? 62 : 63)
4429
/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)                        \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"          \
      "abcdefghijklmnopqrstuvwxyz"          \
      "0123456789+/")[(n) & 0x3f])
4434
/* DECODE_DIRECT: true if a byte met outside a base-64 section of a UTF-7
 * string decodes as itself.  Decoding is permissive: every ASCII byte
 * (0..127) qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    (!((c) > 127 || (c) == '+'))
4442
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This read-only table, indexed by ASCII code
 * (0..127), gives the set of each character:
 *     0 : "Set D"
 *         alphanumeric and '(),-./:?
 *     1 : "Set O"
 *         !"#$%&*;<=>@[]^_`{|}
 *     2 : "whitespace"
 *         ht nl cr sp
 *     3 : special (must be base64 encoded)
 *         everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4476
/* ENCODE_DIRECT: true if character c should be encoded as itself.
 * Set D characters always are.  Set O characters are only when directO is
 * true, and whitespace only when directWS is true.  RFC2152 makes it clear
 * that these choices vary between applications, so both are left to the
 * caller.  NUL and anything >= 128 is never encoded directly.
 *
 * c is evaluated several times.  The flag arguments are parenthesized so
 * that compound expressions can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004488
Alexander Belopolsky40018472011-02-26 01:02:56 +00004489PyObject *
4490PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004491 Py_ssize_t size,
4492 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004494 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4495}
4496
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497/* The decoder. The only state we preserve is our read position,
4498 * i.e. how many characters we have consumed. So if we end in the
4499 * middle of a shift sequence we have to back off the read position
4500 * and the output to the beginning of the sequence, otherwise we lose
4501 * all the shift state (seen bits, number of bits seen, high
4502 * surrogate). */
4503
Alexander Belopolsky40018472011-02-26 01:02:56 +00004504PyObject *
4505PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004506 Py_ssize_t size,
4507 const char *errors,
4508 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004509{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004511 Py_ssize_t startinpos;
4512 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004514 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004515 const char *errmsg = "";
4516 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004517 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004518 unsigned int base64bits = 0;
4519 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004520 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521 PyObject *errorHandler = NULL;
4522 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004523
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004524 if (size == 0) {
4525 if (consumed)
4526 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004527 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004528 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004529
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004530 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004531 _PyUnicodeWriter_Init(&writer);
4532 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004533
4534 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 e = s + size;
4536
4537 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004538 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004540 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 if (inShift) { /* in a base-64 section */
4543 if (IS_BASE64(ch)) { /* consume a base-64 character */
4544 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4545 base64bits += 6;
4546 s++;
4547 if (base64bits >= 16) {
4548 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004549 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004550 base64bits -= 16;
4551 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004552 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 if (surrogate) {
4554 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004555 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4556 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004557 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004558 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004560 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 }
4562 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004563 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004564 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 }
4567 }
Victor Stinner551ac952011-11-29 22:58:13 +01004568 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 /* first surrogate */
4570 surrogate = outCh;
4571 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004573 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004574 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004575 }
4576 }
4577 }
4578 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 if (base64bits > 0) { /* left-over bits */
4581 if (base64bits >= 6) {
4582 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004583 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 errmsg = "partial character in shift sequence";
4585 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004586 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004587 else {
4588 /* Some bits remain; they should be zero */
4589 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004590 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004591 errmsg = "non-zero padding bits in shift sequence";
4592 goto utf7Error;
4593 }
4594 }
4595 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004596 if (surrogate && DECODE_DIRECT(ch)) {
4597 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4598 goto onError;
4599 }
4600 surrogate = 0;
4601 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004602 /* '-' is absorbed; other terminating
4603 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004604 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004605 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004606 }
4607 }
4608 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 s++; /* consume '+' */
4611 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004612 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004613 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004614 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004616 else if (s < e && !IS_BASE64(*s)) {
4617 s++;
4618 errmsg = "ill-formed sequence";
4619 goto utf7Error;
4620 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004621 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004622 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004623 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004624 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004626 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004627 }
4628 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004629 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004630 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004631 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004632 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004633 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 else {
4635 startinpos = s-starts;
4636 s++;
4637 errmsg = "unexpected special character";
4638 goto utf7Error;
4639 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004640 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004641utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004642 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004643 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 errors, &errorHandler,
4645 "utf7", errmsg,
4646 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004647 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649 }
4650
Antoine Pitrou244651a2009-05-04 18:56:13 +00004651 /* end of string */
4652
4653 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4654 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004655 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004656 if (surrogate ||
4657 (base64bits >= 6) ||
4658 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004659 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004660 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661 errors, &errorHandler,
4662 "utf7", "unterminated shift sequence",
4663 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004664 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004665 goto onError;
4666 if (s < e)
4667 goto restart;
4668 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004669 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004670
4671 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004672 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004674 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004675 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004676 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004677 writer.kind, writer.data, shiftOutStart);
4678 Py_XDECREF(errorHandler);
4679 Py_XDECREF(exc);
4680 _PyUnicodeWriter_Dealloc(&writer);
4681 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004682 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004683 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004684 }
4685 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004686 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004688 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004690 Py_XDECREF(errorHandler);
4691 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004692 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004693
Benjamin Peterson29060642009-01-31 22:14:21 +00004694 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004695 Py_XDECREF(errorHandler);
4696 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004697 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004698 return NULL;
4699}
4700
4701
Alexander Belopolsky40018472011-02-26 01:02:56 +00004702PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004703_PyUnicode_EncodeUTF7(PyObject *str,
4704 int base64SetO,
4705 int base64WhiteSpace,
4706 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004707{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004708 int kind;
4709 void *data;
4710 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004711 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004712 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004713 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004714 unsigned int base64bits = 0;
4715 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004716 char * out;
4717 char * start;
4718
Benjamin Petersonbac79492012-01-14 13:34:47 -05004719 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004720 return NULL;
4721 kind = PyUnicode_KIND(str);
4722 data = PyUnicode_DATA(str);
4723 len = PyUnicode_GET_LENGTH(str);
4724
4725 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004726 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004727
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004728 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004729 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004730 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004731 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004732 if (v == NULL)
4733 return NULL;
4734
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004735 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004736 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004737 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004738
Antoine Pitrou244651a2009-05-04 18:56:13 +00004739 if (inShift) {
4740 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4741 /* shifting out */
4742 if (base64bits) { /* output remaining bits */
4743 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4744 base64buffer = 0;
4745 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004746 }
4747 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004748 /* Characters not in the BASE64 set implicitly unshift the sequence
4749 so no '-' is required, except if the character is itself a '-' */
4750 if (IS_BASE64(ch) || ch == '-') {
4751 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004752 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004753 *out++ = (char) ch;
4754 }
4755 else {
4756 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004757 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004758 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004759 else { /* not in a shift sequence */
4760 if (ch == '+') {
4761 *out++ = '+';
4762 *out++ = '-';
4763 }
4764 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4765 *out++ = (char) ch;
4766 }
4767 else {
4768 *out++ = '+';
4769 inShift = 1;
4770 goto encode_char;
4771 }
4772 }
4773 continue;
4774encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004775 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004776 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004777
Antoine Pitrou244651a2009-05-04 18:56:13 +00004778 /* code first surrogate */
4779 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004780 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 while (base64bits >= 6) {
4782 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4783 base64bits -= 6;
4784 }
4785 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004786 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004787 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004788 base64bits += 16;
4789 base64buffer = (base64buffer << 16) | ch;
4790 while (base64bits >= 6) {
4791 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4792 base64bits -= 6;
4793 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004794 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004795 if (base64bits)
4796 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4797 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004798 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004799 if (_PyBytes_Resize(&v, out - start) < 0)
4800 return NULL;
4801 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004802}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004803PyObject *
4804PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4805 Py_ssize_t size,
4806 int base64SetO,
4807 int base64WhiteSpace,
4808 const char *errors)
4809{
4810 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004811 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004812 if (tmp == NULL)
4813 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004814 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004815 base64WhiteSpace, errors);
4816 Py_DECREF(tmp);
4817 return result;
4818}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004819
/* The UTF-7 codec ends here: undefine its helper macros so that they are
   not visible to the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004825
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826/* --- UTF-8 Codec -------------------------------------------------------- */
4827
Alexander Belopolsky40018472011-02-26 01:02:56 +00004828PyObject *
4829PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004830 Py_ssize_t size,
4831 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832{
Walter Dörwald69652032004-09-07 20:24:22 +00004833 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4834}
4835
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004836#include "stringlib/asciilib.h"
4837#include "stringlib/codecs.h"
4838#include "stringlib/undef.h"
4839
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004840#include "stringlib/ucs1lib.h"
4841#include "stringlib/codecs.h"
4842#include "stringlib/undef.h"
4843
4844#include "stringlib/ucs2lib.h"
4845#include "stringlib/codecs.h"
4846#include "stringlib/undef.h"
4847
4848#include "stringlib/ucs4lib.h"
4849#include "stringlib/codecs.h"
4850#include "stringlib/undef.h"
4851
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.

   The mask has the top bit (0x80) of every byte set, so
   (value & ASCII_CHAR_MASK) is non-zero exactly when at least one byte of
   'value' is >= 0x80.  Only 4- and 8-byte 'long' are supported; any other
   size is a compile-time error. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4861
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004862static Py_ssize_t
4863ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004864{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004865 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004866 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004867
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004868 /*
4869 * Issue #17237: m68k is a bit different from most architectures in
4870 * that objects do not use "natural alignment" - for example, int and
4871 * long are only aligned at 2-byte boundaries. Therefore the assert()
4872 * won't work; also, tests have shown that skipping the "optimised
4873 * version" will even speed up m68k.
4874 */
4875#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004876#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004877 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4878 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004879 /* Fast path, see in STRINGLIB(utf8_decode) for
4880 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004881 /* Help allocation */
4882 const char *_p = p;
4883 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884 while (_p < aligned_end) {
4885 unsigned long value = *(const unsigned long *) _p;
4886 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004887 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 *((unsigned long *)q) = value;
4889 _p += SIZEOF_LONG;
4890 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004891 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004892 p = _p;
4893 while (p < end) {
4894 if ((unsigned char)*p & 0x80)
4895 break;
4896 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004898 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004900#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004901#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902 while (p < end) {
4903 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4904 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004905 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004906 /* Help allocation */
4907 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004908 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004909 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004910 if (value & ASCII_CHAR_MASK)
4911 break;
4912 _p += SIZEOF_LONG;
4913 }
4914 p = _p;
4915 if (_p == end)
4916 break;
4917 }
4918 if ((unsigned char)*p & 0x80)
4919 break;
4920 ++p;
4921 }
4922 memcpy(dest, start, p - start);
4923 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924}
Antoine Pitrouab868312009-01-10 15:40:25 +00004925
Victor Stinner709d23d2019-05-02 14:56:30 -04004926static PyObject *
4927unicode_decode_utf8(const char *s, Py_ssize_t size,
4928 _Py_error_handler error_handler, const char *errors,
4929 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004930{
Victor Stinner785938e2011-12-11 20:09:03 +01004931 if (size == 0) {
4932 if (consumed)
4933 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004934 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004935 }
4936
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004937 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4938 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004939 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940 *consumed = 1;
4941 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004942 }
4943
Inada Naoki770847a2019-06-24 12:30:24 +09004944 const char *starts = s;
4945 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004946
Inada Naoki770847a2019-06-24 12:30:24 +09004947 // fast path: try ASCII string.
4948 PyObject *u = PyUnicode_New(size, 127);
4949 if (u == NULL) {
4950 return NULL;
4951 }
4952 s += ascii_decode(s, end, PyUnicode_DATA(u));
4953 if (s == end) {
4954 return u;
4955 }
4956
4957 // Use _PyUnicodeWriter after fast path is failed.
4958 _PyUnicodeWriter writer;
4959 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4960 writer.pos = s - starts;
4961
4962 Py_ssize_t startinpos, endinpos;
4963 const char *errmsg = "";
4964 PyObject *error_handler_obj = NULL;
4965 PyObject *exc = NULL;
4966
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004967 while (s < end) {
4968 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004969 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004970
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004971 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004972 if (PyUnicode_IS_ASCII(writer.buffer))
4973 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004974 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004975 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004976 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004977 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004978 } else {
4979 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004980 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004981 }
4982
4983 switch (ch) {
4984 case 0:
4985 if (s == end || consumed)
4986 goto End;
4987 errmsg = "unexpected end of data";
4988 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004989 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004990 break;
4991 case 1:
4992 errmsg = "invalid start byte";
4993 startinpos = s - starts;
4994 endinpos = startinpos + 1;
4995 break;
4996 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03004997 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
4998 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
4999 {
5000 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005001 goto End;
5002 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005003 /* fall through */
5004 case 3:
5005 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005006 errmsg = "invalid continuation byte";
5007 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005008 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 break;
5010 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005011 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 goto onError;
5013 continue;
5014 }
5015
Victor Stinner1d65d912015-10-05 13:43:50 +02005016 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005017 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005018
5019 switch (error_handler) {
5020 case _Py_ERROR_IGNORE:
5021 s += (endinpos - startinpos);
5022 break;
5023
5024 case _Py_ERROR_REPLACE:
5025 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5026 goto onError;
5027 s += (endinpos - startinpos);
5028 break;
5029
5030 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005031 {
5032 Py_ssize_t i;
5033
Victor Stinner1d65d912015-10-05 13:43:50 +02005034 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5035 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005036 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005037 ch = (Py_UCS4)(unsigned char)(starts[i]);
5038 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5039 ch + 0xdc00);
5040 writer.pos++;
5041 }
5042 s += (endinpos - startinpos);
5043 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005044 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005045
5046 default:
5047 if (unicode_decode_call_errorhandler_writer(
5048 errors, &error_handler_obj,
5049 "utf-8", errmsg,
5050 &starts, &end, &startinpos, &endinpos, &exc, &s,
5051 &writer))
5052 goto onError;
5053 }
Victor Stinner785938e2011-12-11 20:09:03 +01005054 }
5055
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005056End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005057 if (consumed)
5058 *consumed = s - starts;
5059
Victor Stinner1d65d912015-10-05 13:43:50 +02005060 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005061 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005062 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005063
5064onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005065 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005066 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005067 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005068 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005069}
5070
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005071
Victor Stinner709d23d2019-05-02 14:56:30 -04005072PyObject *
5073PyUnicode_DecodeUTF8Stateful(const char *s,
5074 Py_ssize_t size,
5075 const char *errors,
5076 Py_ssize_t *consumed)
5077{
5078 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5079}
5080
5081
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005082/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5083 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005084
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005085 On success, write a pointer to a newly allocated wide character string into
5086 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5087 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005088
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005089 On memory allocation failure, return -1.
5090
5091 On decoding error (if surrogateescape is zero), return -2. If wlen is
5092 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5093 is not NULL, write the decoding error message into *reason. */
5094int
5095_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005096 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005097{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005098 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005099 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005100 wchar_t *unicode;
5101 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005102
Victor Stinner3d4226a2018-08-29 22:21:32 +02005103 int surrogateescape = 0;
5104 int surrogatepass = 0;
5105 switch (errors)
5106 {
5107 case _Py_ERROR_STRICT:
5108 break;
5109 case _Py_ERROR_SURROGATEESCAPE:
5110 surrogateescape = 1;
5111 break;
5112 case _Py_ERROR_SURROGATEPASS:
5113 surrogatepass = 1;
5114 break;
5115 default:
5116 return -3;
5117 }
5118
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005119 /* Note: size will always be longer than the resulting Unicode
5120 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005121 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005122 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005123 }
5124
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005125 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005126 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005127 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005128 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005129
5130 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005131 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005132 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005133 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005134 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005135#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005136 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005137#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005138 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005139#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005140 if (ch > 0xFF) {
5141#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005142 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005143#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005144 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005145 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005146 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5147 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5148#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005149 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005150 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005151 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005152 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005153 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005154
5155 if (surrogateescape) {
5156 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5157 }
5158 else {
5159 /* Is it a valid three-byte code? */
5160 if (surrogatepass
5161 && (e - s) >= 3
5162 && (s[0] & 0xf0) == 0xe0
5163 && (s[1] & 0xc0) == 0x80
5164 && (s[2] & 0xc0) == 0x80)
5165 {
5166 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5167 s += 3;
5168 unicode[outpos++] = ch;
5169 }
5170 else {
5171 PyMem_RawFree(unicode );
5172 if (reason != NULL) {
5173 switch (ch) {
5174 case 0:
5175 *reason = "unexpected end of data";
5176 break;
5177 case 1:
5178 *reason = "invalid start byte";
5179 break;
5180 /* 2, 3, 4 */
5181 default:
5182 *reason = "invalid continuation byte";
5183 break;
5184 }
5185 }
5186 if (wlen != NULL) {
5187 *wlen = s - orig_s;
5188 }
5189 return -2;
5190 }
5191 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005192 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005193 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005194 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005195 if (wlen) {
5196 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005197 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005198 *wstr = unicode;
5199 return 0;
5200}
5201
Victor Stinner5f9cf232019-03-19 01:46:25 +01005202
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005203wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005204_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5205 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005206{
5207 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005208 int res = _Py_DecodeUTF8Ex(arg, arglen,
5209 &wstr, wlen,
5210 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005211 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005212 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5213 assert(res != -3);
5214 if (wlen) {
5215 *wlen = (size_t)res;
5216 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005217 return NULL;
5218 }
5219 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005220}
5221
Antoine Pitrouab868312009-01-10 15:40:25 +00005222
Victor Stinnere47e6982017-12-21 15:45:16 +01005223/* UTF-8 encoder using the surrogateescape error handler .
5224
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005225 On success, return 0 and write the newly allocated character string (use
5226 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005227
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005228 On encoding failure, return -2 and write the position of the invalid
5229 surrogate character into *error_pos (if error_pos is set) and the decoding
5230 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005231
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005232 On memory allocation failure, return -1. */
5233int
5234_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005235 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005236{
5237 const Py_ssize_t max_char_size = 4;
5238 Py_ssize_t len = wcslen(text);
5239
5240 assert(len >= 0);
5241
Victor Stinner3d4226a2018-08-29 22:21:32 +02005242 int surrogateescape = 0;
5243 int surrogatepass = 0;
5244 switch (errors)
5245 {
5246 case _Py_ERROR_STRICT:
5247 break;
5248 case _Py_ERROR_SURROGATEESCAPE:
5249 surrogateescape = 1;
5250 break;
5251 case _Py_ERROR_SURROGATEPASS:
5252 surrogatepass = 1;
5253 break;
5254 default:
5255 return -3;
5256 }
5257
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005258 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5259 return -1;
5260 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005261 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005262 if (raw_malloc) {
5263 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005264 }
5265 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005266 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005267 }
5268 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005269 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005270 }
5271
5272 char *p = bytes;
5273 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005274 for (i = 0; i < len; ) {
5275 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005276 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005277 i++;
5278#if Py_UNICODE_SIZE == 2
5279 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5280 && i < len
5281 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5282 {
5283 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5284 i++;
5285 }
5286#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005287
5288 if (ch < 0x80) {
5289 /* Encode ASCII */
5290 *p++ = (char) ch;
5291
5292 }
5293 else if (ch < 0x0800) {
5294 /* Encode Latin-1 */
5295 *p++ = (char)(0xc0 | (ch >> 6));
5296 *p++ = (char)(0x80 | (ch & 0x3f));
5297 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005298 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005299 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005300 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005301 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005302 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005303 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005304 if (reason != NULL) {
5305 *reason = "encoding error";
5306 }
5307 if (raw_malloc) {
5308 PyMem_RawFree(bytes);
5309 }
5310 else {
5311 PyMem_Free(bytes);
5312 }
5313 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005314 }
5315 *p++ = (char)(ch & 0xff);
5316 }
5317 else if (ch < 0x10000) {
5318 *p++ = (char)(0xe0 | (ch >> 12));
5319 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5320 *p++ = (char)(0x80 | (ch & 0x3f));
5321 }
5322 else { /* ch >= 0x10000 */
5323 assert(ch <= MAX_UNICODE);
5324 /* Encode UCS4 Unicode ordinals */
5325 *p++ = (char)(0xf0 | (ch >> 18));
5326 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5327 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5328 *p++ = (char)(0x80 | (ch & 0x3f));
5329 }
5330 }
5331 *p++ = '\0';
5332
5333 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005334 char *bytes2;
5335 if (raw_malloc) {
5336 bytes2 = PyMem_RawRealloc(bytes, final_size);
5337 }
5338 else {
5339 bytes2 = PyMem_Realloc(bytes, final_size);
5340 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005341 if (bytes2 == NULL) {
5342 if (error_pos != NULL) {
5343 *error_pos = (size_t)-1;
5344 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005345 if (raw_malloc) {
5346 PyMem_RawFree(bytes);
5347 }
5348 else {
5349 PyMem_Free(bytes);
5350 }
5351 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005352 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005353 *str = bytes2;
5354 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005355}
5356
5357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005358/* Primary internal function which creates utf8 encoded bytes objects.
5359
5360 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005361 and allocate exactly as much space needed at the end. Else allocate the
5362 maximum possible needed (4 result bytes per Unicode character), and return
5363 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005364*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005365static PyObject *
5366unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5367 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005369 if (!PyUnicode_Check(unicode)) {
5370 PyErr_BadArgument();
5371 return NULL;
5372 }
5373
5374 if (PyUnicode_READY(unicode) == -1)
5375 return NULL;
5376
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005377 if (PyUnicode_UTF8(unicode))
5378 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5379 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005380
Inada Naoki02a4d572020-02-27 13:48:59 +09005381 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5382 void *data = PyUnicode_DATA(unicode);
5383 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5384
5385 _PyBytesWriter writer;
5386 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005387
Benjamin Petersonead6b532011-12-20 17:23:42 -06005388 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005389 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005390 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005391 case PyUnicode_1BYTE_KIND:
5392 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5393 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005394 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5395 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005396 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005397 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5398 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005399 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005400 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5401 break;
Tim Peters602f7402002-04-27 18:03:26 +00005402 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005403
5404 if (end == NULL) {
5405 _PyBytesWriter_Dealloc(&writer);
5406 return NULL;
5407 }
5408 return _PyBytesWriter_Finish(&writer, end);
5409}
5410
5411static int
5412unicode_fill_utf8(PyObject *unicode)
5413{
5414 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5415 assert(!PyUnicode_IS_ASCII(unicode));
5416
5417 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5418 void *data = PyUnicode_DATA(unicode);
5419 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5420
5421 _PyBytesWriter writer;
5422 char *end;
5423
5424 switch (kind) {
5425 default:
5426 Py_UNREACHABLE();
5427 case PyUnicode_1BYTE_KIND:
5428 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5429 _Py_ERROR_STRICT, NULL);
5430 break;
5431 case PyUnicode_2BYTE_KIND:
5432 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5433 _Py_ERROR_STRICT, NULL);
5434 break;
5435 case PyUnicode_4BYTE_KIND:
5436 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5437 _Py_ERROR_STRICT, NULL);
5438 break;
5439 }
5440 if (end == NULL) {
5441 _PyBytesWriter_Dealloc(&writer);
5442 return -1;
5443 }
5444
5445 char *start = writer.use_small_buffer ? writer.small_buffer :
5446 PyBytes_AS_STRING(writer.buffer);
5447 Py_ssize_t len = end - start;
5448
5449 char *cache = PyObject_MALLOC(len + 1);
5450 if (cache == NULL) {
5451 _PyBytesWriter_Dealloc(&writer);
5452 PyErr_NoMemory();
5453 return -1;
5454 }
5455 _PyUnicode_UTF8(unicode) = cache;
5456 _PyUnicode_UTF8_LENGTH(unicode) = len;
5457 memcpy(cache, start, len);
5458 cache[len] = '\0';
5459 _PyBytesWriter_Dealloc(&writer);
5460 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461}
5462
Alexander Belopolsky40018472011-02-26 01:02:56 +00005463PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005464_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5465{
5466 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5467}
5468
5469
5470PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005471PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5472 Py_ssize_t size,
5473 const char *errors)
5474{
5475 PyObject *v, *unicode;
5476
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005477 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005478 if (unicode == NULL)
5479 return NULL;
5480 v = _PyUnicode_AsUTF8String(unicode, errors);
5481 Py_DECREF(unicode);
5482 return v;
5483}
5484
5485PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005486PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005488 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489}
5490
Walter Dörwald41980ca2007-08-16 21:55:45 +00005491/* --- UTF-32 Codec ------------------------------------------------------- */
5492
5493PyObject *
5494PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 Py_ssize_t size,
5496 const char *errors,
5497 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005498{
5499 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5500}
5501
5502PyObject *
5503PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 Py_ssize_t size,
5505 const char *errors,
5506 int *byteorder,
5507 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005508{
5509 const char *starts = s;
5510 Py_ssize_t startinpos;
5511 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005512 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005513 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005514 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005515 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005516 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005517 PyObject *errorHandler = NULL;
5518 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005519
Andy Lestere6be9b52020-02-11 20:28:35 -06005520 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005521 e = q + size;
5522
5523 if (byteorder)
5524 bo = *byteorder;
5525
5526 /* Check for BOM marks (U+FEFF) in the input and adjust current
5527 byte order setting accordingly. In native mode, the leading BOM
5528 mark is skipped, in all other modes, it is copied to the output
5529 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005530 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005531 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005532 if (bom == 0x0000FEFF) {
5533 bo = -1;
5534 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005535 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005536 else if (bom == 0xFFFE0000) {
5537 bo = 1;
5538 q += 4;
5539 }
5540 if (byteorder)
5541 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005542 }
5543
Victor Stinnere64322e2012-10-30 23:12:47 +01005544 if (q == e) {
5545 if (consumed)
5546 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005547 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005548 }
5549
Victor Stinnere64322e2012-10-30 23:12:47 +01005550#ifdef WORDS_BIGENDIAN
5551 le = bo < 0;
5552#else
5553 le = bo <= 0;
5554#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005555 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005556
Victor Stinner8f674cc2013-04-17 23:02:17 +02005557 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005558 writer.min_length = (e - q + 3) / 4;
5559 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005560 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005561
Victor Stinnere64322e2012-10-30 23:12:47 +01005562 while (1) {
5563 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005564 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005565
Victor Stinnere64322e2012-10-30 23:12:47 +01005566 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005567 enum PyUnicode_Kind kind = writer.kind;
5568 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005569 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005570 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005571 if (le) {
5572 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005573 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005574 if (ch > maxch)
5575 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005576 if (kind != PyUnicode_1BYTE_KIND &&
5577 Py_UNICODE_IS_SURROGATE(ch))
5578 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005579 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005580 q += 4;
5581 } while (q <= last);
5582 }
5583 else {
5584 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005585 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005586 if (ch > maxch)
5587 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005588 if (kind != PyUnicode_1BYTE_KIND &&
5589 Py_UNICODE_IS_SURROGATE(ch))
5590 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005591 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005592 q += 4;
5593 } while (q <= last);
5594 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005595 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005596 }
5597
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005598 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005599 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005600 startinpos = ((const char *)q) - starts;
5601 endinpos = startinpos + 4;
5602 }
5603 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005604 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005606 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005608 startinpos = ((const char *)q) - starts;
5609 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005611 else {
5612 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005613 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005614 goto onError;
5615 q += 4;
5616 continue;
5617 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005618 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005619 startinpos = ((const char *)q) - starts;
5620 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005622
5623 /* The remaining input chars are ignored if the callback
5624 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005625 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005627 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005629 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005631 }
5632
Walter Dörwald41980ca2007-08-16 21:55:45 +00005633 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005635
Walter Dörwald41980ca2007-08-16 21:55:45 +00005636 Py_XDECREF(errorHandler);
5637 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005638 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005639
Benjamin Peterson29060642009-01-31 22:14:21 +00005640 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005641 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005642 Py_XDECREF(errorHandler);
5643 Py_XDECREF(exc);
5644 return NULL;
5645}
5646
5647PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005648_PyUnicode_EncodeUTF32(PyObject *str,
5649 const char *errors,
5650 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005651{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005652 enum PyUnicode_Kind kind;
5653 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005654 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005655 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005656 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005657#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005658 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005659#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005660 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005661#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005662 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005663 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005664 PyObject *errorHandler = NULL;
5665 PyObject *exc = NULL;
5666 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005667
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005668 if (!PyUnicode_Check(str)) {
5669 PyErr_BadArgument();
5670 return NULL;
5671 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005672 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005673 return NULL;
5674 kind = PyUnicode_KIND(str);
5675 data = PyUnicode_DATA(str);
5676 len = PyUnicode_GET_LENGTH(str);
5677
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005678 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005679 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005680 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005681 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005682 if (v == NULL)
5683 return NULL;
5684
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005685 /* output buffer is 4-bytes aligned */
5686 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005687 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005688 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005689 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005690 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005691 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005692
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005693 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005694 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005695 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005696 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005697 else
5698 encoding = "utf-32";
5699
5700 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005701 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5702 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005703 }
5704
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005705 pos = 0;
5706 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005707 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005708
5709 if (kind == PyUnicode_2BYTE_KIND) {
5710 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5711 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005712 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005713 else {
5714 assert(kind == PyUnicode_4BYTE_KIND);
5715 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5716 &out, native_ordering);
5717 }
5718 if (pos == len)
5719 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005720
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005721 rep = unicode_encode_call_errorhandler(
5722 errors, &errorHandler,
5723 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005724 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005725 if (!rep)
5726 goto error;
5727
5728 if (PyBytes_Check(rep)) {
5729 repsize = PyBytes_GET_SIZE(rep);
5730 if (repsize & 3) {
5731 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005732 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005733 "surrogates not allowed");
5734 goto error;
5735 }
5736 moreunits = repsize / 4;
5737 }
5738 else {
5739 assert(PyUnicode_Check(rep));
5740 if (PyUnicode_READY(rep) < 0)
5741 goto error;
5742 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5743 if (!PyUnicode_IS_ASCII(rep)) {
5744 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005745 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005746 "surrogates not allowed");
5747 goto error;
5748 }
5749 }
5750
5751 /* four bytes are reserved for each surrogate */
5752 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005753 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005754 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005755 /* integer overflow */
5756 PyErr_NoMemory();
5757 goto error;
5758 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005759 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005760 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005761 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005762 }
5763
5764 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005765 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005766 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005768 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005769 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5770 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005771 }
5772
5773 Py_CLEAR(rep);
5774 }
5775
5776 /* Cut back to size actually needed. This is necessary for, for example,
5777 encoding of a string containing isolated surrogates and the 'ignore'
5778 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005779 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005780 if (nsize != PyBytes_GET_SIZE(v))
5781 _PyBytes_Resize(&v, nsize);
5782 Py_XDECREF(errorHandler);
5783 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005784 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005785 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005786 error:
5787 Py_XDECREF(rep);
5788 Py_XDECREF(errorHandler);
5789 Py_XDECREF(exc);
5790 Py_XDECREF(v);
5791 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005792}
5793
Alexander Belopolsky40018472011-02-26 01:02:56 +00005794PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005795PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5796 Py_ssize_t size,
5797 const char *errors,
5798 int byteorder)
5799{
5800 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005801 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005802 if (tmp == NULL)
5803 return NULL;
5804 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5805 Py_DECREF(tmp);
5806 return result;
5807}
5808
5809PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005810PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005811{
Victor Stinnerb960b342011-11-20 19:12:52 +01005812 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005813}
5814
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815/* --- UTF-16 Codec ------------------------------------------------------- */
5816
Tim Peters772747b2001-08-09 22:21:55 +00005817PyObject *
5818PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 Py_ssize_t size,
5820 const char *errors,
5821 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822{
Walter Dörwald69652032004-09-07 20:24:22 +00005823 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5824}
5825
5826PyObject *
5827PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 Py_ssize_t size,
5829 const char *errors,
5830 int *byteorder,
5831 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005832{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005833 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005834 Py_ssize_t startinpos;
5835 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005836 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005837 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005838 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005839 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005840 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005841 PyObject *errorHandler = NULL;
5842 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005843 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844
Andy Lestere6be9b52020-02-11 20:28:35 -06005845 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005846 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847
5848 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005849 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005851 /* Check for BOM marks (U+FEFF) in the input and adjust current
5852 byte order setting accordingly. In native mode, the leading BOM
5853 mark is skipped, in all other modes, it is copied to the output
5854 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005855 if (bo == 0 && size >= 2) {
5856 const Py_UCS4 bom = (q[1] << 8) | q[0];
5857 if (bom == 0xFEFF) {
5858 q += 2;
5859 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005861 else if (bom == 0xFFFE) {
5862 q += 2;
5863 bo = 1;
5864 }
5865 if (byteorder)
5866 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005867 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868
Antoine Pitrou63065d72012-05-15 23:48:04 +02005869 if (q == e) {
5870 if (consumed)
5871 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005872 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005873 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005874
Christian Heimes743e0cd2012-10-17 23:52:17 +02005875#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005876 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005877 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005878#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005879 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005880 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005881#endif
Tim Peters772747b2001-08-09 22:21:55 +00005882
Antoine Pitrou63065d72012-05-15 23:48:04 +02005883 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005884 character count normally. Error handler will take care of
5885 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005886 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005887 writer.min_length = (e - q + 1) / 2;
5888 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005889 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005890
Antoine Pitrou63065d72012-05-15 23:48:04 +02005891 while (1) {
5892 Py_UCS4 ch = 0;
5893 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005894 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005895 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005896 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005897 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005898 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005899 native_ordering);
5900 else
5901 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005902 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005903 native_ordering);
5904 } else if (kind == PyUnicode_2BYTE_KIND) {
5905 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005906 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005907 native_ordering);
5908 } else {
5909 assert(kind == PyUnicode_4BYTE_KIND);
5910 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005911 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005912 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005913 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005914 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005915
Antoine Pitrou63065d72012-05-15 23:48:04 +02005916 switch (ch)
5917 {
5918 case 0:
5919 /* remaining byte at the end? (size should be even) */
5920 if (q == e || consumed)
5921 goto End;
5922 errmsg = "truncated data";
5923 startinpos = ((const char *)q) - starts;
5924 endinpos = ((const char *)e) - starts;
5925 break;
5926 /* The remaining input chars are ignored if the callback
5927 chooses to skip the input */
5928 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005929 q -= 2;
5930 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005931 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005932 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005933 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005934 endinpos = ((const char *)e) - starts;
5935 break;
5936 case 2:
5937 errmsg = "illegal encoding";
5938 startinpos = ((const char *)q) - 2 - starts;
5939 endinpos = startinpos + 2;
5940 break;
5941 case 3:
5942 errmsg = "illegal UTF-16 surrogate";
5943 startinpos = ((const char *)q) - 4 - starts;
5944 endinpos = startinpos + 2;
5945 break;
5946 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005947 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005948 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 continue;
5950 }
5951
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005952 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005953 errors,
5954 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005955 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005956 &starts,
5957 (const char **)&e,
5958 &startinpos,
5959 &endinpos,
5960 &exc,
5961 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005962 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 }
5965
Antoine Pitrou63065d72012-05-15 23:48:04 +02005966End:
Walter Dörwald69652032004-09-07 20:24:22 +00005967 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005969
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005970 Py_XDECREF(errorHandler);
5971 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005972 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005975 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005976 Py_XDECREF(errorHandler);
5977 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 return NULL;
5979}
5980
Tim Peters772747b2001-08-09 22:21:55 +00005981PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005982_PyUnicode_EncodeUTF16(PyObject *str,
5983 const char *errors,
5984 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005986 enum PyUnicode_Kind kind;
5987 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005988 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005989 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005990 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005991 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005992#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005993 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005994#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005995 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005996#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005997 const char *encoding;
5998 Py_ssize_t nsize, pos;
5999 PyObject *errorHandler = NULL;
6000 PyObject *exc = NULL;
6001 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006002
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006003 if (!PyUnicode_Check(str)) {
6004 PyErr_BadArgument();
6005 return NULL;
6006 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006007 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006008 return NULL;
6009 kind = PyUnicode_KIND(str);
6010 data = PyUnicode_DATA(str);
6011 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006012
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006013 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006014 if (kind == PyUnicode_4BYTE_KIND) {
6015 const Py_UCS4 *in = (const Py_UCS4 *)data;
6016 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006017 while (in < end) {
6018 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006019 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006020 }
6021 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006022 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006023 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006025 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006026 nsize = len + pairs + (byteorder == 0);
6027 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006028 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006030 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006032 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006033 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006034 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006035 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006036 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006037 }
6038 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006039 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006040 }
Tim Peters772747b2001-08-09 22:21:55 +00006041
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006042 if (kind == PyUnicode_1BYTE_KIND) {
6043 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6044 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006045 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006046
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006047 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006048 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006049 }
6050 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006051 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006052 }
6053 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006054 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006055 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006056
6057 pos = 0;
6058 while (pos < len) {
6059 Py_ssize_t repsize, moreunits;
6060
6061 if (kind == PyUnicode_2BYTE_KIND) {
6062 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6063 &out, native_ordering);
6064 }
6065 else {
6066 assert(kind == PyUnicode_4BYTE_KIND);
6067 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6068 &out, native_ordering);
6069 }
6070 if (pos == len)
6071 break;
6072
6073 rep = unicode_encode_call_errorhandler(
6074 errors, &errorHandler,
6075 encoding, "surrogates not allowed",
6076 str, &exc, pos, pos + 1, &pos);
6077 if (!rep)
6078 goto error;
6079
6080 if (PyBytes_Check(rep)) {
6081 repsize = PyBytes_GET_SIZE(rep);
6082 if (repsize & 1) {
6083 raise_encode_exception(&exc, encoding,
6084 str, pos - 1, pos,
6085 "surrogates not allowed");
6086 goto error;
6087 }
6088 moreunits = repsize / 2;
6089 }
6090 else {
6091 assert(PyUnicode_Check(rep));
6092 if (PyUnicode_READY(rep) < 0)
6093 goto error;
6094 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6095 if (!PyUnicode_IS_ASCII(rep)) {
6096 raise_encode_exception(&exc, encoding,
6097 str, pos - 1, pos,
6098 "surrogates not allowed");
6099 goto error;
6100 }
6101 }
6102
6103 /* two bytes are reserved for each surrogate */
6104 if (moreunits > 1) {
6105 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006106 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006107 /* integer overflow */
6108 PyErr_NoMemory();
6109 goto error;
6110 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006111 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006112 goto error;
6113 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6114 }
6115
6116 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006117 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006118 out += moreunits;
6119 } else /* rep is unicode */ {
6120 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6121 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6122 &out, native_ordering);
6123 }
6124
6125 Py_CLEAR(rep);
6126 }
6127
6128 /* Cut back to size actually needed. This is necessary for, for example,
6129 encoding of a string containing isolated surrogates and the 'ignore' handler
6130 is used. */
6131 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6132 if (nsize != PyBytes_GET_SIZE(v))
6133 _PyBytes_Resize(&v, nsize);
6134 Py_XDECREF(errorHandler);
6135 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006136 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006137 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006138 error:
6139 Py_XDECREF(rep);
6140 Py_XDECREF(errorHandler);
6141 Py_XDECREF(exc);
6142 Py_XDECREF(v);
6143 return NULL;
6144#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145}
6146
Alexander Belopolsky40018472011-02-26 01:02:56 +00006147PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6149 Py_ssize_t size,
6150 const char *errors,
6151 int byteorder)
6152{
6153 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006154 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006155 if (tmp == NULL)
6156 return NULL;
6157 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6158 Py_DECREF(tmp);
6159 return result;
6160}
6161
6162PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006163PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166}
6167
6168/* --- Unicode Escape Codec ----------------------------------------------- */
6169
/* File-scope pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in lazily with the character-name
   lookup API needed for \N{...} escapes -- confirm against the code
   that assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006171
Alexander Belopolsky40018472011-02-26 01:02:56 +00006172PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006173_PyUnicode_DecodeUnicodeEscape(const char *s,
6174 Py_ssize_t size,
6175 const char *errors,
6176 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006178 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006179 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006181 PyObject *errorHandler = NULL;
6182 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006183
Eric V. Smith42454af2016-10-31 09:22:08 -04006184 // so we can remember if we've seen an invalid escape char or not
6185 *first_invalid_escape = NULL;
6186
Victor Stinner62ec3312016-09-06 17:04:34 -07006187 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006188 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 }
6190 /* Escaped strings will always be longer than the resulting
6191 Unicode string, so we start with size here and then reduce the
6192 length after conversion to the true value.
6193 (but if the error callback returns a long replacement string
6194 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006195 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006196 writer.min_length = size;
6197 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6198 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006199 }
6200
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 end = s + size;
6202 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006203 unsigned char c = (unsigned char) *s++;
6204 Py_UCS4 ch;
6205 int count;
6206 Py_ssize_t startinpos;
6207 Py_ssize_t endinpos;
6208 const char *message;
6209
6210#define WRITE_ASCII_CHAR(ch) \
6211 do { \
6212 assert(ch <= 127); \
6213 assert(writer.pos < writer.size); \
6214 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6215 } while(0)
6216
6217#define WRITE_CHAR(ch) \
6218 do { \
6219 if (ch <= writer.maxchar) { \
6220 assert(writer.pos < writer.size); \
6221 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6222 } \
6223 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6224 goto onError; \
6225 } \
6226 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227
6228 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006229 if (c != '\\') {
6230 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 continue;
6232 }
6233
Victor Stinner62ec3312016-09-06 17:04:34 -07006234 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006236 if (s >= end) {
6237 message = "\\ at end of string";
6238 goto error;
6239 }
6240 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006241
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006243 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006246 case '\n': continue;
6247 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6248 case '\'': WRITE_ASCII_CHAR('\''); continue;
6249 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6250 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006251 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006252 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6253 case 't': WRITE_ASCII_CHAR('\t'); continue;
6254 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6255 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006256 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006257 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006258 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006259 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 case '0': case '1': case '2': case '3':
6263 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006264 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006265 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006266 ch = (ch<<3) + *s++ - '0';
6267 if (s < end && '0' <= *s && *s <= '7') {
6268 ch = (ch<<3) + *s++ - '0';
6269 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006271 WRITE_CHAR(ch);
6272 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 /* hex escapes */
6275 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006277 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006278 message = "truncated \\xXX escape";
6279 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006283 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006284 message = "truncated \\uXXXX escape";
6285 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006288 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006289 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006290 message = "truncated \\UXXXXXXXX escape";
6291 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006292 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006293 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006294 ch <<= 4;
6295 if (c >= '0' && c <= '9') {
6296 ch += c - '0';
6297 }
6298 else if (c >= 'a' && c <= 'f') {
6299 ch += c - ('a' - 10);
6300 }
6301 else if (c >= 'A' && c <= 'F') {
6302 ch += c - ('A' - 10);
6303 }
6304 else {
6305 break;
6306 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006307 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006308 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006309 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006310 }
6311
6312 /* when we get here, ch is a 32-bit unicode character */
6313 if (ch > MAX_UNICODE) {
6314 message = "illegal Unicode character";
6315 goto error;
6316 }
6317
6318 WRITE_CHAR(ch);
6319 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006320
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006322 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006323 if (ucnhash_CAPI == NULL) {
6324 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006325 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6326 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006327 if (ucnhash_CAPI == NULL) {
6328 PyErr_SetString(
6329 PyExc_UnicodeError,
6330 "\\N escapes not supported (can't load unicodedata module)"
6331 );
6332 goto onError;
6333 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006334 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006335
6336 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006337 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006338 const char *start = ++s;
6339 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006340 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006341 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006342 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006343 namelen = s - start;
6344 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006345 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006346 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006347 ch = 0xffffffff; /* in case 'getcode' messes up */
6348 if (namelen <= INT_MAX &&
6349 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6350 &ch, 0)) {
6351 assert(ch <= MAX_UNICODE);
6352 WRITE_CHAR(ch);
6353 continue;
6354 }
6355 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006356 }
6357 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006358 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006359
6360 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006361 if (*first_invalid_escape == NULL) {
6362 *first_invalid_escape = s-1; /* Back up one char, since we've
6363 already incremented s. */
6364 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006365 WRITE_ASCII_CHAR('\\');
6366 WRITE_CHAR(c);
6367 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006369
6370 error:
6371 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006372 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006373 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006374 errors, &errorHandler,
6375 "unicodeescape", message,
6376 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006377 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006378 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006379 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006380 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006381
6382#undef WRITE_ASCII_CHAR
6383#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006385
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006386 Py_XDECREF(errorHandler);
6387 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006388 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006389
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006391 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392 Py_XDECREF(errorHandler);
6393 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 return NULL;
6395}
6396
Eric V. Smith42454af2016-10-31 09:22:08 -04006397PyObject *
6398PyUnicode_DecodeUnicodeEscape(const char *s,
6399 Py_ssize_t size,
6400 const char *errors)
6401{
6402 const char *first_invalid_escape;
6403 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6404 &first_invalid_escape);
6405 if (result == NULL)
6406 return NULL;
6407 if (first_invalid_escape != NULL) {
6408 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6409 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006410 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006411 Py_DECREF(result);
6412 return NULL;
6413 }
6414 }
6415 return result;
6416}
6417
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006418/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419
Alexander Belopolsky40018472011-02-26 01:02:56 +00006420PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006421PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006423 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006427 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006428 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429
Ezio Melottie7f90372012-10-05 03:33:31 +03006430 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006431 escape.
6432
Ezio Melottie7f90372012-10-05 03:33:31 +03006433 For UCS1 strings it's '\xxx', 4 bytes per source character.
6434 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6435 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006436 */
6437
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006438 if (!PyUnicode_Check(unicode)) {
6439 PyErr_BadArgument();
6440 return NULL;
6441 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006442 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006443 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006444 }
Victor Stinner358af132015-10-12 22:36:57 +02006445
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006446 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006447 if (len == 0) {
6448 return PyBytes_FromStringAndSize(NULL, 0);
6449 }
6450
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006451 kind = PyUnicode_KIND(unicode);
6452 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006453 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6454 bytes, and 1 byte characters 4. */
6455 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006456 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006457 return PyErr_NoMemory();
6458 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006459 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006460 if (repr == NULL) {
6461 return NULL;
6462 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006463
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006465 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006466 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006467
Victor Stinner62ec3312016-09-06 17:04:34 -07006468 /* U+0000-U+00ff range */
6469 if (ch < 0x100) {
6470 if (ch >= ' ' && ch < 127) {
6471 if (ch != '\\') {
6472 /* Copy printable US ASCII as-is */
6473 *p++ = (char) ch;
6474 }
6475 /* Escape backslashes */
6476 else {
6477 *p++ = '\\';
6478 *p++ = '\\';
6479 }
6480 }
Victor Stinner358af132015-10-12 22:36:57 +02006481
Victor Stinner62ec3312016-09-06 17:04:34 -07006482 /* Map special whitespace to '\t', \n', '\r' */
6483 else if (ch == '\t') {
6484 *p++ = '\\';
6485 *p++ = 't';
6486 }
6487 else if (ch == '\n') {
6488 *p++ = '\\';
6489 *p++ = 'n';
6490 }
6491 else if (ch == '\r') {
6492 *p++ = '\\';
6493 *p++ = 'r';
6494 }
6495
6496 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6497 else {
6498 *p++ = '\\';
6499 *p++ = 'x';
6500 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6501 *p++ = Py_hexdigits[ch & 0x000F];
6502 }
Tim Petersced69f82003-09-16 20:30:58 +00006503 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006504 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006505 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 *p++ = '\\';
6507 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006508 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6509 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6510 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6511 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006513 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6514 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006515
Victor Stinner62ec3312016-09-06 17:04:34 -07006516 /* Make sure that the first two digits are zero */
6517 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006518 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006519 *p++ = 'U';
6520 *p++ = '0';
6521 *p++ = '0';
6522 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6523 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6524 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6525 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6526 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6527 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006528 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530
Victor Stinner62ec3312016-09-06 17:04:34 -07006531 assert(p - PyBytes_AS_STRING(repr) > 0);
6532 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6533 return NULL;
6534 }
6535 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536}
6537
Alexander Belopolsky40018472011-02-26 01:02:56 +00006538PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006539PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6540 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006542 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006543 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006544 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006546 }
6547
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006548 result = PyUnicode_AsUnicodeEscapeString(tmp);
6549 Py_DECREF(tmp);
6550 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551}
6552
6553/* --- Raw Unicode Escape Codec ------------------------------------------- */
6554
Alexander Belopolsky40018472011-02-26 01:02:56 +00006555PyObject *
6556PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006557 Py_ssize_t size,
6558 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006560 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006561 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006563 PyObject *errorHandler = NULL;
6564 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006565
Victor Stinner62ec3312016-09-06 17:04:34 -07006566 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006567 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006568 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006569
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 /* Escaped strings will always be longer than the resulting
6571 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006572 length after conversion to the true value. (But decoding error
6573 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006574 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006575 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006576 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6577 goto onError;
6578 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006579
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 end = s + size;
6581 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006582 unsigned char c = (unsigned char) *s++;
6583 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006584 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006585 Py_ssize_t startinpos;
6586 Py_ssize_t endinpos;
6587 const char *message;
6588
6589#define WRITE_CHAR(ch) \
6590 do { \
6591 if (ch <= writer.maxchar) { \
6592 assert(writer.pos < writer.size); \
6593 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6594 } \
6595 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6596 goto onError; \
6597 } \
6598 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006601 if (c != '\\' || s >= end) {
6602 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006604 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006605
Victor Stinner62ec3312016-09-06 17:04:34 -07006606 c = (unsigned char) *s++;
6607 if (c == 'u') {
6608 count = 4;
6609 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006611 else if (c == 'U') {
6612 count = 8;
6613 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006614 }
6615 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006616 assert(writer.pos < writer.size);
6617 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6618 WRITE_CHAR(c);
6619 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006620 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006621 startinpos = s - starts - 2;
6622
6623 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6624 for (ch = 0; count && s < end; ++s, --count) {
6625 c = (unsigned char)*s;
6626 ch <<= 4;
6627 if (c >= '0' && c <= '9') {
6628 ch += c - '0';
6629 }
6630 else if (c >= 'a' && c <= 'f') {
6631 ch += c - ('a' - 10);
6632 }
6633 else if (c >= 'A' && c <= 'F') {
6634 ch += c - ('A' - 10);
6635 }
6636 else {
6637 break;
6638 }
6639 }
6640 if (!count) {
6641 if (ch <= MAX_UNICODE) {
6642 WRITE_CHAR(ch);
6643 continue;
6644 }
6645 message = "\\Uxxxxxxxx out of range";
6646 }
6647
6648 endinpos = s-starts;
6649 writer.min_length = end - s + writer.pos;
6650 if (unicode_decode_call_errorhandler_writer(
6651 errors, &errorHandler,
6652 "rawunicodeescape", message,
6653 &starts, &end, &startinpos, &endinpos, &exc, &s,
6654 &writer)) {
6655 goto onError;
6656 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006657 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006658
6659#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661 Py_XDECREF(errorHandler);
6662 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006663 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006664
Benjamin Peterson29060642009-01-31 22:14:21 +00006665 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006666 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667 Py_XDECREF(errorHandler);
6668 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006670
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671}
6672
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006673
Alexander Belopolsky40018472011-02-26 01:02:56 +00006674PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006675PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676{
Victor Stinner62ec3312016-09-06 17:04:34 -07006677 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006679 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006680 int kind;
6681 void *data;
6682 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006684 if (!PyUnicode_Check(unicode)) {
6685 PyErr_BadArgument();
6686 return NULL;
6687 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006688 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006689 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006690 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006691 kind = PyUnicode_KIND(unicode);
6692 data = PyUnicode_DATA(unicode);
6693 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006694 if (kind == PyUnicode_1BYTE_KIND) {
6695 return PyBytes_FromStringAndSize(data, len);
6696 }
Victor Stinner0e368262011-11-10 20:12:49 +01006697
Victor Stinner62ec3312016-09-06 17:04:34 -07006698 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6699 bytes, and 1 byte characters 4. */
6700 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006701
Victor Stinner62ec3312016-09-06 17:04:34 -07006702 if (len > PY_SSIZE_T_MAX / expandsize) {
6703 return PyErr_NoMemory();
6704 }
6705 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6706 if (repr == NULL) {
6707 return NULL;
6708 }
6709 if (len == 0) {
6710 return repr;
6711 }
6712
6713 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006714 for (pos = 0; pos < len; pos++) {
6715 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006716
Victor Stinner62ec3312016-09-06 17:04:34 -07006717 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6718 if (ch < 0x100) {
6719 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006720 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006721 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006722 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 *p++ = '\\';
6724 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006725 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6726 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6727 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6728 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006730 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6731 else {
6732 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6733 *p++ = '\\';
6734 *p++ = 'U';
6735 *p++ = '0';
6736 *p++ = '0';
6737 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6738 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6739 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6740 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6741 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6742 *p++ = Py_hexdigits[ch & 15];
6743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006745
Victor Stinner62ec3312016-09-06 17:04:34 -07006746 assert(p > PyBytes_AS_STRING(repr));
6747 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6748 return NULL;
6749 }
6750 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751}
6752
Alexander Belopolsky40018472011-02-26 01:02:56 +00006753PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006754PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6755 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006757 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006758 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006759 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006760 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006761 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6762 Py_DECREF(tmp);
6763 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764}
6765
6766/* --- Latin-1 Codec ------------------------------------------------------ */
6767
Alexander Belopolsky40018472011-02-26 01:02:56 +00006768PyObject *
6769PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006770 Py_ssize_t size,
6771 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006774 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775}
6776
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006777/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006778static void
6779make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006780 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006781 PyObject *unicode,
6782 Py_ssize_t startpos, Py_ssize_t endpos,
6783 const char *reason)
6784{
6785 if (*exceptionObject == NULL) {
6786 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006787 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006788 encoding, unicode, startpos, endpos, reason);
6789 }
6790 else {
6791 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6792 goto onError;
6793 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6794 goto onError;
6795 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6796 goto onError;
6797 return;
6798 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006799 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006800 }
6801}
6802
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006803/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006804static void
6805raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006806 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006807 PyObject *unicode,
6808 Py_ssize_t startpos, Py_ssize_t endpos,
6809 const char *reason)
6810{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006811 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006812 encoding, unicode, startpos, endpos, reason);
6813 if (*exceptionObject != NULL)
6814 PyCodec_StrictErrors(*exceptionObject);
6815}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006816
6817/* error handling callback helper:
6818 build arguments, call the callback and check the arguments,
6819 put the result into newpos and return the replacement string, which
6820 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006821static PyObject *
6822unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006823 PyObject **errorHandler,
6824 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006825 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006826 Py_ssize_t startpos, Py_ssize_t endpos,
6827 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006828{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006829 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006830 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006831 PyObject *restuple;
6832 PyObject *resunicode;
6833
6834 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006836 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006838 }
6839
Benjamin Petersonbac79492012-01-14 13:34:47 -05006840 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006841 return NULL;
6842 len = PyUnicode_GET_LENGTH(unicode);
6843
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006844 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006845 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006846 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006848
Petr Viktorinffd97532020-02-11 17:46:57 +01006849 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006850 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006852 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006853 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 Py_DECREF(restuple);
6855 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006856 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006857 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 &resunicode, newpos)) {
6859 Py_DECREF(restuple);
6860 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006861 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006862 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6863 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6864 Py_DECREF(restuple);
6865 return NULL;
6866 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006867 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006868 *newpos = len + *newpos;
6869 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006870 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 Py_DECREF(restuple);
6872 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006873 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006874 Py_INCREF(resunicode);
6875 Py_DECREF(restuple);
6876 return resunicode;
6877}
6878
Alexander Belopolsky40018472011-02-26 01:02:56 +00006879static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006880unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006881 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006882 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006883{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006884 /* input state */
6885 Py_ssize_t pos=0, size;
6886 int kind;
6887 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006888 /* pointer into the output */
6889 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006890 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6891 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006892 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006893 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006894 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006895 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006896 /* output object */
6897 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898
Benjamin Petersonbac79492012-01-14 13:34:47 -05006899 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006900 return NULL;
6901 size = PyUnicode_GET_LENGTH(unicode);
6902 kind = PyUnicode_KIND(unicode);
6903 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006904 /* allocate enough for a simple encoding without
6905 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006906 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006907 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006908
6909 _PyBytesWriter_Init(&writer);
6910 str = _PyBytesWriter_Alloc(&writer, size);
6911 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006912 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006914 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006915 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006916
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006918 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006920 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006921 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006922 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006924 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006926 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006927 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006929
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006930 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006931 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006932
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006933 /* Only overallocate the buffer if it's not the last write */
6934 writer.overallocate = (collend < size);
6935
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006937 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006938 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006939
6940 switch (error_handler) {
6941 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006942 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006944
6945 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006946 memset(str, '?', collend - collstart);
6947 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006948 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006949 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006950 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 break;
Victor Stinner50149202015-09-22 00:26:54 +02006952
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006953 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006954 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006955 writer.min_size -= (collend - collstart);
6956 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006957 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006958 if (str == NULL)
6959 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006960 pos = collend;
6961 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006962
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006963 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006964 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006965 writer.min_size -= (collend - collstart);
6966 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006967 unicode, collstart, collend);
6968 if (str == NULL)
6969 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006970 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 break;
Victor Stinner50149202015-09-22 00:26:54 +02006972
Victor Stinnerc3713e92015-09-29 12:32:13 +02006973 case _Py_ERROR_SURROGATEESCAPE:
6974 for (i = collstart; i < collend; ++i) {
6975 ch = PyUnicode_READ(kind, data, i);
6976 if (ch < 0xdc80 || 0xdcff < ch) {
6977 /* Not a UTF-8b surrogate */
6978 break;
6979 }
6980 *str++ = (char)(ch - 0xdc00);
6981 ++pos;
6982 }
6983 if (i >= collend)
6984 break;
6985 collstart = pos;
6986 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006987 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006988
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006990 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6991 encoding, reason, unicode, &exc,
6992 collstart, collend, &newpos);
6993 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006995
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006996 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006997 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006998
Victor Stinner6bd525b2015-10-09 13:10:05 +02006999 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007000 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007001 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007002 PyBytes_AS_STRING(rep),
7003 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007004 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007005 else {
7006 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007007
Victor Stinner6bd525b2015-10-09 13:10:05 +02007008 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007010
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007011 if (limit == 256 ?
7012 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7013 !PyUnicode_IS_ASCII(rep))
7014 {
7015 /* Not all characters are smaller than limit */
7016 raise_encode_exception(&exc, encoding, unicode,
7017 collstart, collend, reason);
7018 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007020 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7021 str = _PyBytesWriter_WriteBytes(&writer, str,
7022 PyUnicode_DATA(rep),
7023 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007025 if (str == NULL)
7026 goto onError;
7027
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007028 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007029 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007030 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007031
7032 /* If overallocation was disabled, ensure that it was the last
7033 write. Otherwise, we missed an optimization */
7034 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007035 }
7036 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007037
Victor Stinner50149202015-09-22 00:26:54 +02007038 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007039 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007040 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007041
7042 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007043 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007044 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007045 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007046 Py_XDECREF(exc);
7047 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007048}
7049
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007050/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007051PyObject *
7052PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007053 Py_ssize_t size,
7054 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007056 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007057 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007058 if (unicode == NULL)
7059 return NULL;
7060 result = unicode_encode_ucs1(unicode, errors, 256);
7061 Py_DECREF(unicode);
7062 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063}
7064
Alexander Belopolsky40018472011-02-26 01:02:56 +00007065PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007066_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067{
7068 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 PyErr_BadArgument();
7070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007072 if (PyUnicode_READY(unicode) == -1)
7073 return NULL;
7074 /* Fast path: if it is a one-byte string, construct
7075 bytes object directly. */
7076 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7077 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7078 PyUnicode_GET_LENGTH(unicode));
7079 /* Non-Latin-1 characters present. Defer to above function to
7080 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007081 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007082}
7083
7084PyObject*
7085PyUnicode_AsLatin1String(PyObject *unicode)
7086{
7087 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088}
7089
7090/* --- 7-bit ASCII Codec -------------------------------------------------- */
7091
Alexander Belopolsky40018472011-02-26 01:02:56 +00007092PyObject *
7093PyUnicode_DecodeASCII(const char *s,
7094 Py_ssize_t size,
7095 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007097 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007098 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007099 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007100 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007101 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007102
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007104 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007105
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007107 if (size == 1 && (unsigned char)s[0] < 128)
7108 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007109
Inada Naoki770847a2019-06-24 12:30:24 +09007110 // Shortcut for simple case
7111 PyObject *u = PyUnicode_New(size, 127);
7112 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007113 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007114 }
7115 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7116 if (outpos == size) {
7117 return u;
7118 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007119
Inada Naoki770847a2019-06-24 12:30:24 +09007120 _PyUnicodeWriter writer;
7121 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007122 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007123
Inada Naoki770847a2019-06-24 12:30:24 +09007124 s += outpos;
7125 int kind = writer.kind;
7126 void *data = writer.data;
7127 Py_ssize_t startinpos, endinpos;
7128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007129 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007130 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007132 PyUnicode_WRITE(kind, data, writer.pos, c);
7133 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007135 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007137
7138 /* byte outsize range 0x00..0x7f: call the error handler */
7139
7140 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007141 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007142
7143 switch (error_handler)
7144 {
7145 case _Py_ERROR_REPLACE:
7146 case _Py_ERROR_SURROGATEESCAPE:
7147 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007148 but we may switch to UCS2 at the first write */
7149 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7150 goto onError;
7151 kind = writer.kind;
7152 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007153
7154 if (error_handler == _Py_ERROR_REPLACE)
7155 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7156 else
7157 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7158 writer.pos++;
7159 ++s;
7160 break;
7161
7162 case _Py_ERROR_IGNORE:
7163 ++s;
7164 break;
7165
7166 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007167 startinpos = s-starts;
7168 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007169 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007170 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007171 "ascii", "ordinal not in range(128)",
7172 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007173 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007175 kind = writer.kind;
7176 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007179 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007180 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007181 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007182
Benjamin Peterson29060642009-01-31 22:14:21 +00007183 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007184 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007185 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007186 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187 return NULL;
7188}
7189
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007190/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007191PyObject *
7192PyUnicode_EncodeASCII(const Py_UNICODE *p,
7193 Py_ssize_t size,
7194 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007196 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007197 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007198 if (unicode == NULL)
7199 return NULL;
7200 result = unicode_encode_ucs1(unicode, errors, 128);
7201 Py_DECREF(unicode);
7202 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203}
7204
Alexander Belopolsky40018472011-02-26 01:02:56 +00007205PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007206_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207{
7208 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 PyErr_BadArgument();
7210 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007212 if (PyUnicode_READY(unicode) == -1)
7213 return NULL;
7214 /* Fast path: if it is an ASCII-only string, construct bytes object
7215 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007216 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007217 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7218 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007219 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007220}
7221
7222PyObject *
7223PyUnicode_AsASCIIString(PyObject *unicode)
7224{
7225 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226}
7227
Steve Dowercc16be82016-09-08 10:35:16 -07007228#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007229
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007230/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007231
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007232#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007233#define NEED_RETRY
7234#endif
7235
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7240#define DECODING_CHUNK_SIZE (INT_MAX/4)
7241
Victor Stinner3a50e702011-10-18 21:21:00 +02007242#ifndef WC_ERR_INVALID_CHARS
7243# define WC_ERR_INVALID_CHARS 0x0080
7244#endif
7245
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007246static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007247code_page_name(UINT code_page, PyObject **obj)
7248{
7249 *obj = NULL;
7250 if (code_page == CP_ACP)
7251 return "mbcs";
7252 if (code_page == CP_UTF7)
7253 return "CP_UTF7";
7254 if (code_page == CP_UTF8)
7255 return "CP_UTF8";
7256
7257 *obj = PyBytes_FromFormat("cp%u", code_page);
7258 if (*obj == NULL)
7259 return NULL;
7260 return PyBytes_AS_STRING(*obj);
7261}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007262
Victor Stinner3a50e702011-10-18 21:21:00 +02007263static DWORD
7264decode_code_page_flags(UINT code_page)
7265{
7266 if (code_page == CP_UTF7) {
7267 /* The CP_UTF7 decoder only supports flags=0 */
7268 return 0;
7269 }
7270 else
7271 return MB_ERR_INVALID_CHARS;
7272}
7273
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007274/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007275 * Decode a byte string from a Windows code page into unicode object in strict
7276 * mode.
7277 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007278 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7279 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007280 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007281static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007282decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007283 wchar_t **buf,
7284 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007285 const char *in,
7286 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007287{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007288 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007289 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007290 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007291
7292 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007293 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007294 while ((outsize = MultiByteToWideChar(code_page, flags,
7295 in, insize, NULL, 0)) <= 0)
7296 {
7297 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7298 goto error;
7299 }
7300 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7301 flags = 0;
7302 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007303
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007304 /* Extend a wchar_t* buffer */
7305 Py_ssize_t n = *bufsize; /* Get the current length */
7306 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7307 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007309 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007310
7311 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7313 if (outsize <= 0)
7314 goto error;
7315 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007316
Victor Stinner3a50e702011-10-18 21:21:00 +02007317error:
7318 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7319 return -2;
7320 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007321 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007322}
7323
Victor Stinner3a50e702011-10-18 21:21:00 +02007324/*
7325 * Decode a byte string from a code page into unicode object with an error
7326 * handler.
7327 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007328 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007329 * UnicodeDecodeError exception and returns -1 on error.
7330 */
7331static int
7332decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007333 wchar_t **buf,
7334 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007335 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007336 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007337{
7338 const char *startin = in;
7339 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007340 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007341 /* Ideally, we should get reason from FormatMessage. This is the Windows
7342 2000 English version of the message. */
7343 const char *reason = "No mapping for the Unicode character exists "
7344 "in the target code page.";
7345 /* each step cannot decode more than 1 character, but a character can be
7346 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007347 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007348 int insize;
7349 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007350 PyObject *errorHandler = NULL;
7351 PyObject *exc = NULL;
7352 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007353 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007354 DWORD err;
7355 int ret = -1;
7356
7357 assert(size > 0);
7358
7359 encoding = code_page_name(code_page, &encoding_obj);
7360 if (encoding == NULL)
7361 return -1;
7362
Victor Stinner7d00cc12014-03-17 23:08:06 +01007363 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007364 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7365 UnicodeDecodeError. */
7366 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7367 if (exc != NULL) {
7368 PyCodec_StrictErrors(exc);
7369 Py_CLEAR(exc);
7370 }
7371 goto error;
7372 }
7373
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007374 /* Extend a wchar_t* buffer */
7375 Py_ssize_t n = *bufsize; /* Get the current length */
7376 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7377 PyErr_NoMemory();
7378 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007379 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007380 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7381 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007382 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007383 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007384
7385 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007386 while (in < endin)
7387 {
7388 /* Decode a character */
7389 insize = 1;
7390 do
7391 {
7392 outsize = MultiByteToWideChar(code_page, flags,
7393 in, insize,
7394 buffer, Py_ARRAY_LENGTH(buffer));
7395 if (outsize > 0)
7396 break;
7397 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007398 if (err == ERROR_INVALID_FLAGS && flags) {
7399 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7400 flags = 0;
7401 continue;
7402 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 if (err != ERROR_NO_UNICODE_TRANSLATION
7404 && err != ERROR_INSUFFICIENT_BUFFER)
7405 {
7406 PyErr_SetFromWindowsErr(0);
7407 goto error;
7408 }
7409 insize++;
7410 }
7411 /* 4=maximum length of a UTF-8 sequence */
7412 while (insize <= 4 && (in + insize) <= endin);
7413
7414 if (outsize <= 0) {
7415 Py_ssize_t startinpos, endinpos, outpos;
7416
Victor Stinner7d00cc12014-03-17 23:08:06 +01007417 /* last character in partial decode? */
7418 if (in + insize >= endin && !final)
7419 break;
7420
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 startinpos = in - startin;
7422 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007423 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007424 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 errors, &errorHandler,
7426 encoding, reason,
7427 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007428 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 {
7430 goto error;
7431 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007432 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 }
7434 else {
7435 in += insize;
7436 memcpy(out, buffer, outsize * sizeof(wchar_t));
7437 out += outsize;
7438 }
7439 }
7440
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007441 /* Shrink the buffer */
7442 assert(out - *buf <= *bufsize);
7443 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007444 /* (in - startin) <= size and size is an int */
7445 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007446
7447error:
7448 Py_XDECREF(encoding_obj);
7449 Py_XDECREF(errorHandler);
7450 Py_XDECREF(exc);
7451 return ret;
7452}
7453
Victor Stinner3a50e702011-10-18 21:21:00 +02007454static PyObject *
7455decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007456 const char *s, Py_ssize_t size,
7457 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007458{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007459 wchar_t *buf = NULL;
7460 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007461 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007462
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 if (code_page < 0) {
7464 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7465 return NULL;
7466 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007467 if (size < 0) {
7468 PyErr_BadInternalCall();
7469 return NULL;
7470 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007471
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007472 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007474
Victor Stinner76a31a62011-11-04 00:05:13 +01007475 do
7476 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007477#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007478 if (size > DECODING_CHUNK_SIZE) {
7479 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007480 final = 0;
7481 done = 0;
7482 }
7483 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007484#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007485 {
7486 chunk_size = (int)size;
7487 final = (consumed == NULL);
7488 done = 1;
7489 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007490
Victor Stinner76a31a62011-11-04 00:05:13 +01007491 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007492 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007493 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007494 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007495 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007496
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007497 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007498 s, chunk_size);
7499 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007500 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007501 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007502 errors, final);
7503 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007504
7505 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007506 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007507 return NULL;
7508 }
7509
7510 if (consumed)
7511 *consumed += converted;
7512
7513 s += converted;
7514 size -= converted;
7515 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007516
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007517 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7518 PyMem_Free(buf);
7519 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007520}
7521
Alexander Belopolsky40018472011-02-26 01:02:56 +00007522PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007523PyUnicode_DecodeCodePageStateful(int code_page,
7524 const char *s,
7525 Py_ssize_t size,
7526 const char *errors,
7527 Py_ssize_t *consumed)
7528{
7529 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7530}
7531
7532PyObject *
7533PyUnicode_DecodeMBCSStateful(const char *s,
7534 Py_ssize_t size,
7535 const char *errors,
7536 Py_ssize_t *consumed)
7537{
7538 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7539}
7540
7541PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007542PyUnicode_DecodeMBCS(const char *s,
7543 Py_ssize_t size,
7544 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007545{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007546 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7547}
7548
Victor Stinner3a50e702011-10-18 21:21:00 +02007549static DWORD
7550encode_code_page_flags(UINT code_page, const char *errors)
7551{
7552 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007553 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 }
7555 else if (code_page == CP_UTF7) {
7556 /* CP_UTF7 only supports flags=0 */
7557 return 0;
7558 }
7559 else {
7560 if (errors != NULL && strcmp(errors, "replace") == 0)
7561 return 0;
7562 else
7563 return WC_NO_BEST_FIT_CHARS;
7564 }
7565}
7566
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007567/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007568 * Encode a Unicode string to a Windows code page into a byte string in strict
7569 * mode.
7570 *
7571 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007572 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007573 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007574static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007575encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007576 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007577 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007578{
Victor Stinner554f3f02010-06-16 23:33:54 +00007579 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 BOOL *pusedDefaultChar = &usedDefaultChar;
7581 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007582 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007583 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 const DWORD flags = encode_code_page_flags(code_page, NULL);
7585 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007586 /* Create a substring so that we can get the UTF-16 representation
7587 of just the slice under consideration. */
7588 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007589
Martin v. Löwis3d325192011-11-04 18:23:06 +01007590 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007591
Victor Stinner3a50e702011-10-18 21:21:00 +02007592 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007593 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007595 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007596
Victor Stinner2fc507f2011-11-04 20:06:39 +01007597 substring = PyUnicode_Substring(unicode, offset, offset+len);
7598 if (substring == NULL)
7599 return -1;
7600 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7601 if (p == NULL) {
7602 Py_DECREF(substring);
7603 return -1;
7604 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007605 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007606
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007607 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007608 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007609 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007610 NULL, 0,
7611 NULL, pusedDefaultChar);
7612 if (outsize <= 0)
7613 goto error;
7614 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007615 if (pusedDefaultChar && *pusedDefaultChar) {
7616 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007618 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007619
Victor Stinner3a50e702011-10-18 21:21:00 +02007620 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007622 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007623 if (*outbytes == NULL) {
7624 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007626 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007627 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007628 }
7629 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007631 const Py_ssize_t n = PyBytes_Size(*outbytes);
7632 if (outsize > PY_SSIZE_T_MAX - n) {
7633 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007634 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007636 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007637 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7638 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007639 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007640 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007641 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007642 }
7643
7644 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007645 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007646 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 out, outsize,
7648 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007649 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007650 if (outsize <= 0)
7651 goto error;
7652 if (pusedDefaultChar && *pusedDefaultChar)
7653 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007654 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007655
Victor Stinner3a50e702011-10-18 21:21:00 +02007656error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007657 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007658 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7659 return -2;
7660 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007661 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007662}
7663
Victor Stinner3a50e702011-10-18 21:21:00 +02007664/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007665 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007666 * error handler.
7667 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007668 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007669 * -1 on other error.
7670 */
7671static int
7672encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007673 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007674 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007675{
Victor Stinner3a50e702011-10-18 21:21:00 +02007676 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007677 Py_ssize_t pos = unicode_offset;
7678 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007679 /* Ideally, we should get reason from FormatMessage. This is the Windows
7680 2000 English version of the message. */
7681 const char *reason = "invalid character";
7682 /* 4=maximum length of a UTF-8 sequence */
7683 char buffer[4];
7684 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7685 Py_ssize_t outsize;
7686 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007687 PyObject *errorHandler = NULL;
7688 PyObject *exc = NULL;
7689 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007690 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007691 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007692 PyObject *rep;
7693 int ret = -1;
7694
7695 assert(insize > 0);
7696
7697 encoding = code_page_name(code_page, &encoding_obj);
7698 if (encoding == NULL)
7699 return -1;
7700
7701 if (errors == NULL || strcmp(errors, "strict") == 0) {
7702 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7703 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007704 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007705 if (exc != NULL) {
7706 PyCodec_StrictErrors(exc);
7707 Py_DECREF(exc);
7708 }
7709 Py_XDECREF(encoding_obj);
7710 return -1;
7711 }
7712
7713 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7714 pusedDefaultChar = &usedDefaultChar;
7715 else
7716 pusedDefaultChar = NULL;
7717
7718 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7719 PyErr_NoMemory();
7720 goto error;
7721 }
7722 outsize = insize * Py_ARRAY_LENGTH(buffer);
7723
7724 if (*outbytes == NULL) {
7725 /* Create string object */
7726 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7727 if (*outbytes == NULL)
7728 goto error;
7729 out = PyBytes_AS_STRING(*outbytes);
7730 }
7731 else {
7732 /* Extend string object */
7733 Py_ssize_t n = PyBytes_Size(*outbytes);
7734 if (n > PY_SSIZE_T_MAX - outsize) {
7735 PyErr_NoMemory();
7736 goto error;
7737 }
7738 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7739 goto error;
7740 out = PyBytes_AS_STRING(*outbytes) + n;
7741 }
7742
7743 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007744 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007745 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007746 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7747 wchar_t chars[2];
7748 int charsize;
7749 if (ch < 0x10000) {
7750 chars[0] = (wchar_t)ch;
7751 charsize = 1;
7752 }
7753 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007754 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7755 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007756 charsize = 2;
7757 }
7758
Victor Stinner3a50e702011-10-18 21:21:00 +02007759 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007760 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007761 buffer, Py_ARRAY_LENGTH(buffer),
7762 NULL, pusedDefaultChar);
7763 if (outsize > 0) {
7764 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7765 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007766 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007767 memcpy(out, buffer, outsize);
7768 out += outsize;
7769 continue;
7770 }
7771 }
7772 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7773 PyErr_SetFromWindowsErr(0);
7774 goto error;
7775 }
7776
Victor Stinner3a50e702011-10-18 21:21:00 +02007777 rep = unicode_encode_call_errorhandler(
7778 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007779 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007780 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007781 if (rep == NULL)
7782 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007783 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007784
7785 if (PyBytes_Check(rep)) {
7786 outsize = PyBytes_GET_SIZE(rep);
7787 if (outsize != 1) {
7788 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7789 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7790 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7791 Py_DECREF(rep);
7792 goto error;
7793 }
7794 out = PyBytes_AS_STRING(*outbytes) + offset;
7795 }
7796 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7797 out += outsize;
7798 }
7799 else {
7800 Py_ssize_t i;
7801 enum PyUnicode_Kind kind;
7802 void *data;
7803
Benjamin Petersonbac79492012-01-14 13:34:47 -05007804 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007805 Py_DECREF(rep);
7806 goto error;
7807 }
7808
7809 outsize = PyUnicode_GET_LENGTH(rep);
7810 if (outsize != 1) {
7811 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7812 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7813 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7814 Py_DECREF(rep);
7815 goto error;
7816 }
7817 out = PyBytes_AS_STRING(*outbytes) + offset;
7818 }
7819 kind = PyUnicode_KIND(rep);
7820 data = PyUnicode_DATA(rep);
7821 for (i=0; i < outsize; i++) {
7822 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7823 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007824 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007825 encoding, unicode,
7826 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007827 "unable to encode error handler result to ASCII");
7828 Py_DECREF(rep);
7829 goto error;
7830 }
7831 *out = (unsigned char)ch;
7832 out++;
7833 }
7834 }
7835 Py_DECREF(rep);
7836 }
7837 /* write a NUL byte */
7838 *out = 0;
7839 outsize = out - PyBytes_AS_STRING(*outbytes);
7840 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7841 if (_PyBytes_Resize(outbytes, outsize) < 0)
7842 goto error;
7843 ret = 0;
7844
7845error:
7846 Py_XDECREF(encoding_obj);
7847 Py_XDECREF(errorHandler);
7848 Py_XDECREF(exc);
7849 return ret;
7850}
7851
Victor Stinner3a50e702011-10-18 21:21:00 +02007852static PyObject *
7853encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007854 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007855 const char *errors)
7856{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007857 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007858 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007859 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007860 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007861
Victor Stinner29dacf22015-01-26 16:41:32 +01007862 if (!PyUnicode_Check(unicode)) {
7863 PyErr_BadArgument();
7864 return NULL;
7865 }
7866
Benjamin Petersonbac79492012-01-14 13:34:47 -05007867 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007868 return NULL;
7869 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007870
Victor Stinner3a50e702011-10-18 21:21:00 +02007871 if (code_page < 0) {
7872 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7873 return NULL;
7874 }
7875
Martin v. Löwis3d325192011-11-04 18:23:06 +01007876 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007877 return PyBytes_FromStringAndSize(NULL, 0);
7878
Victor Stinner7581cef2011-11-03 22:32:33 +01007879 offset = 0;
7880 do
7881 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007882#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007883 if (len > DECODING_CHUNK_SIZE) {
7884 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007885 done = 0;
7886 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007887 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007888#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007889 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007890 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007891 done = 1;
7892 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007893
Victor Stinner76a31a62011-11-04 00:05:13 +01007894 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007895 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007896 errors);
7897 if (ret == -2)
7898 ret = encode_code_page_errors(code_page, &outbytes,
7899 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007900 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007901 if (ret < 0) {
7902 Py_XDECREF(outbytes);
7903 return NULL;
7904 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007905
Victor Stinner7581cef2011-11-03 22:32:33 +01007906 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007907 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007908 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007909
Victor Stinner3a50e702011-10-18 21:21:00 +02007910 return outbytes;
7911}
7912
7913PyObject *
7914PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7915 Py_ssize_t size,
7916 const char *errors)
7917{
Victor Stinner7581cef2011-11-03 22:32:33 +01007918 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007919 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007920 if (unicode == NULL)
7921 return NULL;
7922 res = encode_code_page(CP_ACP, unicode, errors);
7923 Py_DECREF(unicode);
7924 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007925}
7926
7927PyObject *
7928PyUnicode_EncodeCodePage(int code_page,
7929 PyObject *unicode,
7930 const char *errors)
7931{
Victor Stinner7581cef2011-11-03 22:32:33 +01007932 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007933}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007934
Alexander Belopolsky40018472011-02-26 01:02:56 +00007935PyObject *
7936PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007937{
Victor Stinner7581cef2011-11-03 22:32:33 +01007938 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007939}
7940
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007941#undef NEED_RETRY
7942
Steve Dowercc16be82016-09-08 10:35:16 -07007943#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007944
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945/* --- Character Mapping Codec -------------------------------------------- */
7946
Victor Stinnerfb161b12013-04-18 01:44:27 +02007947static int
7948charmap_decode_string(const char *s,
7949 Py_ssize_t size,
7950 PyObject *mapping,
7951 const char *errors,
7952 _PyUnicodeWriter *writer)
7953{
7954 const char *starts = s;
7955 const char *e;
7956 Py_ssize_t startinpos, endinpos;
7957 PyObject *errorHandler = NULL, *exc = NULL;
7958 Py_ssize_t maplen;
7959 enum PyUnicode_Kind mapkind;
7960 void *mapdata;
7961 Py_UCS4 x;
7962 unsigned char ch;
7963
7964 if (PyUnicode_READY(mapping) == -1)
7965 return -1;
7966
7967 maplen = PyUnicode_GET_LENGTH(mapping);
7968 mapdata = PyUnicode_DATA(mapping);
7969 mapkind = PyUnicode_KIND(mapping);
7970
7971 e = s + size;
7972
7973 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7974 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7975 * is disabled in encoding aliases, latin1 is preferred because
7976 * its implementation is faster. */
7977 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7978 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7979 Py_UCS4 maxchar = writer->maxchar;
7980
7981 assert (writer->kind == PyUnicode_1BYTE_KIND);
7982 while (s < e) {
7983 ch = *s;
7984 x = mapdata_ucs1[ch];
7985 if (x > maxchar) {
7986 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7987 goto onError;
7988 maxchar = writer->maxchar;
7989 outdata = (Py_UCS1 *)writer->data;
7990 }
7991 outdata[writer->pos] = x;
7992 writer->pos++;
7993 ++s;
7994 }
7995 return 0;
7996 }
7997
7998 while (s < e) {
7999 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8000 enum PyUnicode_Kind outkind = writer->kind;
8001 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
8002 if (outkind == PyUnicode_1BYTE_KIND) {
8003 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8004 Py_UCS4 maxchar = writer->maxchar;
8005 while (s < e) {
8006 ch = *s;
8007 x = mapdata_ucs2[ch];
8008 if (x > maxchar)
8009 goto Error;
8010 outdata[writer->pos] = x;
8011 writer->pos++;
8012 ++s;
8013 }
8014 break;
8015 }
8016 else if (outkind == PyUnicode_2BYTE_KIND) {
8017 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8018 while (s < e) {
8019 ch = *s;
8020 x = mapdata_ucs2[ch];
8021 if (x == 0xFFFE)
8022 goto Error;
8023 outdata[writer->pos] = x;
8024 writer->pos++;
8025 ++s;
8026 }
8027 break;
8028 }
8029 }
8030 ch = *s;
8031
8032 if (ch < maplen)
8033 x = PyUnicode_READ(mapkind, mapdata, ch);
8034 else
8035 x = 0xfffe; /* invalid value */
8036Error:
8037 if (x == 0xfffe)
8038 {
8039 /* undefined mapping */
8040 startinpos = s-starts;
8041 endinpos = startinpos+1;
8042 if (unicode_decode_call_errorhandler_writer(
8043 errors, &errorHandler,
8044 "charmap", "character maps to <undefined>",
8045 &starts, &e, &startinpos, &endinpos, &exc, &s,
8046 writer)) {
8047 goto onError;
8048 }
8049 continue;
8050 }
8051
8052 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8053 goto onError;
8054 ++s;
8055 }
8056 Py_XDECREF(errorHandler);
8057 Py_XDECREF(exc);
8058 return 0;
8059
8060onError:
8061 Py_XDECREF(errorHandler);
8062 Py_XDECREF(exc);
8063 return -1;
8064}
8065
8066static int
8067charmap_decode_mapping(const char *s,
8068 Py_ssize_t size,
8069 PyObject *mapping,
8070 const char *errors,
8071 _PyUnicodeWriter *writer)
8072{
8073 const char *starts = s;
8074 const char *e;
8075 Py_ssize_t startinpos, endinpos;
8076 PyObject *errorHandler = NULL, *exc = NULL;
8077 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008078 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008079
8080 e = s + size;
8081
8082 while (s < e) {
8083 ch = *s;
8084
8085 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8086 key = PyLong_FromLong((long)ch);
8087 if (key == NULL)
8088 goto onError;
8089
8090 item = PyObject_GetItem(mapping, key);
8091 Py_DECREF(key);
8092 if (item == NULL) {
8093 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8094 /* No mapping found means: mapping is undefined. */
8095 PyErr_Clear();
8096 goto Undefined;
8097 } else
8098 goto onError;
8099 }
8100
8101 /* Apply mapping */
8102 if (item == Py_None)
8103 goto Undefined;
8104 if (PyLong_Check(item)) {
8105 long value = PyLong_AS_LONG(item);
8106 if (value == 0xFFFE)
8107 goto Undefined;
8108 if (value < 0 || value > MAX_UNICODE) {
8109 PyErr_Format(PyExc_TypeError,
8110 "character mapping must be in range(0x%lx)",
8111 (unsigned long)MAX_UNICODE + 1);
8112 goto onError;
8113 }
8114
8115 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8116 goto onError;
8117 }
8118 else if (PyUnicode_Check(item)) {
8119 if (PyUnicode_READY(item) == -1)
8120 goto onError;
8121 if (PyUnicode_GET_LENGTH(item) == 1) {
8122 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8123 if (value == 0xFFFE)
8124 goto Undefined;
8125 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8126 goto onError;
8127 }
8128 else {
8129 writer->overallocate = 1;
8130 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8131 goto onError;
8132 }
8133 }
8134 else {
8135 /* wrong return value */
8136 PyErr_SetString(PyExc_TypeError,
8137 "character mapping must return integer, None or str");
8138 goto onError;
8139 }
8140 Py_CLEAR(item);
8141 ++s;
8142 continue;
8143
8144Undefined:
8145 /* undefined mapping */
8146 Py_CLEAR(item);
8147 startinpos = s-starts;
8148 endinpos = startinpos+1;
8149 if (unicode_decode_call_errorhandler_writer(
8150 errors, &errorHandler,
8151 "charmap", "character maps to <undefined>",
8152 &starts, &e, &startinpos, &endinpos, &exc, &s,
8153 writer)) {
8154 goto onError;
8155 }
8156 }
8157 Py_XDECREF(errorHandler);
8158 Py_XDECREF(exc);
8159 return 0;
8160
8161onError:
8162 Py_XDECREF(item);
8163 Py_XDECREF(errorHandler);
8164 Py_XDECREF(exc);
8165 return -1;
8166}
8167
Alexander Belopolsky40018472011-02-26 01:02:56 +00008168PyObject *
8169PyUnicode_DecodeCharmap(const char *s,
8170 Py_ssize_t size,
8171 PyObject *mapping,
8172 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008174 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008175
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176 /* Default to Latin-1 */
8177 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008181 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008182 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008183 writer.min_length = size;
8184 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008186
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008187 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008188 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8189 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008190 }
8191 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008192 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8193 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008195 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008196
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008198 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199 return NULL;
8200}
8201
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008202/* Charmap encoding: the lookup table */
8203
Alexander Belopolsky40018472011-02-26 01:02:56 +00008204struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 PyObject_HEAD
8206 unsigned char level1[32];
8207 int count2, count3;
8208 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008209};
8210
8211static PyObject*
8212encoding_map_size(PyObject *obj, PyObject* args)
8213{
8214 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008215 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217}
8218
8219static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008220 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 PyDoc_STR("Return the size (in bytes) of this object") },
8222 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008223};
8224
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008225static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008226 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 "EncodingMap", /*tp_name*/
8228 sizeof(struct encoding_map), /*tp_basicsize*/
8229 0, /*tp_itemsize*/
8230 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008231 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008232 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 0, /*tp_getattr*/
8234 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008235 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 0, /*tp_repr*/
8237 0, /*tp_as_number*/
8238 0, /*tp_as_sequence*/
8239 0, /*tp_as_mapping*/
8240 0, /*tp_hash*/
8241 0, /*tp_call*/
8242 0, /*tp_str*/
8243 0, /*tp_getattro*/
8244 0, /*tp_setattro*/
8245 0, /*tp_as_buffer*/
8246 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8247 0, /*tp_doc*/
8248 0, /*tp_traverse*/
8249 0, /*tp_clear*/
8250 0, /*tp_richcompare*/
8251 0, /*tp_weaklistoffset*/
8252 0, /*tp_iter*/
8253 0, /*tp_iternext*/
8254 encoding_map_methods, /*tp_methods*/
8255 0, /*tp_members*/
8256 0, /*tp_getset*/
8257 0, /*tp_base*/
8258 0, /*tp_dict*/
8259 0, /*tp_descr_get*/
8260 0, /*tp_descr_set*/
8261 0, /*tp_dictoffset*/
8262 0, /*tp_init*/
8263 0, /*tp_alloc*/
8264 0, /*tp_new*/
8265 0, /*tp_free*/
8266 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008267};
8268
8269PyObject*
8270PyUnicode_BuildEncodingMap(PyObject* string)
8271{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008272 PyObject *result;
8273 struct encoding_map *mresult;
8274 int i;
8275 int need_dict = 0;
8276 unsigned char level1[32];
8277 unsigned char level2[512];
8278 unsigned char *mlevel1, *mlevel2, *mlevel3;
8279 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008280 int kind;
8281 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008282 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008283 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008284
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008285 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008286 PyErr_BadArgument();
8287 return NULL;
8288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008289 kind = PyUnicode_KIND(string);
8290 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008291 length = PyUnicode_GET_LENGTH(string);
8292 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008293 memset(level1, 0xFF, sizeof level1);
8294 memset(level2, 0xFF, sizeof level2);
8295
8296 /* If there isn't a one-to-one mapping of NULL to \0,
8297 or if there are non-BMP characters, we need to use
8298 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008299 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008300 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008301 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008302 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008303 ch = PyUnicode_READ(kind, data, i);
8304 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008305 need_dict = 1;
8306 break;
8307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008308 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008309 /* unmapped character */
8310 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008311 l1 = ch >> 11;
8312 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008313 if (level1[l1] == 0xFF)
8314 level1[l1] = count2++;
8315 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008316 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317 }
8318
8319 if (count2 >= 0xFF || count3 >= 0xFF)
8320 need_dict = 1;
8321
8322 if (need_dict) {
8323 PyObject *result = PyDict_New();
8324 PyObject *key, *value;
8325 if (!result)
8326 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008327 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008328 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008329 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330 if (!key || !value)
8331 goto failed1;
8332 if (PyDict_SetItem(result, key, value) == -1)
8333 goto failed1;
8334 Py_DECREF(key);
8335 Py_DECREF(value);
8336 }
8337 return result;
8338 failed1:
8339 Py_XDECREF(key);
8340 Py_XDECREF(value);
8341 Py_DECREF(result);
8342 return NULL;
8343 }
8344
8345 /* Create a three-level trie */
8346 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8347 16*count2 + 128*count3 - 1);
8348 if (!result)
8349 return PyErr_NoMemory();
8350 PyObject_Init(result, &EncodingMapType);
8351 mresult = (struct encoding_map*)result;
8352 mresult->count2 = count2;
8353 mresult->count3 = count3;
8354 mlevel1 = mresult->level1;
8355 mlevel2 = mresult->level23;
8356 mlevel3 = mresult->level23 + 16*count2;
8357 memcpy(mlevel1, level1, 32);
8358 memset(mlevel2, 0xFF, 16*count2);
8359 memset(mlevel3, 0, 128*count3);
8360 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008361 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008362 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008363 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8364 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008365 /* unmapped character */
8366 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008367 o1 = ch>>11;
8368 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008369 i2 = 16*mlevel1[o1] + o2;
8370 if (mlevel2[i2] == 0xFF)
8371 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008372 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008373 i3 = 128*mlevel2[i2] + o3;
8374 mlevel3[i3] = i;
8375 }
8376 return result;
8377}
8378
8379static int
Victor Stinner22168992011-11-20 17:09:18 +01008380encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008381{
8382 struct encoding_map *map = (struct encoding_map*)mapping;
8383 int l1 = c>>11;
8384 int l2 = (c>>7) & 0xF;
8385 int l3 = c & 0x7F;
8386 int i;
8387
Victor Stinner22168992011-11-20 17:09:18 +01008388 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008390 if (c == 0)
8391 return 0;
8392 /* level 1*/
8393 i = map->level1[l1];
8394 if (i == 0xFF) {
8395 return -1;
8396 }
8397 /* level 2*/
8398 i = map->level23[16*i+l2];
8399 if (i == 0xFF) {
8400 return -1;
8401 }
8402 /* level 3 */
8403 i = map->level23[16*map->count2 + 128*i + l3];
8404 if (i == 0) {
8405 return -1;
8406 }
8407 return i;
8408}
8409
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410/* Lookup the character ch in the mapping. If the character
8411 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008412 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008413static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008414charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415{
Christian Heimes217cfd12007-12-02 14:31:20 +00008416 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 PyObject *x;
8418
8419 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 x = PyObject_GetItem(mapping, w);
8422 Py_DECREF(w);
8423 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8425 /* No mapping found means: mapping is undefined. */
8426 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008427 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 } else
8429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008431 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008433 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 long value = PyLong_AS_LONG(x);
8435 if (value < 0 || value > 255) {
8436 PyErr_SetString(PyExc_TypeError,
8437 "character mapping must be in range(256)");
8438 Py_DECREF(x);
8439 return NULL;
8440 }
8441 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008443 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 /* wrong return value */
8447 PyErr_Format(PyExc_TypeError,
8448 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008449 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 Py_DECREF(x);
8451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 }
8453}
8454
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008455static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008456charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008457{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008458 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8459 /* exponentially overallocate to minimize reallocations */
8460 if (requiredsize < 2*outsize)
8461 requiredsize = 2*outsize;
8462 if (_PyBytes_Resize(outobj, requiredsize))
8463 return -1;
8464 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008465}
8466
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a Python exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008470/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008471 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472 space is available. Return a new reference to the object that
8473 was put in the output buffer, or Py_None, if the mapping was undefined
8474 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008475 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008476static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008477charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008478 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008479{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008480 PyObject *rep;
8481 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008482 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008483
Andy Lesterdffe4c02020-03-04 07:15:20 -06008484 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008485 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008487 if (res == -1)
8488 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 if (outsize<requiredsize)
8490 if (charmapencode_resize(outobj, outpos, requiredsize))
8491 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008492 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 outstart[(*outpos)++] = (char)res;
8494 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008495 }
8496
8497 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008498 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008500 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 Py_DECREF(rep);
8502 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008503 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 if (PyLong_Check(rep)) {
8505 Py_ssize_t requiredsize = *outpos+1;
8506 if (outsize<requiredsize)
8507 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8508 Py_DECREF(rep);
8509 return enc_EXCEPTION;
8510 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008511 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008513 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 else {
8515 const char *repchars = PyBytes_AS_STRING(rep);
8516 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8517 Py_ssize_t requiredsize = *outpos+repsize;
8518 if (outsize<requiredsize)
8519 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8520 Py_DECREF(rep);
8521 return enc_EXCEPTION;
8522 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008523 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 memcpy(outstart + *outpos, repchars, repsize);
8525 *outpos += repsize;
8526 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008527 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008528 Py_DECREF(rep);
8529 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530}
8531
8532/* handle an error in PyUnicode_EncodeCharmap
8533 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008534static int
8535charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008536 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008538 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008539 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540{
8541 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008542 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008543 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008544 enum PyUnicode_Kind kind;
8545 void *data;
8546 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008547 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008548 Py_ssize_t collstartpos = *inpos;
8549 Py_ssize_t collendpos = *inpos+1;
8550 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008551 const char *encoding = "charmap";
8552 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008553 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008554 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008555 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556
Benjamin Petersonbac79492012-01-14 13:34:47 -05008557 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008558 return -1;
8559 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 /* find all unencodable characters */
8561 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008562 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008563 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008564 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008565 val = encoding_map_lookup(ch, mapping);
8566 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 break;
8568 ++collendpos;
8569 continue;
8570 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008571
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008572 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8573 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 if (rep==NULL)
8575 return -1;
8576 else if (rep!=Py_None) {
8577 Py_DECREF(rep);
8578 break;
8579 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008580 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582 }
8583 /* cache callback name lookup
8584 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008585 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008586 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008587
8588 switch (*error_handler) {
8589 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008590 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008591 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008592
8593 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008594 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 x = charmapencode_output('?', mapping, res, respos);
8596 if (x==enc_EXCEPTION) {
8597 return -1;
8598 }
8599 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008600 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 return -1;
8602 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008603 }
8604 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008605 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008606 *inpos = collendpos;
8607 break;
Victor Stinner50149202015-09-22 00:26:54 +02008608
8609 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008610 /* generate replacement (temporarily (mis)uses p) */
8611 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 char buffer[2+29+1+1];
8613 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008614 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 for (cp = buffer; *cp; ++cp) {
8616 x = charmapencode_output(*cp, mapping, res, respos);
8617 if (x==enc_EXCEPTION)
8618 return -1;
8619 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008620 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 return -1;
8622 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008623 }
8624 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008625 *inpos = collendpos;
8626 break;
Victor Stinner50149202015-09-22 00:26:54 +02008627
Benjamin Peterson14339b62009-01-31 16:36:08 +00008628 default:
Victor Stinner50149202015-09-22 00:26:54 +02008629 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008630 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008632 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008634 if (PyBytes_Check(repunicode)) {
8635 /* Directly copy bytes result to output. */
8636 Py_ssize_t outsize = PyBytes_Size(*res);
8637 Py_ssize_t requiredsize;
8638 repsize = PyBytes_Size(repunicode);
8639 requiredsize = *respos + repsize;
8640 if (requiredsize > outsize)
8641 /* Make room for all additional bytes. */
8642 if (charmapencode_resize(res, respos, requiredsize)) {
8643 Py_DECREF(repunicode);
8644 return -1;
8645 }
8646 memcpy(PyBytes_AsString(*res) + *respos,
8647 PyBytes_AsString(repunicode), repsize);
8648 *respos += repsize;
8649 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008650 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008651 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008652 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008653 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008654 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008655 Py_DECREF(repunicode);
8656 return -1;
8657 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008658 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008659 data = PyUnicode_DATA(repunicode);
8660 kind = PyUnicode_KIND(repunicode);
8661 for (index = 0; index < repsize; index++) {
8662 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8663 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008665 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 return -1;
8667 }
8668 else if (x==enc_FAILED) {
8669 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008670 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 return -1;
8672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008673 }
8674 *inpos = newpos;
8675 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 }
8677 return 0;
8678}
8679
Alexander Belopolsky40018472011-02-26 01:02:56 +00008680PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008681_PyUnicode_EncodeCharmap(PyObject *unicode,
8682 PyObject *mapping,
8683 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 /* output object */
8686 PyObject *res = NULL;
8687 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008688 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008689 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008691 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008692 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008694 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008695 void *data;
8696 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697
Benjamin Petersonbac79492012-01-14 13:34:47 -05008698 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008699 return NULL;
8700 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008701 data = PyUnicode_DATA(unicode);
8702 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008703
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 /* Default to Latin-1 */
8705 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008706 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008708 /* allocate enough for a simple encoding without
8709 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008710 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 if (res == NULL)
8712 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008713 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008717 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008719 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 if (x==enc_EXCEPTION) /* error */
8721 goto onError;
8722 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008723 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008725 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 &res, &respos)) {
8727 goto onError;
8728 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008729 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 else
8731 /* done with this character => adjust input position */
8732 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008735 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008736 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008737 if (_PyBytes_Resize(&res, respos) < 0)
8738 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008739
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008740 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008741 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008742 return res;
8743
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008745 Py_XDECREF(res);
8746 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008747 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 return NULL;
8749}
8750
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008751/* Deprecated */
8752PyObject *
8753PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8754 Py_ssize_t size,
8755 PyObject *mapping,
8756 const char *errors)
8757{
8758 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008759 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008760 if (unicode == NULL)
8761 return NULL;
8762 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8763 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008764 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008765}
8766
Alexander Belopolsky40018472011-02-26 01:02:56 +00008767PyObject *
8768PyUnicode_AsCharmapString(PyObject *unicode,
8769 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770{
8771 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 PyErr_BadArgument();
8773 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008775 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776}
8777
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008778/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008779static void
8780make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008781 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008782 Py_ssize_t startpos, Py_ssize_t endpos,
8783 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008784{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786 *exceptionObject = _PyUnicodeTranslateError_Create(
8787 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788 }
8789 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008790 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8791 goto onError;
8792 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8793 goto onError;
8794 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8795 goto onError;
8796 return;
8797 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008798 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799 }
8800}
8801
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008802/* error handling callback helper:
8803 build arguments, call the callback and check the arguments,
8804 put the result into newpos and return the replacement string, which
8805 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008806static PyObject *
8807unicode_translate_call_errorhandler(const char *errors,
8808 PyObject **errorHandler,
8809 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008810 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008811 Py_ssize_t startpos, Py_ssize_t endpos,
8812 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008813{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008814 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008815
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008816 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008817 PyObject *restuple;
8818 PyObject *resunicode;
8819
8820 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008822 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008824 }
8825
8826 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008828 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008830
Petr Viktorinffd97532020-02-11 17:46:57 +01008831 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008832 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008834 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008835 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 Py_DECREF(restuple);
8837 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008838 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008839 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 &resunicode, &i_newpos)) {
8841 Py_DECREF(restuple);
8842 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008843 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008844 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008846 else
8847 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008849 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 Py_DECREF(restuple);
8851 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008852 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008853 Py_INCREF(resunicode);
8854 Py_DECREF(restuple);
8855 return resunicode;
8856}
8857
8858/* Lookup the character ch in the mapping and put the result in result,
8859 which must be decrefed by the caller.
8860 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008861static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008863{
Christian Heimes217cfd12007-12-02 14:31:20 +00008864 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008865 PyObject *x;
8866
8867 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008868 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008869 x = PyObject_GetItem(mapping, w);
8870 Py_DECREF(w);
8871 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008872 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8873 /* No mapping found means: use 1:1 mapping. */
8874 PyErr_Clear();
8875 *result = NULL;
8876 return 0;
8877 } else
8878 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008879 }
8880 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 *result = x;
8882 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008883 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008884 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008886 if (value < 0 || value > MAX_UNICODE) {
8887 PyErr_Format(PyExc_ValueError,
8888 "character mapping must be in range(0x%x)",
8889 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 Py_DECREF(x);
8891 return -1;
8892 }
8893 *result = x;
8894 return 0;
8895 }
8896 else if (PyUnicode_Check(x)) {
8897 *result = x;
8898 return 0;
8899 }
8900 else {
8901 /* wrong return value */
8902 PyErr_SetString(PyExc_TypeError,
8903 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008904 Py_DECREF(x);
8905 return -1;
8906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008907}
Victor Stinner1194ea02014-04-04 19:37:40 +02008908
8909/* lookup the character, write the result into the writer.
8910 Return 1 if the result was written into the writer, return 0 if the mapping
8911 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008912static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008913charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8914 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008915{
Victor Stinner1194ea02014-04-04 19:37:40 +02008916 PyObject *item;
8917
8918 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008919 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008920
8921 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008923 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008925 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008926 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008927 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008928
8929 if (item == Py_None) {
8930 Py_DECREF(item);
8931 return 0;
8932 }
8933
8934 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008935 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8936 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8937 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008938 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8939 Py_DECREF(item);
8940 return -1;
8941 }
8942 Py_DECREF(item);
8943 return 1;
8944 }
8945
8946 if (!PyUnicode_Check(item)) {
8947 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008949 }
8950
8951 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8952 Py_DECREF(item);
8953 return -1;
8954 }
8955
8956 Py_DECREF(item);
8957 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008958}
8959
Victor Stinner89a76ab2014-04-05 11:44:04 +02008960static int
8961unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8962 Py_UCS1 *translate)
8963{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008964 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008965 int ret = 0;
8966
Victor Stinner89a76ab2014-04-05 11:44:04 +02008967 if (charmaptranslate_lookup(ch, mapping, &item)) {
8968 return -1;
8969 }
8970
8971 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008972 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008973 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008974 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008975 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008976 /* not found => default to 1:1 mapping */
8977 translate[ch] = ch;
8978 return 1;
8979 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008980 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008981 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008982 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8983 used it */
8984 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008985 /* invalid character or character outside ASCII:
8986 skip the fast translate */
8987 goto exit;
8988 }
8989 translate[ch] = (Py_UCS1)replace;
8990 }
8991 else if (PyUnicode_Check(item)) {
8992 Py_UCS4 replace;
8993
8994 if (PyUnicode_READY(item) == -1) {
8995 Py_DECREF(item);
8996 return -1;
8997 }
8998 if (PyUnicode_GET_LENGTH(item) != 1)
8999 goto exit;
9000
9001 replace = PyUnicode_READ_CHAR(item, 0);
9002 if (replace > 127)
9003 goto exit;
9004 translate[ch] = (Py_UCS1)replace;
9005 }
9006 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009007 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009008 goto exit;
9009 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009010 ret = 1;
9011
Benjamin Peterson1365de72014-04-07 20:15:41 -04009012 exit:
9013 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009014 return ret;
9015}
9016
9017/* Fast path for ascii => ascii translation. Return 1 if the whole string
9018 was translated into writer, return 0 if the input string was partially
9019 translated into writer, raise an exception and return -1 on error. */
9020static int
9021unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009022 _PyUnicodeWriter *writer, int ignore,
9023 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009024{
Victor Stinner872b2912014-04-05 14:27:07 +02009025 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009026 Py_ssize_t len;
9027 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009028 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009029
Victor Stinner89a76ab2014-04-05 11:44:04 +02009030 len = PyUnicode_GET_LENGTH(input);
9031
Victor Stinner872b2912014-04-05 14:27:07 +02009032 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009033
9034 in = PyUnicode_1BYTE_DATA(input);
9035 end = in + len;
9036
9037 assert(PyUnicode_IS_ASCII(writer->buffer));
9038 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9039 out = PyUnicode_1BYTE_DATA(writer->buffer);
9040
Victor Stinner872b2912014-04-05 14:27:07 +02009041 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009042 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009043 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009044 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009045 int translate = unicode_fast_translate_lookup(mapping, ch,
9046 ascii_table);
9047 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009048 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009049 if (translate == 0)
9050 goto exit;
9051 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009052 }
Victor Stinner872b2912014-04-05 14:27:07 +02009053 if (ch2 == 0xfe) {
9054 if (ignore)
9055 continue;
9056 goto exit;
9057 }
9058 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009059 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009060 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009061 }
Victor Stinner872b2912014-04-05 14:27:07 +02009062 res = 1;
9063
9064exit:
9065 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009066 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009067 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009068}
9069
Victor Stinner3222da22015-10-01 22:07:32 +02009070static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071_PyUnicode_TranslateCharmap(PyObject *input,
9072 PyObject *mapping,
9073 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02009076 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 Py_ssize_t size, i;
9078 int kind;
9079 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009080 _PyUnicodeWriter writer;
9081 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009082 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009083 PyObject *errorHandler = NULL;
9084 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009085 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009086 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009087
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 PyErr_BadArgument();
9090 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 if (PyUnicode_READY(input) == -1)
9094 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009095 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 kind = PyUnicode_KIND(input);
9097 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009099 if (size == 0)
9100 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009102 /* allocate enough for a simple 1:1 translation without
9103 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009104 _PyUnicodeWriter_Init(&writer);
9105 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009106 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107
Victor Stinner872b2912014-04-05 14:27:07 +02009108 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9109
Victor Stinner33798672016-03-01 21:59:58 +01009110 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009111 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009112 if (PyUnicode_IS_ASCII(input)) {
9113 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9114 if (res < 0) {
9115 _PyUnicodeWriter_Dealloc(&writer);
9116 return NULL;
9117 }
9118 if (res == 1)
9119 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009120 }
Victor Stinner33798672016-03-01 21:59:58 +01009121 else {
9122 i = 0;
9123 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009127 int translate;
9128 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9129 Py_ssize_t newpos;
9130 /* startpos for collecting untranslatable chars */
9131 Py_ssize_t collstart;
9132 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009133 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134
Victor Stinner1194ea02014-04-04 19:37:40 +02009135 ch = PyUnicode_READ(kind, data, i);
9136 translate = charmaptranslate_output(ch, mapping, &writer);
9137 if (translate < 0)
9138 goto onError;
9139
9140 if (translate != 0) {
9141 /* it worked => adjust input pointer */
9142 ++i;
9143 continue;
9144 }
9145
9146 /* untranslatable character */
9147 collstart = i;
9148 collend = i+1;
9149
9150 /* find all untranslatable characters */
9151 while (collend < size) {
9152 PyObject *x;
9153 ch = PyUnicode_READ(kind, data, collend);
9154 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009155 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009156 Py_XDECREF(x);
9157 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009158 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009159 ++collend;
9160 }
9161
9162 if (ignore) {
9163 i = collend;
9164 }
9165 else {
9166 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9167 reason, input, &exc,
9168 collstart, collend, &newpos);
9169 if (repunicode == NULL)
9170 goto onError;
9171 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009172 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009173 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009174 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009175 Py_DECREF(repunicode);
9176 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009177 }
9178 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009179 Py_XDECREF(exc);
9180 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009181 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182
Benjamin Peterson29060642009-01-31 22:14:21 +00009183 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009184 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009185 Py_XDECREF(exc);
9186 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009187 return NULL;
9188}
9189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190/* Deprecated. Use PyUnicode_Translate instead. */
9191PyObject *
9192PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9193 Py_ssize_t size,
9194 PyObject *mapping,
9195 const char *errors)
9196{
Christian Heimes5f520f42012-09-11 14:03:25 +02009197 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009198 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009199 if (!unicode)
9200 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009201 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9202 Py_DECREF(unicode);
9203 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009204}
9205
Alexander Belopolsky40018472011-02-26 01:02:56 +00009206PyObject *
9207PyUnicode_Translate(PyObject *str,
9208 PyObject *mapping,
9209 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009211 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009212 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009213 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214}
Tim Petersced69f82003-09-16 20:30:58 +00009215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216PyObject *
9217_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9218{
9219 if (!PyUnicode_Check(unicode)) {
9220 PyErr_BadInternalCall();
9221 return NULL;
9222 }
9223 if (PyUnicode_READY(unicode) == -1)
9224 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009225 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226 /* If the string is already ASCII, just return the same string */
9227 Py_INCREF(unicode);
9228 return unicode;
9229 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009230
9231 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9232 PyObject *result = PyUnicode_New(len, 127);
9233 if (result == NULL) {
9234 return NULL;
9235 }
9236
9237 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9238 int kind = PyUnicode_KIND(unicode);
9239 const void *data = PyUnicode_DATA(unicode);
9240 Py_ssize_t i;
9241 for (i = 0; i < len; ++i) {
9242 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9243 if (ch < 127) {
9244 out[i] = ch;
9245 }
9246 else if (Py_UNICODE_ISSPACE(ch)) {
9247 out[i] = ' ';
9248 }
9249 else {
9250 int decimal = Py_UNICODE_TODECIMAL(ch);
9251 if (decimal < 0) {
9252 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009253 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009254 _PyUnicode_LENGTH(result) = i + 1;
9255 break;
9256 }
9257 out[i] = '0' + decimal;
9258 }
9259 }
9260
INADA Naoki16dfca42018-07-14 12:06:43 +09009261 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009262 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263}
9264
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009265PyObject *
9266PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9267 Py_ssize_t length)
9268{
Victor Stinnerf0124502011-11-21 23:12:56 +01009269 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009270 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009271 Py_UCS4 maxchar;
9272 enum PyUnicode_Kind kind;
9273 void *data;
9274
Victor Stinner99d7ad02012-02-22 13:37:39 +01009275 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009276 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009277 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009278 if (ch > 127) {
9279 int decimal = Py_UNICODE_TODECIMAL(ch);
9280 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009281 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009282 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009283 }
9284 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009285
9286 /* Copy to a new string */
9287 decimal = PyUnicode_New(length, maxchar);
9288 if (decimal == NULL)
9289 return decimal;
9290 kind = PyUnicode_KIND(decimal);
9291 data = PyUnicode_DATA(decimal);
9292 /* Iterate over code points */
9293 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009294 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009295 if (ch > 127) {
9296 int decimal = Py_UNICODE_TODECIMAL(ch);
9297 if (decimal >= 0)
9298 ch = '0' + decimal;
9299 }
9300 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009302 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009303}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009304/* --- Decimal Encoder ---------------------------------------------------- */
9305
Alexander Belopolsky40018472011-02-26 01:02:56 +00009306int
9307PyUnicode_EncodeDecimal(Py_UNICODE *s,
9308 Py_ssize_t length,
9309 char *output,
9310 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009311{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009312 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009313 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009314 enum PyUnicode_Kind kind;
9315 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009316
9317 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009318 PyErr_BadArgument();
9319 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009320 }
9321
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009322 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009323 if (unicode == NULL)
9324 return -1;
9325
Victor Stinner42bf7752011-11-21 22:52:58 +01009326 kind = PyUnicode_KIND(unicode);
9327 data = PyUnicode_DATA(unicode);
9328
Victor Stinnerb84d7232011-11-22 01:50:07 +01009329 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009330 PyObject *exc;
9331 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009332 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009333 Py_ssize_t startpos;
9334
9335 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009336
Benjamin Peterson29060642009-01-31 22:14:21 +00009337 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009338 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009339 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009340 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009341 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009342 decimal = Py_UNICODE_TODECIMAL(ch);
9343 if (decimal >= 0) {
9344 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009345 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009346 continue;
9347 }
9348 if (0 < ch && ch < 256) {
9349 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009350 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 continue;
9352 }
Victor Stinner6345be92011-11-25 20:09:01 +01009353
Victor Stinner42bf7752011-11-21 22:52:58 +01009354 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009355 exc = NULL;
9356 raise_encode_exception(&exc, "decimal", unicode,
9357 startpos, startpos+1,
9358 "invalid decimal Unicode string");
9359 Py_XDECREF(exc);
9360 Py_DECREF(unicode);
9361 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009362 }
9363 /* 0-terminate the output string */
9364 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009365 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009366 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009367}
9368
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369/* --- Helpers ------------------------------------------------------------ */
9370
/* helper macro to fixup start/end slice values

   Normalizes the slice bounds start and end for a sequence of length len:
   an end greater than len is clamped to len, a negative start or end is
   counted from the end of the sequence and clamped to 0 if still negative.
   A start greater than len is left untouched.

   start and end must be modifiable lvalues.  All three arguments are
   evaluated more than once, so they must not have side effects.  The
   expansion is a single statement: use it as `ADJUST_INDICES(s, e, n);`. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009387any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009389 Py_ssize_t end,
9390 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009392 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 void *buf1, *buf2;
9394 Py_ssize_t len1, len2, result;
9395
9396 kind1 = PyUnicode_KIND(s1);
9397 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009398 if (kind1 < kind2)
9399 return -1;
9400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 len1 = PyUnicode_GET_LENGTH(s1);
9402 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009403 ADJUST_INDICES(start, end, len1);
9404 if (end - start < len2)
9405 return -1;
9406
9407 buf1 = PyUnicode_DATA(s1);
9408 buf2 = PyUnicode_DATA(s2);
9409 if (len2 == 1) {
9410 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9411 result = findchar((const char *)buf1 + kind1*start,
9412 kind1, end - start, ch, direction);
9413 if (result == -1)
9414 return -1;
9415 else
9416 return start + result;
9417 }
9418
9419 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009420 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009421 if (!buf2)
9422 return -2;
9423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424
Victor Stinner794d5672011-10-10 03:21:36 +02009425 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009426 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009427 case PyUnicode_1BYTE_KIND:
9428 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9429 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9430 else
9431 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9432 break;
9433 case PyUnicode_2BYTE_KIND:
9434 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9435 break;
9436 case PyUnicode_4BYTE_KIND:
9437 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9438 break;
9439 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009440 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009441 }
9442 }
9443 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009444 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009445 case PyUnicode_1BYTE_KIND:
9446 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9447 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9448 else
9449 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9450 break;
9451 case PyUnicode_2BYTE_KIND:
9452 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9453 break;
9454 case PyUnicode_4BYTE_KIND:
9455 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9456 break;
9457 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009458 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 }
9461
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009462 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 PyMem_Free(buf2);
9464
9465 return result;
9466}
9467
Victor Stinner59423e32018-11-26 13:40:01 +01009468/* _PyUnicode_InsertThousandsGrouping() helper functions */
9469#include "stringlib/localeutil.h"
9470
9471/**
9472 * InsertThousandsGrouping:
9473 * @writer: Unicode writer.
9474 * @n_buffer: Number of characters in @buffer.
9475 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9476 * @d_pos: Start of digits string.
9477 * @n_digits: The number of digits in the string, in which we want
9478 * to put the grouping chars.
9479 * @min_width: The minimum width of the digits in the output string.
9480 * Output will be zero-padded on the left to fill.
9481 * @grouping: see definition in localeconv().
9482 * @thousands_sep: see definition in localeconv().
9483 *
9484 * There are 2 modes: counting and filling. If @writer is NULL,
9485 * we are in counting mode, else filling mode.
9486 * If counting, the required buffer size is returned.
9487 * If filling, we know the buffer will be large enough, so we don't
9488 * need to pass in the buffer size.
9489 * Inserts thousand grouping characters (as defined by grouping and
9490 * thousands_sep) into @writer.
9491 *
9492 * Return value: -1 on error, number of characters otherwise.
9493 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009495_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009496 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009497 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009498 PyObject *digits,
9499 Py_ssize_t d_pos,
9500 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009501 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009502 const char *grouping,
9503 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009504 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505{
Xtreak3f7983a2019-01-07 20:39:14 +05309506 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009507 if (writer) {
9508 assert(digits != NULL);
9509 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009510 }
9511 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009512 assert(digits == NULL);
9513 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009514 }
Victor Stinner59423e32018-11-26 13:40:01 +01009515 assert(0 <= d_pos);
9516 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009517 assert(grouping != NULL);
9518
9519 if (digits != NULL) {
9520 if (PyUnicode_READY(digits) == -1) {
9521 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009522 }
Victor Stinner59423e32018-11-26 13:40:01 +01009523 }
9524 if (PyUnicode_READY(thousands_sep) == -1) {
9525 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009526 }
9527
Victor Stinner59423e32018-11-26 13:40:01 +01009528 Py_ssize_t count = 0;
9529 Py_ssize_t n_zeros;
9530 int loop_broken = 0;
9531 int use_separator = 0; /* First time through, don't append the
9532 separator. They only go between
9533 groups. */
9534 Py_ssize_t buffer_pos;
9535 Py_ssize_t digits_pos;
9536 Py_ssize_t len;
9537 Py_ssize_t n_chars;
9538 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9539 be looked at */
9540 /* A generator that returns all of the grouping widths, until it
9541 returns 0. */
9542 GroupGenerator groupgen;
9543 GroupGenerator_init(&groupgen, grouping);
9544 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9545
9546 /* if digits are not grouped, thousands separator
9547 should be an empty string */
9548 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9549
9550 digits_pos = d_pos + n_digits;
9551 if (writer) {
9552 buffer_pos = writer->pos + n_buffer;
9553 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9554 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 }
Victor Stinner59423e32018-11-26 13:40:01 +01009556 else {
9557 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009558 }
Victor Stinner59423e32018-11-26 13:40:01 +01009559
9560 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009561 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009562 }
Victor Stinner59423e32018-11-26 13:40:01 +01009563
9564 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9565 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9566 n_zeros = Py_MAX(0, len - remaining);
9567 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9568
9569 /* Use n_zero zero's and n_chars chars */
9570
9571 /* Count only, don't do anything. */
9572 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9573
9574 /* Copy into the writer. */
9575 InsertThousandsGrouping_fill(writer, &buffer_pos,
9576 digits, &digits_pos,
9577 n_chars, n_zeros,
9578 use_separator ? thousands_sep : NULL,
9579 thousands_sep_len, maxchar);
9580
9581 /* Use a separator next time. */
9582 use_separator = 1;
9583
9584 remaining -= n_chars;
9585 min_width -= len;
9586
9587 if (remaining <= 0 && min_width <= 0) {
9588 loop_broken = 1;
9589 break;
9590 }
9591 min_width -= thousands_sep_len;
9592 }
9593 if (!loop_broken) {
9594 /* We left the loop without using a break statement. */
9595
9596 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9597 n_zeros = Py_MAX(0, len - remaining);
9598 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9599
9600 /* Use n_zero zero's and n_chars chars */
9601 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9602
9603 /* Copy into the writer. */
9604 InsertThousandsGrouping_fill(writer, &buffer_pos,
9605 digits, &digits_pos,
9606 n_chars, n_zeros,
9607 use_separator ? thousands_sep : NULL,
9608 thousands_sep_len, maxchar);
9609 }
9610 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611}
9612
9613
Alexander Belopolsky40018472011-02-26 01:02:56 +00009614Py_ssize_t
9615PyUnicode_Count(PyObject *str,
9616 PyObject *substr,
9617 Py_ssize_t start,
9618 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009620 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009621 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 void *buf1 = NULL, *buf2 = NULL;
9623 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009624
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009625 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009626 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009627
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009628 kind1 = PyUnicode_KIND(str);
9629 kind2 = PyUnicode_KIND(substr);
9630 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009631 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009632
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009633 len1 = PyUnicode_GET_LENGTH(str);
9634 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009636 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009637 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009638
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009639 buf1 = PyUnicode_DATA(str);
9640 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009641 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009642 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009643 if (!buf2)
9644 goto onError;
9645 }
9646
9647 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009649 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009650 result = asciilib_count(
9651 ((Py_UCS1*)buf1) + start, end - start,
9652 buf2, len2, PY_SSIZE_T_MAX
9653 );
9654 else
9655 result = ucs1lib_count(
9656 ((Py_UCS1*)buf1) + start, end - start,
9657 buf2, len2, PY_SSIZE_T_MAX
9658 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 break;
9660 case PyUnicode_2BYTE_KIND:
9661 result = ucs2lib_count(
9662 ((Py_UCS2*)buf1) + start, end - start,
9663 buf2, len2, PY_SSIZE_T_MAX
9664 );
9665 break;
9666 case PyUnicode_4BYTE_KIND:
9667 result = ucs4lib_count(
9668 ((Py_UCS4*)buf1) + start, end - start,
9669 buf2, len2, PY_SSIZE_T_MAX
9670 );
9671 break;
9672 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009673 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009675
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009676 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 PyMem_Free(buf2);
9678
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009681 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 PyMem_Free(buf2);
9683 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684}
9685
Alexander Belopolsky40018472011-02-26 01:02:56 +00009686Py_ssize_t
9687PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009688 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009689 Py_ssize_t start,
9690 Py_ssize_t end,
9691 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009693 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009694 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009695
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009696 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697}
9698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699Py_ssize_t
9700PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9701 Py_ssize_t start, Py_ssize_t end,
9702 int direction)
9703{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009704 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009705 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 if (PyUnicode_READY(str) == -1)
9707 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009708 len = PyUnicode_GET_LENGTH(str);
9709 ADJUST_INDICES(start, end, len);
9710 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009711 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009713 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9714 kind, end-start, ch, direction);
9715 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009717 else
9718 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009719}
9720
Alexander Belopolsky40018472011-02-26 01:02:56 +00009721static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009722tailmatch(PyObject *self,
9723 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009724 Py_ssize_t start,
9725 Py_ssize_t end,
9726 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 int kind_self;
9729 int kind_sub;
9730 void *data_self;
9731 void *data_sub;
9732 Py_ssize_t offset;
9733 Py_ssize_t i;
9734 Py_ssize_t end_sub;
9735
9736 if (PyUnicode_READY(self) == -1 ||
9737 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009738 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9741 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009743 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009744
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009745 if (PyUnicode_GET_LENGTH(substring) == 0)
9746 return 1;
9747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 kind_self = PyUnicode_KIND(self);
9749 data_self = PyUnicode_DATA(self);
9750 kind_sub = PyUnicode_KIND(substring);
9751 data_sub = PyUnicode_DATA(substring);
9752 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9753
9754 if (direction > 0)
9755 offset = end;
9756 else
9757 offset = start;
9758
9759 if (PyUnicode_READ(kind_self, data_self, offset) ==
9760 PyUnicode_READ(kind_sub, data_sub, 0) &&
9761 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9762 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9763 /* If both are of the same kind, memcmp is sufficient */
9764 if (kind_self == kind_sub) {
9765 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009766 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 data_sub,
9768 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009769 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009771 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 else {
9773 /* We do not need to compare 0 and len(substring)-1 because
9774 the if statement above ensured already that they are equal
9775 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009776 for (i = 1; i < end_sub; ++i) {
9777 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9778 PyUnicode_READ(kind_sub, data_sub, i))
9779 return 0;
9780 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009781 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009783 }
9784
9785 return 0;
9786}
9787
Alexander Belopolsky40018472011-02-26 01:02:56 +00009788Py_ssize_t
9789PyUnicode_Tailmatch(PyObject *str,
9790 PyObject *substr,
9791 Py_ssize_t start,
9792 Py_ssize_t end,
9793 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009795 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009796 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009797
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009798 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009799}
9800
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009801static PyObject *
9802ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009804 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9805 char *resdata, *data = PyUnicode_DATA(self);
9806 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009807
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009808 res = PyUnicode_New(len, 127);
9809 if (res == NULL)
9810 return NULL;
9811 resdata = PyUnicode_DATA(res);
9812 if (lower)
9813 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009815 _Py_bytes_upper(resdata, data, len);
9816 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817}
9818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009820handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822 Py_ssize_t j;
9823 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009824 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009825 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009826
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009827 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9828
9829 where ! is a negation and \p{xxx} is a character with property xxx.
9830 */
9831 for (j = i - 1; j >= 0; j--) {
9832 c = PyUnicode_READ(kind, data, j);
9833 if (!_PyUnicode_IsCaseIgnorable(c))
9834 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009836 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9837 if (final_sigma) {
9838 for (j = i + 1; j < length; j++) {
9839 c = PyUnicode_READ(kind, data, j);
9840 if (!_PyUnicode_IsCaseIgnorable(c))
9841 break;
9842 }
9843 final_sigma = j == length || !_PyUnicode_IsCased(c);
9844 }
9845 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846}
9847
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009848static int
9849lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9850 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009852 /* Obscure special case. */
9853 if (c == 0x3A3) {
9854 mapped[0] = handle_capital_sigma(kind, data, length, i);
9855 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009857 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858}
9859
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009860static Py_ssize_t
9861do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009862{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009863 Py_ssize_t i, k = 0;
9864 int n_res, j;
9865 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009866
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009867 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009868 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009869 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009870 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009871 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009873 for (i = 1; i < length; i++) {
9874 c = PyUnicode_READ(kind, data, i);
9875 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9876 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009877 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009878 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009879 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009880 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009881 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882}
9883
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009884static Py_ssize_t
9885do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9886 Py_ssize_t i, k = 0;
9887
9888 for (i = 0; i < length; i++) {
9889 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9890 int n_res, j;
9891 if (Py_UNICODE_ISUPPER(c)) {
9892 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9893 }
9894 else if (Py_UNICODE_ISLOWER(c)) {
9895 n_res = _PyUnicode_ToUpperFull(c, mapped);
9896 }
9897 else {
9898 n_res = 1;
9899 mapped[0] = c;
9900 }
9901 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009902 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009903 res[k++] = mapped[j];
9904 }
9905 }
9906 return k;
9907}
9908
9909static Py_ssize_t
9910do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9911 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009913 Py_ssize_t i, k = 0;
9914
9915 for (i = 0; i < length; i++) {
9916 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9917 int n_res, j;
9918 if (lower)
9919 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9920 else
9921 n_res = _PyUnicode_ToUpperFull(c, mapped);
9922 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009923 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009924 res[k++] = mapped[j];
9925 }
9926 }
9927 return k;
9928}
9929
9930static Py_ssize_t
9931do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9932{
9933 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9934}
9935
9936static Py_ssize_t
9937do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9938{
9939 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9940}
9941
Benjamin Petersone51757f2012-01-12 21:10:29 -05009942static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009943do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9944{
9945 Py_ssize_t i, k = 0;
9946
9947 for (i = 0; i < length; i++) {
9948 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9949 Py_UCS4 mapped[3];
9950 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9951 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009952 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009953 res[k++] = mapped[j];
9954 }
9955 }
9956 return k;
9957}
9958
9959static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009960do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9961{
9962 Py_ssize_t i, k = 0;
9963 int previous_is_cased;
9964
9965 previous_is_cased = 0;
9966 for (i = 0; i < length; i++) {
9967 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9968 Py_UCS4 mapped[3];
9969 int n_res, j;
9970
9971 if (previous_is_cased)
9972 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9973 else
9974 n_res = _PyUnicode_ToTitleFull(c, mapped);
9975
9976 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009977 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009978 res[k++] = mapped[j];
9979 }
9980
9981 previous_is_cased = _PyUnicode_IsCased(c);
9982 }
9983 return k;
9984}
9985
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009986static PyObject *
9987case_operation(PyObject *self,
9988 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9989{
9990 PyObject *res = NULL;
9991 Py_ssize_t length, newlength = 0;
9992 int kind, outkind;
9993 void *data, *outdata;
9994 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9995
Benjamin Petersoneea48462012-01-16 14:28:50 -05009996 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009997
9998 kind = PyUnicode_KIND(self);
9999 data = PyUnicode_DATA(self);
10000 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010001 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010002 PyErr_SetString(PyExc_OverflowError, "string is too long");
10003 return NULL;
10004 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010005 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010006 if (tmp == NULL)
10007 return PyErr_NoMemory();
10008 newlength = perform(kind, data, length, tmp, &maxchar);
10009 res = PyUnicode_New(newlength, maxchar);
10010 if (res == NULL)
10011 goto leave;
10012 tmpend = tmp + newlength;
10013 outdata = PyUnicode_DATA(res);
10014 outkind = PyUnicode_KIND(res);
10015 switch (outkind) {
10016 case PyUnicode_1BYTE_KIND:
10017 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10018 break;
10019 case PyUnicode_2BYTE_KIND:
10020 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10021 break;
10022 case PyUnicode_4BYTE_KIND:
10023 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10024 break;
10025 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010026 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010027 }
10028 leave:
10029 PyMem_FREE(tmp);
10030 return res;
10031}
10032
Tim Peters8ce9f162004-08-27 01:49:32 +000010033PyObject *
10034PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010036 PyObject *res;
10037 PyObject *fseq;
10038 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010039 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010041 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010042 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010043 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010044 }
10045
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010046 /* NOTE: the following code can't call back into Python code,
10047 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010048 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010049
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010050 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010051 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010052 res = _PyUnicode_JoinArray(separator, items, seqlen);
10053 Py_DECREF(fseq);
10054 return res;
10055}
10056
10057PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010058_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010059{
10060 PyObject *res = NULL; /* the result */
10061 PyObject *sep = NULL;
10062 Py_ssize_t seplen;
10063 PyObject *item;
10064 Py_ssize_t sz, i, res_offset;
10065 Py_UCS4 maxchar;
10066 Py_UCS4 item_maxchar;
10067 int use_memcpy;
10068 unsigned char *res_data = NULL, *sep_data = NULL;
10069 PyObject *last_obj;
10070 unsigned int kind = 0;
10071
Tim Peters05eba1f2004-08-27 21:32:02 +000010072 /* If empty sequence, return u"". */
10073 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010074 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010075 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010076
Tim Peters05eba1f2004-08-27 21:32:02 +000010077 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010078 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010079 if (seqlen == 1) {
10080 if (PyUnicode_CheckExact(items[0])) {
10081 res = items[0];
10082 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010083 return res;
10084 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010085 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010086 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010087 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010088 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010089 /* Set up sep and seplen */
10090 if (separator == NULL) {
10091 /* fall back to a blank space separator */
10092 sep = PyUnicode_FromOrdinal(' ');
10093 if (!sep)
10094 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010095 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010096 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010097 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010098 else {
10099 if (!PyUnicode_Check(separator)) {
10100 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010101 "separator: expected str instance,"
10102 " %.80s found",
10103 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010104 goto onError;
10105 }
10106 if (PyUnicode_READY(separator))
10107 goto onError;
10108 sep = separator;
10109 seplen = PyUnicode_GET_LENGTH(separator);
10110 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10111 /* inc refcount to keep this code path symmetric with the
10112 above case of a blank separator */
10113 Py_INCREF(sep);
10114 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010115 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010116 }
10117
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010118 /* There are at least two things to join, or else we have a subclass
10119 * of str in the sequence.
10120 * Do a pre-pass to figure out the total amount of space we'll
10121 * need (sz), and see whether all argument are strings.
10122 */
10123 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010124#ifdef Py_DEBUG
10125 use_memcpy = 0;
10126#else
10127 use_memcpy = 1;
10128#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010129 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010130 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010131 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010132 if (!PyUnicode_Check(item)) {
10133 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010134 "sequence item %zd: expected str instance,"
10135 " %.80s found",
10136 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010137 goto onError;
10138 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 if (PyUnicode_READY(item) == -1)
10140 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010141 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010143 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010144 if (i != 0) {
10145 add_sz += seplen;
10146 }
10147 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010148 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010149 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010150 goto onError;
10151 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010152 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010153 if (use_memcpy && last_obj != NULL) {
10154 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10155 use_memcpy = 0;
10156 }
10157 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010158 }
Tim Petersced69f82003-09-16 20:30:58 +000010159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010161 if (res == NULL)
10162 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010163
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010164 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010165#ifdef Py_DEBUG
10166 use_memcpy = 0;
10167#else
10168 if (use_memcpy) {
10169 res_data = PyUnicode_1BYTE_DATA(res);
10170 kind = PyUnicode_KIND(res);
10171 if (seplen != 0)
10172 sep_data = PyUnicode_1BYTE_DATA(sep);
10173 }
10174#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010175 if (use_memcpy) {
10176 for (i = 0; i < seqlen; ++i) {
10177 Py_ssize_t itemlen;
10178 item = items[i];
10179
10180 /* Copy item, and maybe the separator. */
10181 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010182 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010183 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010184 kind * seplen);
10185 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010186 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010187
10188 itemlen = PyUnicode_GET_LENGTH(item);
10189 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010190 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010191 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010192 kind * itemlen);
10193 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010194 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010195 }
10196 assert(res_data == PyUnicode_1BYTE_DATA(res)
10197 + kind * PyUnicode_GET_LENGTH(res));
10198 }
10199 else {
10200 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10201 Py_ssize_t itemlen;
10202 item = items[i];
10203
10204 /* Copy item, and maybe the separator. */
10205 if (i && seplen != 0) {
10206 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10207 res_offset += seplen;
10208 }
10209
10210 itemlen = PyUnicode_GET_LENGTH(item);
10211 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010212 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010213 res_offset += itemlen;
10214 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010215 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010216 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010217 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010220 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222
Benjamin Peterson29060642009-01-31 22:14:21 +000010223 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010225 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226 return NULL;
10227}
10228
Victor Stinnerd3f08822012-05-29 12:57:52 +020010229void
10230_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10231 Py_UCS4 fill_char)
10232{
10233 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010234 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010235 assert(PyUnicode_IS_READY(unicode));
10236 assert(unicode_modifiable(unicode));
10237 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10238 assert(start >= 0);
10239 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010240 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010241}
10242
Victor Stinner3fe55312012-01-04 00:33:50 +010010243Py_ssize_t
10244PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10245 Py_UCS4 fill_char)
10246{
10247 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010248
10249 if (!PyUnicode_Check(unicode)) {
10250 PyErr_BadInternalCall();
10251 return -1;
10252 }
10253 if (PyUnicode_READY(unicode) == -1)
10254 return -1;
10255 if (unicode_check_modifiable(unicode))
10256 return -1;
10257
Victor Stinnerd3f08822012-05-29 12:57:52 +020010258 if (start < 0) {
10259 PyErr_SetString(PyExc_IndexError, "string index out of range");
10260 return -1;
10261 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010262 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10263 PyErr_SetString(PyExc_ValueError,
10264 "fill character is bigger than "
10265 "the string maximum character");
10266 return -1;
10267 }
10268
10269 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10270 length = Py_MIN(maxlen, length);
10271 if (length <= 0)
10272 return 0;
10273
Victor Stinnerd3f08822012-05-29 12:57:52 +020010274 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010275 return length;
10276}
10277
Victor Stinner9310abb2011-10-05 00:59:23 +020010278static PyObject *
10279pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010280 Py_ssize_t left,
10281 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010283{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 PyObject *u;
10285 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010286 int kind;
10287 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010288
10289 if (left < 0)
10290 left = 0;
10291 if (right < 0)
10292 right = 0;
10293
Victor Stinnerc4b49542011-12-11 22:44:26 +010010294 if (left == 0 && right == 0)
10295 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10298 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010299 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10300 return NULL;
10301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010303 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010305 if (!u)
10306 return NULL;
10307
10308 kind = PyUnicode_KIND(u);
10309 data = PyUnicode_DATA(u);
10310 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010311 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010312 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010313 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010314 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010315 assert(_PyUnicode_CheckConsistency(u, 1));
10316 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317}
10318
Alexander Belopolsky40018472011-02-26 01:02:56 +000010319PyObject *
10320PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010323
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010324 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326
Benjamin Petersonead6b532011-12-20 17:23:42 -060010327 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010329 if (PyUnicode_IS_ASCII(string))
10330 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010331 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010332 PyUnicode_GET_LENGTH(string), keepends);
10333 else
10334 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010335 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010336 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 break;
10338 case PyUnicode_2BYTE_KIND:
10339 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010340 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 PyUnicode_GET_LENGTH(string), keepends);
10342 break;
10343 case PyUnicode_4BYTE_KIND:
10344 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010345 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 PyUnicode_GET_LENGTH(string), keepends);
10347 break;
10348 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010349 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352}
10353
Alexander Belopolsky40018472011-02-26 01:02:56 +000010354static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010355split(PyObject *self,
10356 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010357 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010359 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 void *buf1, *buf2;
10361 Py_ssize_t len1, len2;
10362 PyObject* out;
10363
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010365 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 if (PyUnicode_READY(self) == -1)
10368 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010371 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010373 if (PyUnicode_IS_ASCII(self))
10374 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010375 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010376 PyUnicode_GET_LENGTH(self), maxcount
10377 );
10378 else
10379 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010380 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010381 PyUnicode_GET_LENGTH(self), maxcount
10382 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 case PyUnicode_2BYTE_KIND:
10384 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010385 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 PyUnicode_GET_LENGTH(self), maxcount
10387 );
10388 case PyUnicode_4BYTE_KIND:
10389 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010390 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 PyUnicode_GET_LENGTH(self), maxcount
10392 );
10393 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010394 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 }
10396
10397 if (PyUnicode_READY(substring) == -1)
10398 return NULL;
10399
10400 kind1 = PyUnicode_KIND(self);
10401 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 len1 = PyUnicode_GET_LENGTH(self);
10403 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010404 if (kind1 < kind2 || len1 < len2) {
10405 out = PyList_New(1);
10406 if (out == NULL)
10407 return NULL;
10408 Py_INCREF(self);
10409 PyList_SET_ITEM(out, 0, self);
10410 return out;
10411 }
10412 buf1 = PyUnicode_DATA(self);
10413 buf2 = PyUnicode_DATA(substring);
10414 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010415 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010416 if (!buf2)
10417 return NULL;
10418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010420 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010422 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10423 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010424 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010425 else
10426 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010427 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 break;
10429 case PyUnicode_2BYTE_KIND:
10430 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010431 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 break;
10433 case PyUnicode_4BYTE_KIND:
10434 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010435 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 break;
10437 default:
10438 out = NULL;
10439 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010440 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 PyMem_Free(buf2);
10442 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443}
10444
Alexander Belopolsky40018472011-02-26 01:02:56 +000010445static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010446rsplit(PyObject *self,
10447 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010448 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010449{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010450 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 void *buf1, *buf2;
10452 Py_ssize_t len1, len2;
10453 PyObject* out;
10454
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010455 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010456 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 if (PyUnicode_READY(self) == -1)
10459 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010462 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010464 if (PyUnicode_IS_ASCII(self))
10465 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010466 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010467 PyUnicode_GET_LENGTH(self), maxcount
10468 );
10469 else
10470 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010471 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010472 PyUnicode_GET_LENGTH(self), maxcount
10473 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 case PyUnicode_2BYTE_KIND:
10475 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010476 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 PyUnicode_GET_LENGTH(self), maxcount
10478 );
10479 case PyUnicode_4BYTE_KIND:
10480 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010481 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 PyUnicode_GET_LENGTH(self), maxcount
10483 );
10484 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010485 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 }
10487
10488 if (PyUnicode_READY(substring) == -1)
10489 return NULL;
10490
10491 kind1 = PyUnicode_KIND(self);
10492 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 len1 = PyUnicode_GET_LENGTH(self);
10494 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010495 if (kind1 < kind2 || len1 < len2) {
10496 out = PyList_New(1);
10497 if (out == NULL)
10498 return NULL;
10499 Py_INCREF(self);
10500 PyList_SET_ITEM(out, 0, self);
10501 return out;
10502 }
10503 buf1 = PyUnicode_DATA(self);
10504 buf2 = PyUnicode_DATA(substring);
10505 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010506 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010507 if (!buf2)
10508 return NULL;
10509 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010511 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010513 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10514 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010515 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010516 else
10517 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010518 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 break;
10520 case PyUnicode_2BYTE_KIND:
10521 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010522 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 break;
10524 case PyUnicode_4BYTE_KIND:
10525 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010526 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 break;
10528 default:
10529 out = NULL;
10530 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010531 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 PyMem_Free(buf2);
10533 return out;
10534}
10535
10536static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010537anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10538 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010540 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010542 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10543 return asciilib_find(buf1, len1, buf2, len2, offset);
10544 else
10545 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 case PyUnicode_2BYTE_KIND:
10547 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10548 case PyUnicode_4BYTE_KIND:
10549 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10550 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010551 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552}
10553
10554static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010555anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10556 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010558 switch (kind) {
10559 case PyUnicode_1BYTE_KIND:
10560 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10561 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10562 else
10563 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10564 case PyUnicode_2BYTE_KIND:
10565 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10566 case PyUnicode_4BYTE_KIND:
10567 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10568 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010569 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010570}
10571
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010572static void
10573replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10574 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10575{
10576 int kind = PyUnicode_KIND(u);
10577 void *data = PyUnicode_DATA(u);
10578 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10579 if (kind == PyUnicode_1BYTE_KIND) {
10580 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10581 (Py_UCS1 *)data + len,
10582 u1, u2, maxcount);
10583 }
10584 else if (kind == PyUnicode_2BYTE_KIND) {
10585 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10586 (Py_UCS2 *)data + len,
10587 u1, u2, maxcount);
10588 }
10589 else {
10590 assert(kind == PyUnicode_4BYTE_KIND);
10591 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10592 (Py_UCS4 *)data + len,
10593 u1, u2, maxcount);
10594 }
10595}
10596
Alexander Belopolsky40018472011-02-26 01:02:56 +000010597static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598replace(PyObject *self, PyObject *str1,
10599 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 PyObject *u;
10602 char *sbuf = PyUnicode_DATA(self);
10603 char *buf1 = PyUnicode_DATA(str1);
10604 char *buf2 = PyUnicode_DATA(str2);
10605 int srelease = 0, release1 = 0, release2 = 0;
10606 int skind = PyUnicode_KIND(self);
10607 int kind1 = PyUnicode_KIND(str1);
10608 int kind2 = PyUnicode_KIND(str2);
10609 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10610 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10611 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010612 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010613 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010615 if (slen < len1)
10616 goto nothing;
10617
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010619 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010620 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010621 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622
Victor Stinner59de0ee2011-10-07 10:01:28 +020010623 if (str1 == str2)
10624 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625
Victor Stinner49a0a212011-10-12 23:46:10 +020010626 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010627 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10628 if (maxchar < maxchar_str1)
10629 /* substring too wide to be present */
10630 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010631 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10632 /* Replacing str1 with str2 may cause a maxchar reduction in the
10633 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010634 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010635 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010638 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010640 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010642 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010643 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010644 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010645
Victor Stinner69ed0f42013-04-09 21:48:24 +020010646 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010647 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010648 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010650 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010654
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010655 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10656 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010657 }
10658 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 int rkind = skind;
10660 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010661 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 if (kind1 < rkind) {
10664 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010665 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 if (!buf1) goto error;
10667 release1 = 1;
10668 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010669 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 if (i < 0)
10671 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 if (rkind > kind2) {
10673 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010674 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 if (!buf2) goto error;
10676 release2 = 1;
10677 }
10678 else if (rkind < kind2) {
10679 /* widen self and buf1 */
10680 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010681 if (release1) {
10682 PyMem_Free(buf1);
10683 buf1 = PyUnicode_DATA(str1);
10684 release1 = 0;
10685 }
10686 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 if (!sbuf) goto error;
10688 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010689 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (!buf1) goto error;
10691 release1 = 1;
10692 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010693 u = PyUnicode_New(slen, maxchar);
10694 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010696 assert(PyUnicode_KIND(u) == rkind);
10697 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010698
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010699 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010700 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010701 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010703 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010705
10706 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010707 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010708 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010709 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010710 if (i == -1)
10711 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010712 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010714 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010718 }
10719 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010721 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 int rkind = skind;
10723 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010726 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010727 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 if (!buf1) goto error;
10729 release1 = 1;
10730 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010731 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010732 if (n == 0)
10733 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010735 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010736 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 if (!buf2) goto error;
10738 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010741 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010743 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 if (!sbuf) goto error;
10745 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010746 if (release1) {
10747 PyMem_Free(buf1);
10748 buf1 = PyUnicode_DATA(str1);
10749 release1 = 0;
10750 }
10751 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 if (!buf1) goto error;
10753 release1 = 1;
10754 }
10755 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10756 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010757 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 PyErr_SetString(PyExc_OverflowError,
10759 "replace string is too long");
10760 goto error;
10761 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010762 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010763 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010764 _Py_INCREF_UNICODE_EMPTY();
10765 if (!unicode_empty)
10766 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010767 u = unicode_empty;
10768 goto done;
10769 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010770 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 PyErr_SetString(PyExc_OverflowError,
10772 "replace string is too long");
10773 goto error;
10774 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010775 u = PyUnicode_New(new_size, maxchar);
10776 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010778 assert(PyUnicode_KIND(u) == rkind);
10779 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 ires = i = 0;
10781 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010782 while (n-- > 0) {
10783 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010784 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010785 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010786 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010787 if (j == -1)
10788 break;
10789 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010790 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010791 memcpy(res + rkind * ires,
10792 sbuf + rkind * i,
10793 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010795 }
10796 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010798 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010800 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010802 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010806 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010807 memcpy(res + rkind * ires,
10808 sbuf + rkind * i,
10809 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010810 }
10811 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010812 /* interleave */
10813 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010814 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010816 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010818 if (--n <= 0)
10819 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010820 memcpy(res + rkind * ires,
10821 sbuf + rkind * i,
10822 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 ires++;
10824 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010825 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010826 memcpy(res + rkind * ires,
10827 sbuf + rkind * i,
10828 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010829 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010830 }
10831
10832 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010833 unicode_adjust_maxchar(&u);
10834 if (u == NULL)
10835 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010837
10838 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 if (srelease)
10840 PyMem_FREE(sbuf);
10841 if (release1)
10842 PyMem_FREE(buf1);
10843 if (release2)
10844 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010845 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010847
Benjamin Peterson29060642009-01-31 22:14:21 +000010848 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010849 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 if (srelease)
10851 PyMem_FREE(sbuf);
10852 if (release1)
10853 PyMem_FREE(buf1);
10854 if (release2)
10855 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010856 return unicode_result_unchanged(self);
10857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 error:
10859 if (srelease && sbuf)
10860 PyMem_FREE(sbuf);
10861 if (release1 && buf1)
10862 PyMem_FREE(buf1);
10863 if (release2 && buf2)
10864 PyMem_FREE(buf2);
10865 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866}
10867
10868/* --- Unicode Object Methods --------------------------------------------- */
10869
INADA Naoki3ae20562017-01-16 20:41:20 +090010870/*[clinic input]
10871str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872
INADA Naoki3ae20562017-01-16 20:41:20 +090010873Return a version of the string where each word is titlecased.
10874
10875More specifically, words start with uppercased characters and all remaining
10876cased characters have lower case.
10877[clinic start generated code]*/
10878
10879static PyObject *
10880unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010881/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010883 if (PyUnicode_READY(self) == -1)
10884 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010885 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886}
10887
INADA Naoki3ae20562017-01-16 20:41:20 +090010888/*[clinic input]
10889str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
INADA Naoki3ae20562017-01-16 20:41:20 +090010891Return a capitalized version of the string.
10892
10893More specifically, make the first character have upper case and the rest lower
10894case.
10895[clinic start generated code]*/
10896
10897static PyObject *
10898unicode_capitalize_impl(PyObject *self)
10899/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010900{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010901 if (PyUnicode_READY(self) == -1)
10902 return NULL;
10903 if (PyUnicode_GET_LENGTH(self) == 0)
10904 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010905 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906}
10907
INADA Naoki3ae20562017-01-16 20:41:20 +090010908/*[clinic input]
10909str.casefold as unicode_casefold
10910
10911Return a version of the string suitable for caseless comparisons.
10912[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010913
10914static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010915unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010916/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010917{
10918 if (PyUnicode_READY(self) == -1)
10919 return NULL;
10920 if (PyUnicode_IS_ASCII(self))
10921 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010922 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010923}
10924
10925
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010926/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010927
10928static int
10929convert_uc(PyObject *obj, void *addr)
10930{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010932
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010933 if (!PyUnicode_Check(obj)) {
10934 PyErr_Format(PyExc_TypeError,
10935 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010936 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010937 return 0;
10938 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010939 if (PyUnicode_READY(obj) < 0)
10940 return 0;
10941 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010942 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010943 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010944 return 0;
10945 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010946 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010947 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010948}
10949
INADA Naoki3ae20562017-01-16 20:41:20 +090010950/*[clinic input]
10951str.center as unicode_center
10952
10953 width: Py_ssize_t
10954 fillchar: Py_UCS4 = ' '
10955 /
10956
10957Return a centered string of length width.
10958
10959Padding is done using the specified fill character (default is a space).
10960[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961
10962static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010963unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10964/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010966 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967
Benjamin Petersonbac79492012-01-14 13:34:47 -050010968 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969 return NULL;
10970
Victor Stinnerc4b49542011-12-11 22:44:26 +010010971 if (PyUnicode_GET_LENGTH(self) >= width)
10972 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973
Victor Stinnerc4b49542011-12-11 22:44:26 +010010974 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975 left = marg / 2 + (marg & width & 1);
10976
Victor Stinner9310abb2011-10-05 00:59:23 +020010977 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978}
10979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980/* This function assumes that str1 and str2 are readied by the caller. */
10981
Marc-André Lemburge5034372000-08-08 08:04:29 +000010982static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010983unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010984{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010985#define COMPARE(TYPE1, TYPE2) \
10986 do { \
10987 TYPE1* p1 = (TYPE1 *)data1; \
10988 TYPE2* p2 = (TYPE2 *)data2; \
10989 TYPE1* end = p1 + len; \
10990 Py_UCS4 c1, c2; \
10991 for (; p1 != end; p1++, p2++) { \
10992 c1 = *p1; \
10993 c2 = *p2; \
10994 if (c1 != c2) \
10995 return (c1 < c2) ? -1 : 1; \
10996 } \
10997 } \
10998 while (0)
10999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 int kind1, kind2;
11001 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011002 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 kind1 = PyUnicode_KIND(str1);
11005 kind2 = PyUnicode_KIND(str2);
11006 data1 = PyUnicode_DATA(str1);
11007 data2 = PyUnicode_DATA(str2);
11008 len1 = PyUnicode_GET_LENGTH(str1);
11009 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011010 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011011
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011012 switch(kind1) {
11013 case PyUnicode_1BYTE_KIND:
11014 {
11015 switch(kind2) {
11016 case PyUnicode_1BYTE_KIND:
11017 {
11018 int cmp = memcmp(data1, data2, len);
11019 /* normalize result of memcmp() into the range [-1; 1] */
11020 if (cmp < 0)
11021 return -1;
11022 if (cmp > 0)
11023 return 1;
11024 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011025 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011026 case PyUnicode_2BYTE_KIND:
11027 COMPARE(Py_UCS1, Py_UCS2);
11028 break;
11029 case PyUnicode_4BYTE_KIND:
11030 COMPARE(Py_UCS1, Py_UCS4);
11031 break;
11032 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011033 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011034 }
11035 break;
11036 }
11037 case PyUnicode_2BYTE_KIND:
11038 {
11039 switch(kind2) {
11040 case PyUnicode_1BYTE_KIND:
11041 COMPARE(Py_UCS2, Py_UCS1);
11042 break;
11043 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011044 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011045 COMPARE(Py_UCS2, Py_UCS2);
11046 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011047 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011048 case PyUnicode_4BYTE_KIND:
11049 COMPARE(Py_UCS2, Py_UCS4);
11050 break;
11051 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011052 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011053 }
11054 break;
11055 }
11056 case PyUnicode_4BYTE_KIND:
11057 {
11058 switch(kind2) {
11059 case PyUnicode_1BYTE_KIND:
11060 COMPARE(Py_UCS4, Py_UCS1);
11061 break;
11062 case PyUnicode_2BYTE_KIND:
11063 COMPARE(Py_UCS4, Py_UCS2);
11064 break;
11065 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011066 {
11067#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11068 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11069 /* normalize result of wmemcmp() into the range [-1; 1] */
11070 if (cmp < 0)
11071 return -1;
11072 if (cmp > 0)
11073 return 1;
11074#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011075 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011076#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011077 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011078 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011079 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011080 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011081 }
11082 break;
11083 }
11084 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011085 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011086 }
11087
Victor Stinner770e19e2012-10-04 22:59:45 +020011088 if (len1 == len2)
11089 return 0;
11090 if (len1 < len2)
11091 return -1;
11092 else
11093 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011094
11095#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011096}
11097
Benjamin Peterson621b4302016-09-09 13:54:34 -070011098static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011099unicode_compare_eq(PyObject *str1, PyObject *str2)
11100{
11101 int kind;
11102 void *data1, *data2;
11103 Py_ssize_t len;
11104 int cmp;
11105
Victor Stinnere5567ad2012-10-23 02:48:49 +020011106 len = PyUnicode_GET_LENGTH(str1);
11107 if (PyUnicode_GET_LENGTH(str2) != len)
11108 return 0;
11109 kind = PyUnicode_KIND(str1);
11110 if (PyUnicode_KIND(str2) != kind)
11111 return 0;
11112 data1 = PyUnicode_DATA(str1);
11113 data2 = PyUnicode_DATA(str2);
11114
11115 cmp = memcmp(data1, data2, len * kind);
11116 return (cmp == 0);
11117}
11118
11119
Alexander Belopolsky40018472011-02-26 01:02:56 +000011120int
11121PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11124 if (PyUnicode_READY(left) == -1 ||
11125 PyUnicode_READY(right) == -1)
11126 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011127
11128 /* a string is equal to itself */
11129 if (left == right)
11130 return 0;
11131
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011132 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011134 PyErr_Format(PyExc_TypeError,
11135 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011136 Py_TYPE(left)->tp_name,
11137 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138 return -1;
11139}
11140
Martin v. Löwis5b222132007-06-10 09:51:05 +000011141int
11142PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11143{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 Py_ssize_t i;
11145 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011147 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148
Victor Stinner910337b2011-10-03 03:20:16 +020011149 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011150 if (!PyUnicode_IS_READY(uni)) {
11151 const wchar_t *ws = _PyUnicode_WSTR(uni);
11152 /* Compare Unicode string and source character set string */
11153 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11154 if (chr != ustr[i])
11155 return (chr < ustr[i]) ? -1 : 1;
11156 }
11157 /* This check keeps Python strings that end in '\0' from comparing equal
11158 to C strings identical up to that point. */
11159 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11160 return 1; /* uni is longer */
11161 if (ustr[i])
11162 return -1; /* str is longer */
11163 return 0;
11164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011166 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011167 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011168 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011169 size_t len, len2 = strlen(str);
11170 int cmp;
11171
11172 len = Py_MIN(len1, len2);
11173 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011174 if (cmp != 0) {
11175 if (cmp < 0)
11176 return -1;
11177 else
11178 return 1;
11179 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011180 if (len1 > len2)
11181 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011182 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011183 return -1; /* str is longer */
11184 return 0;
11185 }
11186 else {
11187 void *data = PyUnicode_DATA(uni);
11188 /* Compare Unicode string and source character set string */
11189 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011190 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011191 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11192 /* This check keeps Python strings that end in '\0' from comparing equal
11193 to C strings identical up to that point. */
11194 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11195 return 1; /* uni is longer */
11196 if (str[i])
11197 return -1; /* str is longer */
11198 return 0;
11199 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011200}
11201
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011202static int
11203non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11204{
11205 size_t i, len;
11206 const wchar_t *p;
11207 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11208 if (strlen(str) != len)
11209 return 0;
11210 p = _PyUnicode_WSTR(unicode);
11211 assert(p);
11212 for (i = 0; i < len; i++) {
11213 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011214 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011215 return 0;
11216 }
11217 return 1;
11218}
11219
11220int
11221_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11222{
11223 size_t len;
11224 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011225 assert(str);
11226#ifndef NDEBUG
11227 for (const char *p = str; *p; p++) {
11228 assert((unsigned char)*p < 128);
11229 }
11230#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011231 if (PyUnicode_READY(unicode) == -1) {
11232 /* Memory error or bad data */
11233 PyErr_Clear();
11234 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11235 }
11236 if (!PyUnicode_IS_ASCII(unicode))
11237 return 0;
11238 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11239 return strlen(str) == len &&
11240 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11241}
11242
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011243int
11244_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11245{
11246 PyObject *right_uni;
11247 Py_hash_t hash;
11248
11249 assert(_PyUnicode_CHECK(left));
11250 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011251#ifndef NDEBUG
11252 for (const char *p = right->string; *p; p++) {
11253 assert((unsigned char)*p < 128);
11254 }
11255#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011256
11257 if (PyUnicode_READY(left) == -1) {
11258 /* memory error or bad data */
11259 PyErr_Clear();
11260 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11261 }
11262
11263 if (!PyUnicode_IS_ASCII(left))
11264 return 0;
11265
11266 right_uni = _PyUnicode_FromId(right); /* borrowed */
11267 if (right_uni == NULL) {
11268 /* memory error or bad data */
11269 PyErr_Clear();
11270 return _PyUnicode_EqualToASCIIString(left, right->string);
11271 }
11272
11273 if (left == right_uni)
11274 return 1;
11275
11276 if (PyUnicode_CHECK_INTERNED(left))
11277 return 0;
11278
INADA Naoki7cc95f52018-01-28 02:07:09 +090011279 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011280 hash = _PyUnicode_HASH(left);
11281 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11282 return 0;
11283
11284 return unicode_compare_eq(left, right_uni);
11285}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011286
Alexander Belopolsky40018472011-02-26 01:02:56 +000011287PyObject *
11288PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011289{
11290 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011291
Victor Stinnere5567ad2012-10-23 02:48:49 +020011292 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11293 Py_RETURN_NOTIMPLEMENTED;
11294
11295 if (PyUnicode_READY(left) == -1 ||
11296 PyUnicode_READY(right) == -1)
11297 return NULL;
11298
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011299 if (left == right) {
11300 switch (op) {
11301 case Py_EQ:
11302 case Py_LE:
11303 case Py_GE:
11304 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011305 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011306 case Py_NE:
11307 case Py_LT:
11308 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011309 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011310 default:
11311 PyErr_BadArgument();
11312 return NULL;
11313 }
11314 }
11315 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011316 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011317 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011318 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011319 }
11320 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011321 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011322 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011323 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011324}
11325
Alexander Belopolsky40018472011-02-26 01:02:56 +000011326int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011327_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11328{
11329 return unicode_eq(aa, bb);
11330}
11331
11332int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011333PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011334{
Victor Stinner77282cb2013-04-14 19:22:47 +020011335 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 void *buf1, *buf2;
11337 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011338 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011339
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011340 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011341 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011342 "'in <string>' requires string as left operand, not %.100s",
11343 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011344 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011345 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011346 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011347 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011348 if (ensure_unicode(str) < 0)
11349 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011352 kind2 = PyUnicode_KIND(substr);
11353 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011354 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011356 len2 = PyUnicode_GET_LENGTH(substr);
11357 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011358 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011359 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011360 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011361 if (len2 == 1) {
11362 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11363 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011364 return result;
11365 }
11366 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011367 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011368 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011369 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371
Victor Stinner77282cb2013-04-14 19:22:47 +020011372 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 case PyUnicode_1BYTE_KIND:
11374 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11375 break;
11376 case PyUnicode_2BYTE_KIND:
11377 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11378 break;
11379 case PyUnicode_4BYTE_KIND:
11380 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11381 break;
11382 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011383 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011385
Victor Stinner77282cb2013-04-14 19:22:47 +020011386 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 PyMem_Free(buf2);
11388
Guido van Rossum403d68b2000-03-13 15:55:09 +000011389 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011390}
11391
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392/* Concat to string or Unicode object giving a new Unicode object. */
11393
Alexander Belopolsky40018472011-02-26 01:02:56 +000011394PyObject *
11395PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011397 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011398 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011399 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011401 if (ensure_unicode(left) < 0)
11402 return NULL;
11403
11404 if (!PyUnicode_Check(right)) {
11405 PyErr_Format(PyExc_TypeError,
11406 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011407 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011408 return NULL;
11409 }
11410 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412
11413 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011414 if (left == unicode_empty)
11415 return PyUnicode_FromObject(right);
11416 if (right == unicode_empty)
11417 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011419 left_len = PyUnicode_GET_LENGTH(left);
11420 right_len = PyUnicode_GET_LENGTH(right);
11421 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011422 PyErr_SetString(PyExc_OverflowError,
11423 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011424 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011425 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011426 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011427
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011428 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11429 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011430 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011433 result = PyUnicode_New(new_len, maxchar);
11434 if (result == NULL)
11435 return NULL;
11436 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11437 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11438 assert(_PyUnicode_CheckConsistency(result, 1));
11439 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440}
11441
Walter Dörwald1ab83302007-05-18 17:15:44 +000011442void
Victor Stinner23e56682011-10-03 03:54:37 +020011443PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011444{
Victor Stinner23e56682011-10-03 03:54:37 +020011445 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011446 Py_UCS4 maxchar, maxchar2;
11447 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011448
11449 if (p_left == NULL) {
11450 if (!PyErr_Occurred())
11451 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011452 return;
11453 }
Victor Stinner23e56682011-10-03 03:54:37 +020011454 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011455 if (right == NULL || left == NULL
11456 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011457 if (!PyErr_Occurred())
11458 PyErr_BadInternalCall();
11459 goto error;
11460 }
11461
Benjamin Petersonbac79492012-01-14 13:34:47 -050011462 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011463 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011464 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011465 goto error;
11466
Victor Stinner488fa492011-12-12 00:01:39 +010011467 /* Shortcuts */
11468 if (left == unicode_empty) {
11469 Py_DECREF(left);
11470 Py_INCREF(right);
11471 *p_left = right;
11472 return;
11473 }
11474 if (right == unicode_empty)
11475 return;
11476
11477 left_len = PyUnicode_GET_LENGTH(left);
11478 right_len = PyUnicode_GET_LENGTH(right);
11479 if (left_len > PY_SSIZE_T_MAX - right_len) {
11480 PyErr_SetString(PyExc_OverflowError,
11481 "strings are too large to concat");
11482 goto error;
11483 }
11484 new_len = left_len + right_len;
11485
11486 if (unicode_modifiable(left)
11487 && PyUnicode_CheckExact(right)
11488 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011489 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11490 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011491 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011492 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011493 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11494 {
11495 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011496 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011497 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011498
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011499 /* copy 'right' into the newly allocated area of 'left' */
11500 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011501 }
Victor Stinner488fa492011-12-12 00:01:39 +010011502 else {
11503 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11504 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011505 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011506
Victor Stinner488fa492011-12-12 00:01:39 +010011507 /* Concat the two Unicode strings */
11508 res = PyUnicode_New(new_len, maxchar);
11509 if (res == NULL)
11510 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011511 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11512 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011513 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011514 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011515 }
11516 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011517 return;
11518
11519error:
Victor Stinner488fa492011-12-12 00:01:39 +010011520 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011521}
11522
11523void
11524PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11525{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011526 PyUnicode_Append(pleft, right);
11527 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011528}
11529
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011530/*
11531Wraps stringlib_parse_args_finds() and additionally ensures that the
11532first argument is a unicode object.
11533*/
11534
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011535static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011536parse_args_finds_unicode(const char * function_name, PyObject *args,
11537 PyObject **substring,
11538 Py_ssize_t *start, Py_ssize_t *end)
11539{
11540 if(stringlib_parse_args_finds(function_name, args, substring,
11541 start, end)) {
11542 if (ensure_unicode(*substring) < 0)
11543 return 0;
11544 return 1;
11545 }
11546 return 0;
11547}
11548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011549PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011552Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011553string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011554interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555
11556static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011557unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011559 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011560 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011561 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011563 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 void *buf1, *buf2;
11565 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011567 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570 kind1 = PyUnicode_KIND(self);
11571 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011572 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011573 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 len1 = PyUnicode_GET_LENGTH(self);
11576 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011578 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011579 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011580
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011581 buf1 = PyUnicode_DATA(self);
11582 buf2 = PyUnicode_DATA(substring);
11583 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011584 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011585 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011586 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011587 }
11588 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 case PyUnicode_1BYTE_KIND:
11590 iresult = ucs1lib_count(
11591 ((Py_UCS1*)buf1) + start, end - start,
11592 buf2, len2, PY_SSIZE_T_MAX
11593 );
11594 break;
11595 case PyUnicode_2BYTE_KIND:
11596 iresult = ucs2lib_count(
11597 ((Py_UCS2*)buf1) + start, end - start,
11598 buf2, len2, PY_SSIZE_T_MAX
11599 );
11600 break;
11601 case PyUnicode_4BYTE_KIND:
11602 iresult = ucs4lib_count(
11603 ((Py_UCS4*)buf1) + start, end - start,
11604 buf2, len2, PY_SSIZE_T_MAX
11605 );
11606 break;
11607 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011608 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 }
11610
11611 result = PyLong_FromSsize_t(iresult);
11612
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011613 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616 return result;
11617}
11618
INADA Naoki3ae20562017-01-16 20:41:20 +090011619/*[clinic input]
11620str.encode as unicode_encode
11621
11622 encoding: str(c_default="NULL") = 'utf-8'
11623 The encoding in which to encode the string.
11624 errors: str(c_default="NULL") = 'strict'
11625 The error handling scheme to use for encoding errors.
11626 The default is 'strict' meaning that encoding errors raise a
11627 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11628 'xmlcharrefreplace' as well as any other name registered with
11629 codecs.register_error that can handle UnicodeEncodeErrors.
11630
11631Encode the string using the codec registered for encoding.
11632[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633
11634static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011635unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011636/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011638 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011639}
11640
INADA Naoki3ae20562017-01-16 20:41:20 +090011641/*[clinic input]
11642str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643
INADA Naoki3ae20562017-01-16 20:41:20 +090011644 tabsize: int = 8
11645
11646Return a copy where all tab characters are expanded using spaces.
11647
11648If tabsize is not given, a tab size of 8 characters is assumed.
11649[clinic start generated code]*/
11650
11651static PyObject *
11652unicode_expandtabs_impl(PyObject *self, int tabsize)
11653/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011655 Py_ssize_t i, j, line_pos, src_len, incr;
11656 Py_UCS4 ch;
11657 PyObject *u;
11658 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011659 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011660 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661
Antoine Pitrou22425222011-10-04 19:10:51 +020011662 if (PyUnicode_READY(self) == -1)
11663 return NULL;
11664
Thomas Wouters7e474022000-07-16 12:04:32 +000011665 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011666 src_len = PyUnicode_GET_LENGTH(self);
11667 i = j = line_pos = 0;
11668 kind = PyUnicode_KIND(self);
11669 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011670 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011671 for (; i < src_len; i++) {
11672 ch = PyUnicode_READ(kind, src_data, i);
11673 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011674 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011675 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011676 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011678 goto overflow;
11679 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011680 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011681 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011685 goto overflow;
11686 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011688 if (ch == '\n' || ch == '\r')
11689 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011691 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011692 if (!found)
11693 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011694
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011696 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697 if (!u)
11698 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011699 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700
Antoine Pitroue71d5742011-10-04 15:55:09 +020011701 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702
Antoine Pitroue71d5742011-10-04 15:55:09 +020011703 for (; i < src_len; i++) {
11704 ch = PyUnicode_READ(kind, src_data, i);
11705 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011707 incr = tabsize - (line_pos % tabsize);
11708 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011709 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011710 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011711 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011712 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011713 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011714 line_pos++;
11715 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011716 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011717 if (ch == '\n' || ch == '\r')
11718 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011720 }
11721 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011722 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011723
Antoine Pitroue71d5742011-10-04 15:55:09 +020011724 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011725 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11726 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727}
11728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011729PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011730 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731\n\
11732Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011733such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734arguments start and end are interpreted as in slice notation.\n\
11735\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011736Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737
11738static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011741 /* initialize variables to prevent gcc warning */
11742 PyObject *substring = NULL;
11743 Py_ssize_t start = 0;
11744 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011745 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011747 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011750 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011753 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 if (result == -2)
11756 return NULL;
11757
Christian Heimes217cfd12007-12-02 14:31:20 +000011758 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759}
11760
11761static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011762unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011764 void *data;
11765 enum PyUnicode_Kind kind;
11766 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011767
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011768 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011769 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011771 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011772 if (PyUnicode_READY(self) == -1) {
11773 return NULL;
11774 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011775 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11776 PyErr_SetString(PyExc_IndexError, "string index out of range");
11777 return NULL;
11778 }
11779 kind = PyUnicode_KIND(self);
11780 data = PyUnicode_DATA(self);
11781 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011782 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783}
11784
Guido van Rossumc2504932007-09-18 19:42:40 +000011785/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011786 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011787static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011788unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011790 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011791
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011792#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011793 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011794#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 if (_PyUnicode_HASH(self) != -1)
11796 return _PyUnicode_HASH(self);
11797 if (PyUnicode_READY(self) == -1)
11798 return -1;
animalizea1d14252019-01-02 20:16:06 +080011799
Christian Heimes985ecdc2013-11-20 11:46:18 +010011800 x = _Py_HashBytes(PyUnicode_DATA(self),
11801 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011803 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804}
11805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011806PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011807 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808\n\
oldkaa0735f2018-02-02 16:52:55 +080011809Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011810such that sub is contained within S[start:end]. Optional\n\
11811arguments start and end are interpreted as in slice notation.\n\
11812\n\
11813Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814
11815static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011818 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011819 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011820 PyObject *substring = NULL;
11821 Py_ssize_t start = 0;
11822 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011824 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011827 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011830 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 if (result == -2)
11833 return NULL;
11834
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 if (result < 0) {
11836 PyErr_SetString(PyExc_ValueError, "substring not found");
11837 return NULL;
11838 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011839
Christian Heimes217cfd12007-12-02 14:31:20 +000011840 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841}
11842
INADA Naoki3ae20562017-01-16 20:41:20 +090011843/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011844str.isascii as unicode_isascii
11845
11846Return True if all characters in the string are ASCII, False otherwise.
11847
11848ASCII characters have code points in the range U+0000-U+007F.
11849Empty string is ASCII too.
11850[clinic start generated code]*/
11851
11852static PyObject *
11853unicode_isascii_impl(PyObject *self)
11854/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11855{
11856 if (PyUnicode_READY(self) == -1) {
11857 return NULL;
11858 }
11859 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11860}
11861
11862/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011863str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864
INADA Naoki3ae20562017-01-16 20:41:20 +090011865Return True if the string is a lowercase string, False otherwise.
11866
11867A string is lowercase if all cased characters in the string are lowercase and
11868there is at least one cased character in the string.
11869[clinic start generated code]*/
11870
11871static PyObject *
11872unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011873/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 Py_ssize_t i, length;
11876 int kind;
11877 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878 int cased;
11879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 if (PyUnicode_READY(self) == -1)
11881 return NULL;
11882 length = PyUnicode_GET_LENGTH(self);
11883 kind = PyUnicode_KIND(self);
11884 data = PyUnicode_DATA(self);
11885
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 if (length == 1)
11888 return PyBool_FromLong(
11889 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011891 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011893 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011894
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 for (i = 0; i < length; i++) {
11897 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011898
Benjamin Peterson29060642009-01-31 22:14:21 +000011899 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011900 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011901 else if (!cased && Py_UNICODE_ISLOWER(ch))
11902 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011904 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905}
11906
INADA Naoki3ae20562017-01-16 20:41:20 +090011907/*[clinic input]
11908str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909
INADA Naoki3ae20562017-01-16 20:41:20 +090011910Return True if the string is an uppercase string, False otherwise.
11911
11912A string is uppercase if all cased characters in the string are uppercase and
11913there is at least one cased character in the string.
11914[clinic start generated code]*/
11915
11916static PyObject *
11917unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011918/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 Py_ssize_t i, length;
11921 int kind;
11922 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 int cased;
11924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 if (PyUnicode_READY(self) == -1)
11926 return NULL;
11927 length = PyUnicode_GET_LENGTH(self);
11928 kind = PyUnicode_KIND(self);
11929 data = PyUnicode_DATA(self);
11930
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 if (length == 1)
11933 return PyBool_FromLong(
11934 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011936 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011938 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011939
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 for (i = 0; i < length; i++) {
11942 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011943
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011945 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 else if (!cased && Py_UNICODE_ISUPPER(ch))
11947 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011949 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950}
11951
INADA Naoki3ae20562017-01-16 20:41:20 +090011952/*[clinic input]
11953str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954
INADA Naoki3ae20562017-01-16 20:41:20 +090011955Return True if the string is a title-cased string, False otherwise.
11956
11957In a title-cased string, upper- and title-case characters may only
11958follow uncased characters and lowercase characters only cased ones.
11959[clinic start generated code]*/
11960
11961static PyObject *
11962unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011963/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 Py_ssize_t i, length;
11966 int kind;
11967 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968 int cased, previous_is_cased;
11969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (PyUnicode_READY(self) == -1)
11971 return NULL;
11972 length = PyUnicode_GET_LENGTH(self);
11973 kind = PyUnicode_KIND(self);
11974 data = PyUnicode_DATA(self);
11975
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 if (length == 1) {
11978 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11979 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11980 (Py_UNICODE_ISUPPER(ch) != 0));
11981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011983 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011985 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011986
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987 cased = 0;
11988 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 for (i = 0; i < length; i++) {
11990 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011991
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11993 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011994 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 previous_is_cased = 1;
11996 cased = 1;
11997 }
11998 else if (Py_UNICODE_ISLOWER(ch)) {
11999 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012000 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012001 previous_is_cased = 1;
12002 cased = 1;
12003 }
12004 else
12005 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012007 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008}
12009
INADA Naoki3ae20562017-01-16 20:41:20 +090012010/*[clinic input]
12011str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012
INADA Naoki3ae20562017-01-16 20:41:20 +090012013Return True if the string is a whitespace string, False otherwise.
12014
12015A string is whitespace if all characters in the string are whitespace and there
12016is at least one character in the string.
12017[clinic start generated code]*/
12018
12019static PyObject *
12020unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012021/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 Py_ssize_t i, length;
12024 int kind;
12025 void *data;
12026
12027 if (PyUnicode_READY(self) == -1)
12028 return NULL;
12029 length = PyUnicode_GET_LENGTH(self);
12030 kind = PyUnicode_KIND(self);
12031 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 if (length == 1)
12035 return PyBool_FromLong(
12036 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012038 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012040 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 for (i = 0; i < length; i++) {
12043 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012044 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012045 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012047 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048}
12049
INADA Naoki3ae20562017-01-16 20:41:20 +090012050/*[clinic input]
12051str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012052
INADA Naoki3ae20562017-01-16 20:41:20 +090012053Return True if the string is an alphabetic string, False otherwise.
12054
12055A string is alphabetic if all characters in the string are alphabetic and there
12056is at least one character in the string.
12057[clinic start generated code]*/
12058
12059static PyObject *
12060unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012061/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 Py_ssize_t i, length;
12064 int kind;
12065 void *data;
12066
12067 if (PyUnicode_READY(self) == -1)
12068 return NULL;
12069 length = PyUnicode_GET_LENGTH(self);
12070 kind = PyUnicode_KIND(self);
12071 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012072
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012073 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 if (length == 1)
12075 return PyBool_FromLong(
12076 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012077
12078 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012080 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 for (i = 0; i < length; i++) {
12083 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012084 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012085 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012086 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012087}
12088
INADA Naoki3ae20562017-01-16 20:41:20 +090012089/*[clinic input]
12090str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012091
INADA Naoki3ae20562017-01-16 20:41:20 +090012092Return True if the string is an alpha-numeric string, False otherwise.
12093
12094A string is alpha-numeric if all characters in the string are alpha-numeric and
12095there is at least one character in the string.
12096[clinic start generated code]*/
12097
12098static PyObject *
12099unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012100/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012101{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 int kind;
12103 void *data;
12104 Py_ssize_t len, i;
12105
12106 if (PyUnicode_READY(self) == -1)
12107 return NULL;
12108
12109 kind = PyUnicode_KIND(self);
12110 data = PyUnicode_DATA(self);
12111 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012112
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012113 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 if (len == 1) {
12115 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12116 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12117 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012118
12119 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012121 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 for (i = 0; i < len; i++) {
12124 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012125 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012126 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012127 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012128 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012129}
12130
INADA Naoki3ae20562017-01-16 20:41:20 +090012131/*[clinic input]
12132str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133
INADA Naoki3ae20562017-01-16 20:41:20 +090012134Return True if the string is a decimal string, False otherwise.
12135
12136A string is a decimal string if all characters in the string are decimal and
12137there is at least one character in the string.
12138[clinic start generated code]*/
12139
12140static PyObject *
12141unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012142/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 Py_ssize_t i, length;
12145 int kind;
12146 void *data;
12147
12148 if (PyUnicode_READY(self) == -1)
12149 return NULL;
12150 length = PyUnicode_GET_LENGTH(self);
12151 kind = PyUnicode_KIND(self);
12152 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 if (length == 1)
12156 return PyBool_FromLong(
12157 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012159 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012161 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 for (i = 0; i < length; i++) {
12164 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012165 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012167 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168}
12169
INADA Naoki3ae20562017-01-16 20:41:20 +090012170/*[clinic input]
12171str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172
INADA Naoki3ae20562017-01-16 20:41:20 +090012173Return True if the string is a digit string, False otherwise.
12174
12175A string is a digit string if all characters in the string are digits and there
12176is at least one character in the string.
12177[clinic start generated code]*/
12178
12179static PyObject *
12180unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012181/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 Py_ssize_t i, length;
12184 int kind;
12185 void *data;
12186
12187 if (PyUnicode_READY(self) == -1)
12188 return NULL;
12189 length = PyUnicode_GET_LENGTH(self);
12190 kind = PyUnicode_KIND(self);
12191 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 if (length == 1) {
12195 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12196 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012199 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012201 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 for (i = 0; i < length; i++) {
12204 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012205 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012207 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208}
12209
INADA Naoki3ae20562017-01-16 20:41:20 +090012210/*[clinic input]
12211str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212
INADA Naoki3ae20562017-01-16 20:41:20 +090012213Return True if the string is a numeric string, False otherwise.
12214
12215A string is numeric if all characters in the string are numeric and there is at
12216least one character in the string.
12217[clinic start generated code]*/
12218
12219static PyObject *
12220unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012221/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 Py_ssize_t i, length;
12224 int kind;
12225 void *data;
12226
12227 if (PyUnicode_READY(self) == -1)
12228 return NULL;
12229 length = PyUnicode_GET_LENGTH(self);
12230 kind = PyUnicode_KIND(self);
12231 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 if (length == 1)
12235 return PyBool_FromLong(
12236 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012238 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012240 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 for (i = 0; i < length; i++) {
12243 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012244 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012246 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247}
12248
Martin v. Löwis47383402007-08-15 07:32:56 +000012249int
12250PyUnicode_IsIdentifier(PyObject *self)
12251{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 Py_ssize_t i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012253 int ready = PyUnicode_IS_READY(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012254
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012255 Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
12256 if (len == 0) {
12257 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012258 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 }
12260
Hai Shi3d235f52020-02-17 21:41:15 +080012261 int kind = 0;
12262 void *data = NULL;
Andy Lester933fc53f2020-02-20 22:51:47 -060012263 const wchar_t *wstr = NULL;
12264 Py_UCS4 ch;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012265 if (ready) {
12266 kind = PyUnicode_KIND(self);
12267 data = PyUnicode_DATA(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012268 ch = PyUnicode_READ(kind, data, 0);
12269 }
12270 else {
Andy Lester933fc53f2020-02-20 22:51:47 -060012271 wstr = _PyUnicode_WSTR(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012272 ch = wstr[0];
12273 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012274 /* PEP 3131 says that the first character must be in
12275 XID_Start and subsequent characters in XID_Continue,
12276 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012277 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012278 letters, digits, underscore). However, given the current
12279 definition of XID_Start and XID_Continue, it is sufficient
12280 to check just for these, except that _ must be allowed
12281 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012282 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012283 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012284 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012285
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012286 for (i = 1; i < len; i++) {
12287 if (ready) {
12288 ch = PyUnicode_READ(kind, data, i);
12289 }
12290 else {
12291 ch = wstr[i];
12292 }
12293 if (!_PyUnicode_IsXidContinue(ch)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012294 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012295 }
12296 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012297 return 1;
12298}
12299
INADA Naoki3ae20562017-01-16 20:41:20 +090012300/*[clinic input]
12301str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012302
INADA Naoki3ae20562017-01-16 20:41:20 +090012303Return True if the string is a valid Python identifier, False otherwise.
12304
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012305Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012306such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012307[clinic start generated code]*/
12308
12309static PyObject *
12310unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012311/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012312{
12313 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12314}
12315
INADA Naoki3ae20562017-01-16 20:41:20 +090012316/*[clinic input]
12317str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012318
INADA Naoki3ae20562017-01-16 20:41:20 +090012319Return True if the string is printable, False otherwise.
12320
12321A string is printable if all of its characters are considered printable in
12322repr() or if it is empty.
12323[clinic start generated code]*/
12324
12325static PyObject *
12326unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012327/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 Py_ssize_t i, length;
12330 int kind;
12331 void *data;
12332
12333 if (PyUnicode_READY(self) == -1)
12334 return NULL;
12335 length = PyUnicode_GET_LENGTH(self);
12336 kind = PyUnicode_KIND(self);
12337 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012338
12339 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 if (length == 1)
12341 return PyBool_FromLong(
12342 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 for (i = 0; i < length; i++) {
12345 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012346 Py_RETURN_FALSE;
12347 }
12348 }
12349 Py_RETURN_TRUE;
12350}
12351
INADA Naoki3ae20562017-01-16 20:41:20 +090012352/*[clinic input]
12353str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012354
INADA Naoki3ae20562017-01-16 20:41:20 +090012355 iterable: object
12356 /
12357
12358Concatenate any number of strings.
12359
Martin Panter91a88662017-01-24 00:30:06 +000012360The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012361The result is returned as a new string.
12362
12363Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12364[clinic start generated code]*/
12365
12366static PyObject *
12367unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012368/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369{
INADA Naoki3ae20562017-01-16 20:41:20 +090012370 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371}
12372
Martin v. Löwis18e16552006-02-15 17:27:45 +000012373static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012374unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 if (PyUnicode_READY(self) == -1)
12377 return -1;
12378 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379}
12380
INADA Naoki3ae20562017-01-16 20:41:20 +090012381/*[clinic input]
12382str.ljust as unicode_ljust
12383
12384 width: Py_ssize_t
12385 fillchar: Py_UCS4 = ' '
12386 /
12387
12388Return a left-justified string of length width.
12389
12390Padding is done using the specified fill character (default is a space).
12391[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012392
12393static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012394unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12395/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012397 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012399
Victor Stinnerc4b49542011-12-11 22:44:26 +010012400 if (PyUnicode_GET_LENGTH(self) >= width)
12401 return unicode_result_unchanged(self);
12402
12403 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012404}
12405
INADA Naoki3ae20562017-01-16 20:41:20 +090012406/*[clinic input]
12407str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012408
INADA Naoki3ae20562017-01-16 20:41:20 +090012409Return a copy of the string converted to lowercase.
12410[clinic start generated code]*/
12411
12412static PyObject *
12413unicode_lower_impl(PyObject *self)
12414/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012416 if (PyUnicode_READY(self) == -1)
12417 return NULL;
12418 if (PyUnicode_IS_ASCII(self))
12419 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012420 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421}
12422
/* Strip modes understood by the strip helpers. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip mode above.  The table is never
   modified, so both the strings and the pointer array itself are const. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method corresponding to strip mode i (used in error
   messages). */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012431
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012432/* externally visible for str.strip(unicode) */
12433PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012434_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012435{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436 void *data;
12437 int kind;
12438 Py_ssize_t i, j, len;
12439 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012440 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012442 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12443 return NULL;
12444
12445 kind = PyUnicode_KIND(self);
12446 data = PyUnicode_DATA(self);
12447 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012448 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12450 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012451 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012452
Benjamin Peterson14339b62009-01-31 16:36:08 +000012453 i = 0;
12454 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012455 while (i < len) {
12456 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12457 if (!BLOOM(sepmask, ch))
12458 break;
12459 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12460 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012461 i++;
12462 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012463 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012464
Benjamin Peterson14339b62009-01-31 16:36:08 +000012465 j = len;
12466 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012467 j--;
12468 while (j >= i) {
12469 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12470 if (!BLOOM(sepmask, ch))
12471 break;
12472 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12473 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012474 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012475 }
12476
Benjamin Peterson29060642009-01-31 22:14:21 +000012477 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012478 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012479
Victor Stinner7931d9a2011-11-04 00:22:48 +010012480 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012481}
12482
12483PyObject*
12484PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12485{
12486 unsigned char *data;
12487 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012488 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489
Victor Stinnerde636f32011-10-01 03:55:54 +020012490 if (PyUnicode_READY(self) == -1)
12491 return NULL;
12492
Victor Stinner684d5fd2012-05-03 02:32:34 +020012493 length = PyUnicode_GET_LENGTH(self);
12494 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012495
Victor Stinner684d5fd2012-05-03 02:32:34 +020012496 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012497 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498
Victor Stinnerde636f32011-10-01 03:55:54 +020012499 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012500 PyErr_SetString(PyExc_IndexError, "string index out of range");
12501 return NULL;
12502 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012503 if (start >= length || end < start)
12504 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012505
Victor Stinner684d5fd2012-05-03 02:32:34 +020012506 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012507 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012508 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012509 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012510 }
12511 else {
12512 kind = PyUnicode_KIND(self);
12513 data = PyUnicode_1BYTE_DATA(self);
12514 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012515 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012516 length);
12517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519
12520static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012521do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523 Py_ssize_t len, i, j;
12524
12525 if (PyUnicode_READY(self) == -1)
12526 return NULL;
12527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012529
Victor Stinnercc7af722013-04-09 22:39:24 +020012530 if (PyUnicode_IS_ASCII(self)) {
12531 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12532
12533 i = 0;
12534 if (striptype != RIGHTSTRIP) {
12535 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012536 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012537 if (!_Py_ascii_whitespace[ch])
12538 break;
12539 i++;
12540 }
12541 }
12542
12543 j = len;
12544 if (striptype != LEFTSTRIP) {
12545 j--;
12546 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012547 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012548 if (!_Py_ascii_whitespace[ch])
12549 break;
12550 j--;
12551 }
12552 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012553 }
12554 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012555 else {
12556 int kind = PyUnicode_KIND(self);
12557 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012558
Victor Stinnercc7af722013-04-09 22:39:24 +020012559 i = 0;
12560 if (striptype != RIGHTSTRIP) {
12561 while (i < len) {
12562 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12563 if (!Py_UNICODE_ISSPACE(ch))
12564 break;
12565 i++;
12566 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012567 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012568
12569 j = len;
12570 if (striptype != LEFTSTRIP) {
12571 j--;
12572 while (j >= i) {
12573 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12574 if (!Py_UNICODE_ISSPACE(ch))
12575 break;
12576 j--;
12577 }
12578 j++;
12579 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012580 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012581
Victor Stinner7931d9a2011-11-04 00:22:48 +010012582 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583}
12584
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012585
12586static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012587do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012588{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012589 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012590 if (PyUnicode_Check(sep))
12591 return _PyUnicode_XStrip(self, striptype, sep);
12592 else {
12593 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 "%s arg must be None or str",
12595 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012596 return NULL;
12597 }
12598 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012599
Benjamin Peterson14339b62009-01-31 16:36:08 +000012600 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012601}
12602
12603
INADA Naoki3ae20562017-01-16 20:41:20 +090012604/*[clinic input]
12605str.strip as unicode_strip
12606
12607 chars: object = None
12608 /
12609
Zachary Ware09895c22019-10-09 16:09:00 -050012610Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012611
12612If chars is given and not None, remove characters in chars instead.
12613[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012614
12615static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012616unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012617/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012618{
INADA Naoki3ae20562017-01-16 20:41:20 +090012619 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012620}
12621
12622
INADA Naoki3ae20562017-01-16 20:41:20 +090012623/*[clinic input]
12624str.lstrip as unicode_lstrip
12625
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012626 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012627 /
12628
12629Return a copy of the string with leading whitespace removed.
12630
12631If chars is given and not None, remove characters in chars instead.
12632[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012633
12634static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012635unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012636/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012637{
INADA Naoki3ae20562017-01-16 20:41:20 +090012638 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012639}
12640
12641
INADA Naoki3ae20562017-01-16 20:41:20 +090012642/*[clinic input]
12643str.rstrip as unicode_rstrip
12644
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012645 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012646 /
12647
12648Return a copy of the string with trailing whitespace removed.
12649
12650If chars is given and not None, remove characters in chars instead.
12651[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012652
12653static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012654unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012655/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012656{
INADA Naoki3ae20562017-01-16 20:41:20 +090012657 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012658}
12659
12660
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012662unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012664 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666
Serhiy Storchaka05997252013-01-26 12:14:02 +020012667 if (len < 1)
12668 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669
Victor Stinnerc4b49542011-12-11 22:44:26 +010012670 /* no repeat, return original string */
12671 if (len == 1)
12672 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012673
Benjamin Petersonbac79492012-01-14 13:34:47 -050012674 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 return NULL;
12676
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012677 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012678 PyErr_SetString(PyExc_OverflowError,
12679 "repeated string is too long");
12680 return NULL;
12681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012683
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012684 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685 if (!u)
12686 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012687 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689 if (PyUnicode_GET_LENGTH(str) == 1) {
12690 const int kind = PyUnicode_KIND(str);
12691 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012692 if (kind == PyUnicode_1BYTE_KIND) {
12693 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012694 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012695 }
12696 else if (kind == PyUnicode_2BYTE_KIND) {
12697 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012698 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012699 ucs2[n] = fill_char;
12700 } else {
12701 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12702 assert(kind == PyUnicode_4BYTE_KIND);
12703 for (n = 0; n < len; ++n)
12704 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012705 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 }
12707 else {
12708 /* number of characters copied this far */
12709 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012710 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012712 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012714 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012716 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012717 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719 }
12720
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012721 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012722 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723}
12724
Alexander Belopolsky40018472011-02-26 01:02:56 +000012725PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012726PyUnicode_Replace(PyObject *str,
12727 PyObject *substr,
12728 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012729 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012731 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12732 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012733 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012734 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735}
12736
INADA Naoki3ae20562017-01-16 20:41:20 +090012737/*[clinic input]
12738str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739
INADA Naoki3ae20562017-01-16 20:41:20 +090012740 old: unicode
12741 new: unicode
12742 count: Py_ssize_t = -1
12743 Maximum number of occurrences to replace.
12744 -1 (the default value) means replace all occurrences.
12745 /
12746
12747Return a copy with all occurrences of substring old replaced by new.
12748
12749If the optional argument count is given, only the first count occurrences are
12750replaced.
12751[clinic start generated code]*/
12752
12753static PyObject *
12754unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12755 Py_ssize_t count)
12756/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012757{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012758 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012759 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012760 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761}
12762
Alexander Belopolsky40018472011-02-26 01:02:56 +000012763static PyObject *
12764unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012766 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767 Py_ssize_t isize;
12768 Py_ssize_t osize, squote, dquote, i, o;
12769 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012770 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012771 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012774 return NULL;
12775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 isize = PyUnicode_GET_LENGTH(unicode);
12777 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012779 /* Compute length of output, quote characters, and
12780 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012781 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782 max = 127;
12783 squote = dquote = 0;
12784 ikind = PyUnicode_KIND(unicode);
12785 for (i = 0; i < isize; i++) {
12786 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012787 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012789 case '\'': squote++; break;
12790 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012792 incr = 2;
12793 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012794 default:
12795 /* Fast-path ASCII */
12796 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012797 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012799 ;
12800 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012801 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012803 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012805 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012806 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012807 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012808 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012809 if (osize > PY_SSIZE_T_MAX - incr) {
12810 PyErr_SetString(PyExc_OverflowError,
12811 "string is too long to generate repr");
12812 return NULL;
12813 }
12814 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 }
12816
12817 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012818 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012820 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 if (dquote)
12822 /* Both squote and dquote present. Use squote,
12823 and escape them */
12824 osize += squote;
12825 else
12826 quote = '"';
12827 }
Victor Stinner55c08782013-04-14 18:45:39 +020012828 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012829
12830 repr = PyUnicode_New(osize, max);
12831 if (repr == NULL)
12832 return NULL;
12833 okind = PyUnicode_KIND(repr);
12834 odata = PyUnicode_DATA(repr);
12835
12836 PyUnicode_WRITE(okind, odata, 0, quote);
12837 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012838 if (unchanged) {
12839 _PyUnicode_FastCopyCharacters(repr, 1,
12840 unicode, 0,
12841 isize);
12842 }
12843 else {
12844 for (i = 0, o = 1; i < isize; i++) {
12845 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012846
Victor Stinner55c08782013-04-14 18:45:39 +020012847 /* Escape quotes and backslashes */
12848 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012849 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012850 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012851 continue;
12852 }
12853
12854 /* Map special whitespace to '\t', \n', '\r' */
12855 if (ch == '\t') {
12856 PyUnicode_WRITE(okind, odata, o++, '\\');
12857 PyUnicode_WRITE(okind, odata, o++, 't');
12858 }
12859 else if (ch == '\n') {
12860 PyUnicode_WRITE(okind, odata, o++, '\\');
12861 PyUnicode_WRITE(okind, odata, o++, 'n');
12862 }
12863 else if (ch == '\r') {
12864 PyUnicode_WRITE(okind, odata, o++, '\\');
12865 PyUnicode_WRITE(okind, odata, o++, 'r');
12866 }
12867
12868 /* Map non-printable US ASCII to '\xhh' */
12869 else if (ch < ' ' || ch == 0x7F) {
12870 PyUnicode_WRITE(okind, odata, o++, '\\');
12871 PyUnicode_WRITE(okind, odata, o++, 'x');
12872 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12873 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12874 }
12875
12876 /* Copy ASCII characters as-is */
12877 else if (ch < 0x7F) {
12878 PyUnicode_WRITE(okind, odata, o++, ch);
12879 }
12880
12881 /* Non-ASCII characters */
12882 else {
12883 /* Map Unicode whitespace and control characters
12884 (categories Z* and C* except ASCII space)
12885 */
12886 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12887 PyUnicode_WRITE(okind, odata, o++, '\\');
12888 /* Map 8-bit characters to '\xhh' */
12889 if (ch <= 0xff) {
12890 PyUnicode_WRITE(okind, odata, o++, 'x');
12891 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12892 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12893 }
12894 /* Map 16-bit characters to '\uxxxx' */
12895 else if (ch <= 0xffff) {
12896 PyUnicode_WRITE(okind, odata, o++, 'u');
12897 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12898 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12899 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12900 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12901 }
12902 /* Map 21-bit characters to '\U00xxxxxx' */
12903 else {
12904 PyUnicode_WRITE(okind, odata, o++, 'U');
12905 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12906 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12907 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12908 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12909 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12910 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12911 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12912 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12913 }
12914 }
12915 /* Copy characters as-is */
12916 else {
12917 PyUnicode_WRITE(okind, odata, o++, ch);
12918 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012919 }
12920 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012921 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012922 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012923 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012924 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012925}
12926
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012927PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012928 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012929\n\
12930Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012931such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932arguments start and end are interpreted as in slice notation.\n\
12933\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012934Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012935
12936static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012939 /* initialize variables to prevent gcc warning */
12940 PyObject *substring = NULL;
12941 Py_ssize_t start = 0;
12942 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012943 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012945 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012948 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012951 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953 if (result == -2)
12954 return NULL;
12955
Christian Heimes217cfd12007-12-02 14:31:20 +000012956 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012957}
12958
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012959PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012960 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012962Return the highest index in S where substring sub is found,\n\
12963such that sub is contained within S[start:end]. Optional\n\
12964arguments start and end are interpreted as in slice notation.\n\
12965\n\
12966Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967
12968static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012971 /* initialize variables to prevent gcc warning */
12972 PyObject *substring = NULL;
12973 Py_ssize_t start = 0;
12974 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012975 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012977 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012979
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012980 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012983 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 if (result == -2)
12986 return NULL;
12987
Guido van Rossumd57fd912000-03-10 22:53:23 +000012988 if (result < 0) {
12989 PyErr_SetString(PyExc_ValueError, "substring not found");
12990 return NULL;
12991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992
Christian Heimes217cfd12007-12-02 14:31:20 +000012993 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012994}
12995
INADA Naoki3ae20562017-01-16 20:41:20 +090012996/*[clinic input]
12997str.rjust as unicode_rjust
12998
12999 width: Py_ssize_t
13000 fillchar: Py_UCS4 = ' '
13001 /
13002
13003Return a right-justified string of length width.
13004
13005Padding is done using the specified fill character (default is a space).
13006[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013007
13008static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013009unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13010/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013012 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013013 return NULL;
13014
Victor Stinnerc4b49542011-12-11 22:44:26 +010013015 if (PyUnicode_GET_LENGTH(self) >= width)
13016 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013017
Victor Stinnerc4b49542011-12-11 22:44:26 +010013018 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013019}
13020
Alexander Belopolsky40018472011-02-26 01:02:56 +000013021PyObject *
13022PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013023{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013024 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013026
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013027 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013028}
13029
INADA Naoki3ae20562017-01-16 20:41:20 +090013030/*[clinic input]
13031str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032
INADA Naoki3ae20562017-01-16 20:41:20 +090013033 sep: object = None
13034 The delimiter according which to split the string.
13035 None (the default value) means split according to any whitespace,
13036 and discard empty strings from the result.
13037 maxsplit: Py_ssize_t = -1
13038 Maximum number of splits to do.
13039 -1 (the default value) means no limit.
13040
13041Return a list of the words in the string, using sep as the delimiter string.
13042[clinic start generated code]*/
13043
13044static PyObject *
13045unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13046/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013047{
INADA Naoki3ae20562017-01-16 20:41:20 +090013048 if (sep == Py_None)
13049 return split(self, NULL, maxsplit);
13050 if (PyUnicode_Check(sep))
13051 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013052
Victor Stinner998b8062018-09-12 00:23:25 +020013053 PyErr_Format(PyExc_TypeError,
13054 "must be str or None, not %.100s",
13055 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013056 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057}
13058
Thomas Wouters477c8d52006-05-27 19:21:47 +000013059PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013060PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013061{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013062 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013063 int kind1, kind2;
13064 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013065 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013066
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013067 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013068 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013069
Victor Stinner14f8f022011-10-05 20:58:25 +020013070 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013071 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013072 len1 = PyUnicode_GET_LENGTH(str_obj);
13073 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013074 if (kind1 < kind2 || len1 < len2) {
13075 _Py_INCREF_UNICODE_EMPTY();
13076 if (!unicode_empty)
13077 out = NULL;
13078 else {
13079 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13080 Py_DECREF(unicode_empty);
13081 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013082 return out;
13083 }
13084 buf1 = PyUnicode_DATA(str_obj);
13085 buf2 = PyUnicode_DATA(sep_obj);
13086 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013087 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013088 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013089 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013090 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013091
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013092 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013093 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013094 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13095 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13096 else
13097 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013098 break;
13099 case PyUnicode_2BYTE_KIND:
13100 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13101 break;
13102 case PyUnicode_4BYTE_KIND:
13103 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13104 break;
13105 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013106 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013107 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013108
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013109 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013110 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013111
13112 return out;
13113}
13114
13115
13116PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013117PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013118{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013119 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013120 int kind1, kind2;
13121 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013123
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013124 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013125 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013126
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013127 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013128 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129 len1 = PyUnicode_GET_LENGTH(str_obj);
13130 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013131 if (kind1 < kind2 || len1 < len2) {
13132 _Py_INCREF_UNICODE_EMPTY();
13133 if (!unicode_empty)
13134 out = NULL;
13135 else {
13136 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13137 Py_DECREF(unicode_empty);
13138 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013139 return out;
13140 }
13141 buf1 = PyUnicode_DATA(str_obj);
13142 buf2 = PyUnicode_DATA(sep_obj);
13143 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013144 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013145 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013146 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013148
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013149 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013150 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013151 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13152 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13153 else
13154 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013155 break;
13156 case PyUnicode_2BYTE_KIND:
13157 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13158 break;
13159 case PyUnicode_4BYTE_KIND:
13160 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13161 break;
13162 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013163 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013164 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013165
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013166 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013167 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013168
13169 return out;
13170}
13171
INADA Naoki3ae20562017-01-16 20:41:20 +090013172/*[clinic input]
13173str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013174
INADA Naoki3ae20562017-01-16 20:41:20 +090013175 sep: object
13176 /
13177
13178Partition the string into three parts using the given separator.
13179
13180This will search for the separator in the string. If the separator is found,
13181returns a 3-tuple containing the part before the separator, the separator
13182itself, and the part after it.
13183
13184If the separator is not found, returns a 3-tuple containing the original string
13185and two empty strings.
13186[clinic start generated code]*/
13187
13188static PyObject *
13189unicode_partition(PyObject *self, PyObject *sep)
13190/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013191{
INADA Naoki3ae20562017-01-16 20:41:20 +090013192 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013193}
13194
INADA Naoki3ae20562017-01-16 20:41:20 +090013195/*[clinic input]
13196str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013197
INADA Naoki3ae20562017-01-16 20:41:20 +090013198Partition the string into three parts using the given separator.
13199
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013200This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013201the separator is found, returns a 3-tuple containing the part before the
13202separator, the separator itself, and the part after it.
13203
13204If the separator is not found, returns a 3-tuple containing two empty strings
13205and the original string.
13206[clinic start generated code]*/
13207
13208static PyObject *
13209unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013210/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013211{
INADA Naoki3ae20562017-01-16 20:41:20 +090013212 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013213}
13214
Alexander Belopolsky40018472011-02-26 01:02:56 +000013215PyObject *
13216PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013217{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013218 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013219 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013220
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013221 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013222}
13223
INADA Naoki3ae20562017-01-16 20:41:20 +090013224/*[clinic input]
13225str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013226
INADA Naoki3ae20562017-01-16 20:41:20 +090013227Return a list of the words in the string, using sep as the delimiter string.
13228
13229Splits are done starting at the end of the string and working to the front.
13230[clinic start generated code]*/
13231
13232static PyObject *
13233unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13234/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013235{
INADA Naoki3ae20562017-01-16 20:41:20 +090013236 if (sep == Py_None)
13237 return rsplit(self, NULL, maxsplit);
13238 if (PyUnicode_Check(sep))
13239 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013240
Victor Stinner998b8062018-09-12 00:23:25 +020013241 PyErr_Format(PyExc_TypeError,
13242 "must be str or None, not %.100s",
13243 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013244 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013245}
13246
INADA Naoki3ae20562017-01-16 20:41:20 +090013247/*[clinic input]
13248str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013250 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013251
13252Return a list of the lines in the string, breaking at line boundaries.
13253
13254Line breaks are not included in the resulting list unless keepends is given and
13255true.
13256[clinic start generated code]*/
13257
13258static PyObject *
13259unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013260/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013261{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013262 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013263}
13264
13265static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013266PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013268 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013269}
13270
INADA Naoki3ae20562017-01-16 20:41:20 +090013271/*[clinic input]
13272str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013273
INADA Naoki3ae20562017-01-16 20:41:20 +090013274Convert uppercase characters to lowercase and lowercase characters to uppercase.
13275[clinic start generated code]*/
13276
13277static PyObject *
13278unicode_swapcase_impl(PyObject *self)
13279/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013281 if (PyUnicode_READY(self) == -1)
13282 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013283 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284}
13285
Larry Hastings61272b72014-01-07 12:41:53 -080013286/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013287
Larry Hastings31826802013-10-19 00:09:25 -070013288@staticmethod
13289str.maketrans as unicode_maketrans
13290
13291 x: object
13292
13293 y: unicode=NULL
13294
13295 z: unicode=NULL
13296
13297 /
13298
13299Return a translation table usable for str.translate().
13300
13301If there is only one argument, it must be a dictionary mapping Unicode
13302ordinals (integers) or characters to Unicode ordinals, strings or None.
13303Character keys will be then converted to ordinals.
13304If there are two arguments, they must be strings of equal length, and
13305in the resulting dictionary, each character in x will be mapped to the
13306character at the same position in y. If there is a third argument, it
13307must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013308[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013309
Larry Hastings31826802013-10-19 00:09:25 -070013310static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013311unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013312/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013313{
Georg Brandlceee0772007-11-27 23:48:05 +000013314 PyObject *new = NULL, *key, *value;
13315 Py_ssize_t i = 0;
13316 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013317
Georg Brandlceee0772007-11-27 23:48:05 +000013318 new = PyDict_New();
13319 if (!new)
13320 return NULL;
13321 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322 int x_kind, y_kind, z_kind;
13323 void *x_data, *y_data, *z_data;
13324
Georg Brandlceee0772007-11-27 23:48:05 +000013325 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013326 if (!PyUnicode_Check(x)) {
13327 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13328 "be a string if there is a second argument");
13329 goto err;
13330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013332 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13333 "arguments must have equal length");
13334 goto err;
13335 }
13336 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013337 x_kind = PyUnicode_KIND(x);
13338 y_kind = PyUnicode_KIND(y);
13339 x_data = PyUnicode_DATA(x);
13340 y_data = PyUnicode_DATA(y);
13341 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13342 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013343 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013344 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013345 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013346 if (!value) {
13347 Py_DECREF(key);
13348 goto err;
13349 }
Georg Brandlceee0772007-11-27 23:48:05 +000013350 res = PyDict_SetItem(new, key, value);
13351 Py_DECREF(key);
13352 Py_DECREF(value);
13353 if (res < 0)
13354 goto err;
13355 }
13356 /* create entries for deleting chars in z */
13357 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013358 z_kind = PyUnicode_KIND(z);
13359 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013360 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013361 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013362 if (!key)
13363 goto err;
13364 res = PyDict_SetItem(new, key, Py_None);
13365 Py_DECREF(key);
13366 if (res < 0)
13367 goto err;
13368 }
13369 }
13370 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013371 int kind;
13372 void *data;
13373
Georg Brandlceee0772007-11-27 23:48:05 +000013374 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013375 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013376 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13377 "to maketrans it must be a dict");
13378 goto err;
13379 }
13380 /* copy entries into the new dict, converting string keys to int keys */
13381 while (PyDict_Next(x, &i, &key, &value)) {
13382 if (PyUnicode_Check(key)) {
13383 /* convert string keys to integer keys */
13384 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013385 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013386 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13387 "table must be of length 1");
13388 goto err;
13389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013390 kind = PyUnicode_KIND(key);
13391 data = PyUnicode_DATA(key);
13392 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013393 if (!newkey)
13394 goto err;
13395 res = PyDict_SetItem(new, newkey, value);
13396 Py_DECREF(newkey);
13397 if (res < 0)
13398 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013399 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013400 /* just keep integer keys */
13401 if (PyDict_SetItem(new, key, value) < 0)
13402 goto err;
13403 } else {
13404 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13405 "be strings or integers");
13406 goto err;
13407 }
13408 }
13409 }
13410 return new;
13411 err:
13412 Py_DECREF(new);
13413 return NULL;
13414}
13415
INADA Naoki3ae20562017-01-16 20:41:20 +090013416/*[clinic input]
13417str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013418
INADA Naoki3ae20562017-01-16 20:41:20 +090013419 table: object
13420 Translation table, which must be a mapping of Unicode ordinals to
13421 Unicode ordinals, strings, or None.
13422 /
13423
13424Replace each character in the string using the given translation table.
13425
13426The table must implement lookup/indexing via __getitem__, for instance a
13427dictionary or list. If this operation raises LookupError, the character is
13428left untouched. Characters mapped to None are deleted.
13429[clinic start generated code]*/
13430
13431static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013432unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013433/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013434{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013435 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436}
13437
INADA Naoki3ae20562017-01-16 20:41:20 +090013438/*[clinic input]
13439str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013440
INADA Naoki3ae20562017-01-16 20:41:20 +090013441Return a copy of the string converted to uppercase.
13442[clinic start generated code]*/
13443
13444static PyObject *
13445unicode_upper_impl(PyObject *self)
13446/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013447{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013448 if (PyUnicode_READY(self) == -1)
13449 return NULL;
13450 if (PyUnicode_IS_ASCII(self))
13451 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013452 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013453}
13454
INADA Naoki3ae20562017-01-16 20:41:20 +090013455/*[clinic input]
13456str.zfill as unicode_zfill
13457
13458 width: Py_ssize_t
13459 /
13460
13461Pad a numeric string with zeros on the left, to fill a field of the given width.
13462
13463The string is never truncated.
13464[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013465
13466static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013467unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013468/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013469{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013470 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013471 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013472 int kind;
13473 void *data;
13474 Py_UCS4 chr;
13475
Benjamin Petersonbac79492012-01-14 13:34:47 -050013476 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013478
Victor Stinnerc4b49542011-12-11 22:44:26 +010013479 if (PyUnicode_GET_LENGTH(self) >= width)
13480 return unicode_result_unchanged(self);
13481
13482 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013483
13484 u = pad(self, fill, 0, '0');
13485
Walter Dörwald068325e2002-04-15 13:36:47 +000013486 if (u == NULL)
13487 return NULL;
13488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013489 kind = PyUnicode_KIND(u);
13490 data = PyUnicode_DATA(u);
13491 chr = PyUnicode_READ(kind, data, fill);
13492
13493 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013494 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013495 PyUnicode_WRITE(kind, data, 0, chr);
13496 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013497 }
13498
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013499 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013500 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013501}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013502
#if 0
/* Compiled out by the surrounding #if 0.  When enabled it simply forwards
   to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013511PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013512 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013513\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013514Return True if S starts with the specified prefix, False otherwise.\n\
13515With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013516With optional end, stop comparing S at that position.\n\
13517prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013518
13519static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013520unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013522{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013523 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013524 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013525 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013526 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013527 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013528
Jesus Ceaac451502011-04-20 17:09:23 +020013529 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013530 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013531 if (PyTuple_Check(subobj)) {
13532 Py_ssize_t i;
13533 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013534 substring = PyTuple_GET_ITEM(subobj, i);
13535 if (!PyUnicode_Check(substring)) {
13536 PyErr_Format(PyExc_TypeError,
13537 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013538 "not %.100s",
13539 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013540 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013541 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013542 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013543 if (result == -1)
13544 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013545 if (result) {
13546 Py_RETURN_TRUE;
13547 }
13548 }
13549 /* nothing matched */
13550 Py_RETURN_FALSE;
13551 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013552 if (!PyUnicode_Check(subobj)) {
13553 PyErr_Format(PyExc_TypeError,
13554 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013555 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013557 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013558 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013559 if (result == -1)
13560 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013561 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013562}
13563
13564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013565PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013566 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013567\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013568Return True if S ends with the specified suffix, False otherwise.\n\
13569With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013570With optional end, stop comparing S at that position.\n\
13571suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013572
13573static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013574unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013575 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013576{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013577 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013578 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013579 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013580 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013581 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013582
Jesus Ceaac451502011-04-20 17:09:23 +020013583 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013585 if (PyTuple_Check(subobj)) {
13586 Py_ssize_t i;
13587 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013588 substring = PyTuple_GET_ITEM(subobj, i);
13589 if (!PyUnicode_Check(substring)) {
13590 PyErr_Format(PyExc_TypeError,
13591 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013592 "not %.100s",
13593 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013594 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013595 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013596 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013597 if (result == -1)
13598 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013599 if (result) {
13600 Py_RETURN_TRUE;
13601 }
13602 }
13603 Py_RETURN_FALSE;
13604 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013605 if (!PyUnicode_Check(subobj)) {
13606 PyErr_Format(PyExc_TypeError,
13607 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013608 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013610 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013611 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013612 if (result == -1)
13613 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013614 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013615}
13616
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013617static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013618_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013619{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013620 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13621 writer->data = PyUnicode_DATA(writer->buffer);
13622
13623 if (!writer->readonly) {
13624 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013625 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013626 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013627 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013628 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13629 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13630 writer->kind = PyUnicode_WCHAR_KIND;
13631 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13632
Victor Stinner8f674cc2013-04-17 23:02:17 +020013633 /* Copy-on-write mode: set buffer size to 0 so
13634 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13635 * next write. */
13636 writer->size = 0;
13637 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013638}
13639
Victor Stinnerd3f08822012-05-29 12:57:52 +020013640void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013641_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013642{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013643 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013644
13645 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013646 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013647
13648 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13649 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13650 writer->kind = PyUnicode_WCHAR_KIND;
13651 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013652}
13653
Inada Naoki770847a2019-06-24 12:30:24 +090013654// Initialize _PyUnicodeWriter with initial buffer
13655static inline void
13656_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13657{
13658 memset(writer, 0, sizeof(*writer));
13659 writer->buffer = buffer;
13660 _PyUnicodeWriter_Update(writer);
13661 writer->min_length = writer->size;
13662}
13663
Victor Stinnerd3f08822012-05-29 12:57:52 +020013664int
13665_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13666 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013667{
13668 Py_ssize_t newlen;
13669 PyObject *newbuffer;
13670
Victor Stinner2740e462016-09-06 16:58:36 -070013671 assert(maxchar <= MAX_UNICODE);
13672
Victor Stinnerca9381e2015-09-22 00:58:32 +020013673 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013674 assert((maxchar > writer->maxchar && length >= 0)
13675 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013676
Victor Stinner202fdca2012-05-07 12:47:02 +020013677 if (length > PY_SSIZE_T_MAX - writer->pos) {
13678 PyErr_NoMemory();
13679 return -1;
13680 }
13681 newlen = writer->pos + length;
13682
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013683 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013684
Victor Stinnerd3f08822012-05-29 12:57:52 +020013685 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013686 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013687 if (writer->overallocate
13688 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13689 /* overallocate to limit the number of realloc() */
13690 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013691 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013692 if (newlen < writer->min_length)
13693 newlen = writer->min_length;
13694
Victor Stinnerd3f08822012-05-29 12:57:52 +020013695 writer->buffer = PyUnicode_New(newlen, maxchar);
13696 if (writer->buffer == NULL)
13697 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013698 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013699 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013700 if (writer->overallocate
13701 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13702 /* overallocate to limit the number of realloc() */
13703 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013704 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013705 if (newlen < writer->min_length)
13706 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013707
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013708 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013709 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013710 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013711 newbuffer = PyUnicode_New(newlen, maxchar);
13712 if (newbuffer == NULL)
13713 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013714 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13715 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013716 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013717 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013718 }
13719 else {
13720 newbuffer = resize_compact(writer->buffer, newlen);
13721 if (newbuffer == NULL)
13722 return -1;
13723 }
13724 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013725 }
13726 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013727 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013728 newbuffer = PyUnicode_New(writer->size, maxchar);
13729 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013730 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013731 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13732 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013733 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013734 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013735 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013736 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013737
13738#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013739}
13740
Victor Stinnerca9381e2015-09-22 00:58:32 +020013741int
13742_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13743 enum PyUnicode_Kind kind)
13744{
13745 Py_UCS4 maxchar;
13746
13747 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13748 assert(writer->kind < kind);
13749
13750 switch (kind)
13751 {
13752 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13753 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13754 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13755 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013756 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013757 }
13758
13759 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13760}
13761
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013762static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013763_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013764{
Victor Stinner2740e462016-09-06 16:58:36 -070013765 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013766 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13767 return -1;
13768 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13769 writer->pos++;
13770 return 0;
13771}
13772
13773int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013774_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13775{
13776 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13777}
13778
13779int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013780_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13781{
13782 Py_UCS4 maxchar;
13783 Py_ssize_t len;
13784
13785 if (PyUnicode_READY(str) == -1)
13786 return -1;
13787 len = PyUnicode_GET_LENGTH(str);
13788 if (len == 0)
13789 return 0;
13790 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13791 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013792 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013793 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013794 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013795 Py_INCREF(str);
13796 writer->buffer = str;
13797 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013798 writer->pos += len;
13799 return 0;
13800 }
13801 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13802 return -1;
13803 }
13804 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13805 str, 0, len);
13806 writer->pos += len;
13807 return 0;
13808}
13809
Victor Stinnere215d962012-10-06 23:03:36 +020013810int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013811_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13812 Py_ssize_t start, Py_ssize_t end)
13813{
13814 Py_UCS4 maxchar;
13815 Py_ssize_t len;
13816
13817 if (PyUnicode_READY(str) == -1)
13818 return -1;
13819
13820 assert(0 <= start);
13821 assert(end <= PyUnicode_GET_LENGTH(str));
13822 assert(start <= end);
13823
13824 if (end == 0)
13825 return 0;
13826
13827 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13828 return _PyUnicodeWriter_WriteStr(writer, str);
13829
13830 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13831 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13832 else
13833 maxchar = writer->maxchar;
13834 len = end - start;
13835
13836 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13837 return -1;
13838
13839 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13840 str, start, len);
13841 writer->pos += len;
13842 return 0;
13843}
13844
13845int
Victor Stinner4a587072013-11-19 12:54:53 +010013846_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13847 const char *ascii, Py_ssize_t len)
13848{
13849 if (len == -1)
13850 len = strlen(ascii);
13851
Andy Lestere6be9b52020-02-11 20:28:35 -060013852 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010013853
13854 if (writer->buffer == NULL && !writer->overallocate) {
13855 PyObject *str;
13856
13857 str = _PyUnicode_FromASCII(ascii, len);
13858 if (str == NULL)
13859 return -1;
13860
13861 writer->readonly = 1;
13862 writer->buffer = str;
13863 _PyUnicodeWriter_Update(writer);
13864 writer->pos += len;
13865 return 0;
13866 }
13867
13868 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13869 return -1;
13870
13871 switch (writer->kind)
13872 {
13873 case PyUnicode_1BYTE_KIND:
13874 {
13875 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13876 Py_UCS1 *data = writer->data;
13877
Christian Heimesf051e432016-09-13 20:22:02 +020013878 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013879 break;
13880 }
13881 case PyUnicode_2BYTE_KIND:
13882 {
13883 _PyUnicode_CONVERT_BYTES(
13884 Py_UCS1, Py_UCS2,
13885 ascii, ascii + len,
13886 (Py_UCS2 *)writer->data + writer->pos);
13887 break;
13888 }
13889 case PyUnicode_4BYTE_KIND:
13890 {
13891 _PyUnicode_CONVERT_BYTES(
13892 Py_UCS1, Py_UCS4,
13893 ascii, ascii + len,
13894 (Py_UCS4 *)writer->data + writer->pos);
13895 break;
13896 }
13897 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013898 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013899 }
13900
13901 writer->pos += len;
13902 return 0;
13903}
13904
13905int
13906_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13907 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013908{
13909 Py_UCS4 maxchar;
13910
Andy Lestere6be9b52020-02-11 20:28:35 -060013911 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020013912 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13913 return -1;
13914 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13915 writer->pos += len;
13916 return 0;
13917}
13918
Victor Stinnerd3f08822012-05-29 12:57:52 +020013919PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013920_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013921{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013922 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013923
Victor Stinnerd3f08822012-05-29 12:57:52 +020013924 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013925 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013926 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013927 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013928
13929 str = writer->buffer;
13930 writer->buffer = NULL;
13931
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013932 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013933 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13934 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013935 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013936
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013937 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13938 PyObject *str2;
13939 str2 = resize_compact(str, writer->pos);
13940 if (str2 == NULL) {
13941 Py_DECREF(str);
13942 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013943 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013944 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013945 }
13946
Victor Stinner15a0bd32013-07-08 22:29:55 +020013947 assert(_PyUnicode_CheckConsistency(str, 1));
13948 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013949}
13950
Victor Stinnerd3f08822012-05-29 12:57:52 +020013951void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013952_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013953{
13954 Py_CLEAR(writer->buffer);
13955}
13956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013957#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013958
13959PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013960 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013961\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013962Return a formatted version of S, using substitutions from args and kwargs.\n\
13963The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013964
Eric Smith27bbca62010-11-04 17:06:58 +000013965PyDoc_STRVAR(format_map__doc__,
13966 "S.format_map(mapping) -> str\n\
13967\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013968Return a formatted version of S, using substitutions from mapping.\n\
13969The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013970
INADA Naoki3ae20562017-01-16 20:41:20 +090013971/*[clinic input]
13972str.__format__ as unicode___format__
13973
13974 format_spec: unicode
13975 /
13976
13977Return a formatted version of the string as described by format_spec.
13978[clinic start generated code]*/
13979
Eric Smith4a7d76d2008-05-30 18:10:19 +000013980static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013981unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013982/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013983{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013984 _PyUnicodeWriter writer;
13985 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013986
Victor Stinnerd3f08822012-05-29 12:57:52 +020013987 if (PyUnicode_READY(self) == -1)
13988 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013989 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013990 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13991 self, format_spec, 0,
13992 PyUnicode_GET_LENGTH(format_spec));
13993 if (ret == -1) {
13994 _PyUnicodeWriter_Dealloc(&writer);
13995 return NULL;
13996 }
13997 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013998}
13999
INADA Naoki3ae20562017-01-16 20:41:20 +090014000/*[clinic input]
14001str.__sizeof__ as unicode_sizeof
14002
14003Return the size of the string in memory, in bytes.
14004[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014005
14006static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014007unicode_sizeof_impl(PyObject *self)
14008/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014009{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014010 Py_ssize_t size;
14011
14012 /* If it's a compact object, account for base structure +
14013 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014014 if (PyUnicode_IS_COMPACT_ASCII(self))
14015 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14016 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014017 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014018 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014019 else {
14020 /* If it is a two-block object, account for base object, and
14021 for character block if present. */
14022 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014023 if (_PyUnicode_DATA_ANY(self))
14024 size += (PyUnicode_GET_LENGTH(self) + 1) *
14025 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014026 }
14027 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014028 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014029 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14030 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14031 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14032 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014033
14034 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014035}
14036
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014037static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014038unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014039{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014040 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014041 if (!copy)
14042 return NULL;
14043 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014044}
14045
Guido van Rossumd57fd912000-03-10 22:53:23 +000014046static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014047 UNICODE_ENCODE_METHODDEF
14048 UNICODE_REPLACE_METHODDEF
14049 UNICODE_SPLIT_METHODDEF
14050 UNICODE_RSPLIT_METHODDEF
14051 UNICODE_JOIN_METHODDEF
14052 UNICODE_CAPITALIZE_METHODDEF
14053 UNICODE_CASEFOLD_METHODDEF
14054 UNICODE_TITLE_METHODDEF
14055 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014056 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014057 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014058 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014059 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014060 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014061 UNICODE_LJUST_METHODDEF
14062 UNICODE_LOWER_METHODDEF
14063 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014064 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14065 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014066 UNICODE_RJUST_METHODDEF
14067 UNICODE_RSTRIP_METHODDEF
14068 UNICODE_RPARTITION_METHODDEF
14069 UNICODE_SPLITLINES_METHODDEF
14070 UNICODE_STRIP_METHODDEF
14071 UNICODE_SWAPCASE_METHODDEF
14072 UNICODE_TRANSLATE_METHODDEF
14073 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014074 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14075 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090014076 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014077 UNICODE_ISLOWER_METHODDEF
14078 UNICODE_ISUPPER_METHODDEF
14079 UNICODE_ISTITLE_METHODDEF
14080 UNICODE_ISSPACE_METHODDEF
14081 UNICODE_ISDECIMAL_METHODDEF
14082 UNICODE_ISDIGIT_METHODDEF
14083 UNICODE_ISNUMERIC_METHODDEF
14084 UNICODE_ISALPHA_METHODDEF
14085 UNICODE_ISALNUM_METHODDEF
14086 UNICODE_ISIDENTIFIER_METHODDEF
14087 UNICODE_ISPRINTABLE_METHODDEF
14088 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014089 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014090 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014091 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014092 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014093 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014094#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014095 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014096 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014097#endif
14098
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014099 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014100 {NULL, NULL}
14101};
14102
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014103static PyObject *
14104unicode_mod(PyObject *v, PyObject *w)
14105{
Brian Curtindfc80e32011-08-10 20:28:54 -050014106 if (!PyUnicode_Check(v))
14107 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014108 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014109}
14110
14111static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014112 0, /*nb_add*/
14113 0, /*nb_subtract*/
14114 0, /*nb_multiply*/
14115 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014116};
14117
Guido van Rossumd57fd912000-03-10 22:53:23 +000014118static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014119 (lenfunc) unicode_length, /* sq_length */
14120 PyUnicode_Concat, /* sq_concat */
14121 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14122 (ssizeargfunc) unicode_getitem, /* sq_item */
14123 0, /* sq_slice */
14124 0, /* sq_ass_item */
14125 0, /* sq_ass_slice */
14126 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014127};
14128
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014129static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014130unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014132 if (PyUnicode_READY(self) == -1)
14133 return NULL;
14134
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014135 if (PyIndex_Check(item)) {
14136 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014137 if (i == -1 && PyErr_Occurred())
14138 return NULL;
14139 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014140 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014141 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014142 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014143 Py_ssize_t start, stop, step, slicelength, i;
14144 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014145 PyObject *result;
14146 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014147 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014148 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014149
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014150 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014151 return NULL;
14152 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014153 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14154 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014155
14156 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014157 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014158 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014159 slicelength == PyUnicode_GET_LENGTH(self)) {
14160 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014161 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014162 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014163 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014164 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014165 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014166 src_kind = PyUnicode_KIND(self);
14167 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014168 if (!PyUnicode_IS_ASCII(self)) {
14169 kind_limit = kind_maxchar_limit(src_kind);
14170 max_char = 0;
14171 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14172 ch = PyUnicode_READ(src_kind, src_data, cur);
14173 if (ch > max_char) {
14174 max_char = ch;
14175 if (max_char >= kind_limit)
14176 break;
14177 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014178 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014179 }
Victor Stinner55c99112011-10-13 01:17:06 +020014180 else
14181 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014182 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014183 if (result == NULL)
14184 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014185 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014186 dest_data = PyUnicode_DATA(result);
14187
14188 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014189 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14190 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014191 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014192 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014193 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014194 } else {
14195 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14196 return NULL;
14197 }
14198}
14199
14200static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014201 (lenfunc)unicode_length, /* mp_length */
14202 (binaryfunc)unicode_subscript, /* mp_subscript */
14203 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014204};
14205
Guido van Rossumd57fd912000-03-10 22:53:23 +000014206
Guido van Rossumd57fd912000-03-10 22:53:23 +000014207/* Helpers for PyUnicode_Format() */
14208
Victor Stinnera47082312012-10-04 02:19:54 +020014209struct unicode_formatter_t {
14210 PyObject *args;
14211 int args_owned;
14212 Py_ssize_t arglen, argidx;
14213 PyObject *dict;
14214
14215 enum PyUnicode_Kind fmtkind;
14216 Py_ssize_t fmtcnt, fmtpos;
14217 void *fmtdata;
14218 PyObject *fmtstr;
14219
14220 _PyUnicodeWriter writer;
14221};
14222
14223struct unicode_format_arg_t {
14224 Py_UCS4 ch;
14225 int flags;
14226 Py_ssize_t width;
14227 int prec;
14228 int sign;
14229};
14230
Guido van Rossumd57fd912000-03-10 22:53:23 +000014231static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014232unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014233{
Victor Stinnera47082312012-10-04 02:19:54 +020014234 Py_ssize_t argidx = ctx->argidx;
14235
14236 if (argidx < ctx->arglen) {
14237 ctx->argidx++;
14238 if (ctx->arglen < 0)
14239 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014240 else
Victor Stinnera47082312012-10-04 02:19:54 +020014241 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014242 }
14243 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014244 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014245 return NULL;
14246}
14247
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014248/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014249
Victor Stinnera47082312012-10-04 02:19:54 +020014250/* Format a float into the writer if the writer is not NULL, or into *p_output
14251 otherwise.
14252
14253 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014254static int
Victor Stinnera47082312012-10-04 02:19:54 +020014255formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14256 PyObject **p_output,
14257 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014258{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014259 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014260 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014261 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014262 int prec;
14263 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014264
Guido van Rossumd57fd912000-03-10 22:53:23 +000014265 x = PyFloat_AsDouble(v);
14266 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014267 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014268
Victor Stinnera47082312012-10-04 02:19:54 +020014269 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014270 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014271 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014272
Victor Stinnera47082312012-10-04 02:19:54 +020014273 if (arg->flags & F_ALT)
14274 dtoa_flags = Py_DTSF_ALT;
14275 else
14276 dtoa_flags = 0;
14277 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014278 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014279 return -1;
14280 len = strlen(p);
14281 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014282 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014283 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014284 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014285 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014286 }
14287 else
14288 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014289 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014290 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014291}
14292
Victor Stinnerd0880d52012-04-27 23:40:13 +020014293/* formatlong() emulates the format codes d, u, o, x and X, and
14294 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14295 * Python's regular ints.
14296 * Return value: a new PyUnicodeObject*, or NULL if error.
14297 * The output string is of the form
14298 * "-"? ("0x" | "0X")? digit+
14299 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14300 * set in flags. The case of hex digits will be correct,
14301 * There will be at least prec digits, zero-filled on the left if
14302 * necessary to get that many.
14303 * val object to be converted
14304 * flags bitmask of format flags; only F_ALT is looked at
14305 * prec minimum number of digits; 0-fill on left if needed
14306 * type a character in [duoxX]; u acts the same as d
14307 *
14308 * CAUTION: o, x and X conversions on regular ints can never
14309 * produce a '-' sign, but can for Python's unbounded ints.
14310 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014311PyObject *
14312_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014313{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014314 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014315 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014316 Py_ssize_t i;
14317 int sign; /* 1 if '-', else 0 */
14318 int len; /* number of characters */
14319 Py_ssize_t llen;
14320 int numdigits; /* len == numnondigits + numdigits */
14321 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014322
Victor Stinnerd0880d52012-04-27 23:40:13 +020014323 /* Avoid exceeding SSIZE_T_MAX */
14324 if (prec > INT_MAX-3) {
14325 PyErr_SetString(PyExc_OverflowError,
14326 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014327 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014328 }
14329
14330 assert(PyLong_Check(val));
14331
14332 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014333 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014334 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014335 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014336 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014337 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014338 /* int and int subclasses should print numerically when a numeric */
14339 /* format code is used (see issue18780) */
14340 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014341 break;
14342 case 'o':
14343 numnondigits = 2;
14344 result = PyNumber_ToBase(val, 8);
14345 break;
14346 case 'x':
14347 case 'X':
14348 numnondigits = 2;
14349 result = PyNumber_ToBase(val, 16);
14350 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014351 }
14352 if (!result)
14353 return NULL;
14354
14355 assert(unicode_modifiable(result));
14356 assert(PyUnicode_IS_READY(result));
14357 assert(PyUnicode_IS_ASCII(result));
14358
14359 /* To modify the string in-place, there can only be one reference. */
14360 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014361 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014362 PyErr_BadInternalCall();
14363 return NULL;
14364 }
14365 buf = PyUnicode_DATA(result);
14366 llen = PyUnicode_GET_LENGTH(result);
14367 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014368 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014369 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014370 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014371 return NULL;
14372 }
14373 len = (int)llen;
14374 sign = buf[0] == '-';
14375 numnondigits += sign;
14376 numdigits = len - numnondigits;
14377 assert(numdigits > 0);
14378
14379 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014380 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014381 (type == 'o' || type == 'x' || type == 'X'))) {
14382 assert(buf[sign] == '0');
14383 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14384 buf[sign+1] == 'o');
14385 numnondigits -= 2;
14386 buf += 2;
14387 len -= 2;
14388 if (sign)
14389 buf[0] = '-';
14390 assert(len == numnondigits + numdigits);
14391 assert(numdigits > 0);
14392 }
14393
14394 /* Fill with leading zeroes to meet minimum width. */
14395 if (prec > numdigits) {
14396 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14397 numnondigits + prec);
14398 char *b1;
14399 if (!r1) {
14400 Py_DECREF(result);
14401 return NULL;
14402 }
14403 b1 = PyBytes_AS_STRING(r1);
14404 for (i = 0; i < numnondigits; ++i)
14405 *b1++ = *buf++;
14406 for (i = 0; i < prec - numdigits; i++)
14407 *b1++ = '0';
14408 for (i = 0; i < numdigits; i++)
14409 *b1++ = *buf++;
14410 *b1 = '\0';
14411 Py_DECREF(result);
14412 result = r1;
14413 buf = PyBytes_AS_STRING(result);
14414 len = numnondigits + prec;
14415 }
14416
14417 /* Fix up case for hex conversions. */
14418 if (type == 'X') {
14419 /* Need to convert all lower case letters to upper case.
14420 and need to convert 0x to 0X (and -0x to -0X). */
14421 for (i = 0; i < len; i++)
14422 if (buf[i] >= 'a' && buf[i] <= 'x')
14423 buf[i] -= 'a'-'A';
14424 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014425 if (!PyUnicode_Check(result)
14426 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014427 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014428 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014429 Py_DECREF(result);
14430 result = unicode;
14431 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014432 else if (len != PyUnicode_GET_LENGTH(result)) {
14433 if (PyUnicode_Resize(&result, len) < 0)
14434 Py_CLEAR(result);
14435 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014436 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014437}
14438
Ethan Furmandf3ed242014-01-05 06:50:30 -080014439/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014440 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014441 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014442 * -1 and raise an exception on error */
14443static int
Victor Stinnera47082312012-10-04 02:19:54 +020014444mainformatlong(PyObject *v,
14445 struct unicode_format_arg_t *arg,
14446 PyObject **p_output,
14447 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014448{
14449 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014450 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014451
14452 if (!PyNumber_Check(v))
14453 goto wrongtype;
14454
Ethan Furman9ab74802014-03-21 06:38:46 -070014455 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014456 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014457 if (type == 'o' || type == 'x' || type == 'X') {
14458 iobj = PyNumber_Index(v);
14459 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014460 if (PyErr_ExceptionMatches(PyExc_TypeError))
14461 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014462 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014463 }
14464 }
14465 else {
14466 iobj = PyNumber_Long(v);
14467 if (iobj == NULL ) {
14468 if (PyErr_ExceptionMatches(PyExc_TypeError))
14469 goto wrongtype;
14470 return -1;
14471 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014472 }
14473 assert(PyLong_Check(iobj));
14474 }
14475 else {
14476 iobj = v;
14477 Py_INCREF(iobj);
14478 }
14479
14480 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014481 && arg->width == -1 && arg->prec == -1
14482 && !(arg->flags & (F_SIGN | F_BLANK))
14483 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014484 {
14485 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014486 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014487 int base;
14488
Victor Stinnera47082312012-10-04 02:19:54 +020014489 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014490 {
14491 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014492 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014493 case 'd':
14494 case 'i':
14495 case 'u':
14496 base = 10;
14497 break;
14498 case 'o':
14499 base = 8;
14500 break;
14501 case 'x':
14502 case 'X':
14503 base = 16;
14504 break;
14505 }
14506
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014507 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14508 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014509 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014510 }
14511 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014512 return 1;
14513 }
14514
Ethan Furmanb95b5612015-01-23 20:05:18 -080014515 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014516 Py_DECREF(iobj);
14517 if (res == NULL)
14518 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014519 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014520 return 0;
14521
14522wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014523 switch(type)
14524 {
14525 case 'o':
14526 case 'x':
14527 case 'X':
14528 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014529 "%%%c format: an integer is required, "
14530 "not %.200s",
14531 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014532 break;
14533 default:
14534 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014535 "%%%c format: a number is required, "
14536 "not %.200s",
14537 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014538 break;
14539 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014540 return -1;
14541}
14542
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014543static Py_UCS4
14544formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014545{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014546 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014547 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014548 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014549 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014550 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014551 goto onError;
14552 }
14553 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014554 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014555 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014556 /* make sure number is a type of integer */
14557 if (!PyLong_Check(v)) {
14558 iobj = PyNumber_Index(v);
14559 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014560 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014561 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014562 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014563 Py_DECREF(iobj);
14564 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014565 else {
14566 x = PyLong_AsLong(v);
14567 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014568 if (x == -1 && PyErr_Occurred())
14569 goto onError;
14570
Victor Stinner8faf8212011-12-08 22:14:11 +010014571 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014572 PyErr_SetString(PyExc_OverflowError,
14573 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014574 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014575 }
14576
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014577 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014578 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014579
Benjamin Peterson29060642009-01-31 22:14:21 +000014580 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014581 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014582 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014583 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014584}
14585
Victor Stinnera47082312012-10-04 02:19:54 +020014586/* Parse options of an argument: flags, width, precision.
14587 Handle also "%(name)" syntax.
14588
14589 Return 0 if the argument has been formatted into arg->str.
14590 Return 1 if the argument has been written into ctx->writer,
14591 Raise an exception and return -1 on error. */
14592static int
14593unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14594 struct unicode_format_arg_t *arg)
14595{
14596#define FORMAT_READ(ctx) \
14597 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14598
14599 PyObject *v;
14600
Victor Stinnera47082312012-10-04 02:19:54 +020014601 if (arg->ch == '(') {
14602 /* Get argument value from a dictionary. Example: "%(name)s". */
14603 Py_ssize_t keystart;
14604 Py_ssize_t keylen;
14605 PyObject *key;
14606 int pcount = 1;
14607
14608 if (ctx->dict == NULL) {
14609 PyErr_SetString(PyExc_TypeError,
14610 "format requires a mapping");
14611 return -1;
14612 }
14613 ++ctx->fmtpos;
14614 --ctx->fmtcnt;
14615 keystart = ctx->fmtpos;
14616 /* Skip over balanced parentheses */
14617 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14618 arg->ch = FORMAT_READ(ctx);
14619 if (arg->ch == ')')
14620 --pcount;
14621 else if (arg->ch == '(')
14622 ++pcount;
14623 ctx->fmtpos++;
14624 }
14625 keylen = ctx->fmtpos - keystart - 1;
14626 if (ctx->fmtcnt < 0 || pcount > 0) {
14627 PyErr_SetString(PyExc_ValueError,
14628 "incomplete format key");
14629 return -1;
14630 }
14631 key = PyUnicode_Substring(ctx->fmtstr,
14632 keystart, keystart + keylen);
14633 if (key == NULL)
14634 return -1;
14635 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014636 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014637 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014638 }
14639 ctx->args = PyObject_GetItem(ctx->dict, key);
14640 Py_DECREF(key);
14641 if (ctx->args == NULL)
14642 return -1;
14643 ctx->args_owned = 1;
14644 ctx->arglen = -1;
14645 ctx->argidx = -2;
14646 }
14647
14648 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014649 while (--ctx->fmtcnt >= 0) {
14650 arg->ch = FORMAT_READ(ctx);
14651 ctx->fmtpos++;
14652 switch (arg->ch) {
14653 case '-': arg->flags |= F_LJUST; continue;
14654 case '+': arg->flags |= F_SIGN; continue;
14655 case ' ': arg->flags |= F_BLANK; continue;
14656 case '#': arg->flags |= F_ALT; continue;
14657 case '0': arg->flags |= F_ZERO; continue;
14658 }
14659 break;
14660 }
14661
14662 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014663 if (arg->ch == '*') {
14664 v = unicode_format_getnextarg(ctx);
14665 if (v == NULL)
14666 return -1;
14667 if (!PyLong_Check(v)) {
14668 PyErr_SetString(PyExc_TypeError,
14669 "* wants int");
14670 return -1;
14671 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014672 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014673 if (arg->width == -1 && PyErr_Occurred())
14674 return -1;
14675 if (arg->width < 0) {
14676 arg->flags |= F_LJUST;
14677 arg->width = -arg->width;
14678 }
14679 if (--ctx->fmtcnt >= 0) {
14680 arg->ch = FORMAT_READ(ctx);
14681 ctx->fmtpos++;
14682 }
14683 }
14684 else if (arg->ch >= '0' && arg->ch <= '9') {
14685 arg->width = arg->ch - '0';
14686 while (--ctx->fmtcnt >= 0) {
14687 arg->ch = FORMAT_READ(ctx);
14688 ctx->fmtpos++;
14689 if (arg->ch < '0' || arg->ch > '9')
14690 break;
14691 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14692 mixing signed and unsigned comparison. Since arg->ch is between
14693 '0' and '9', casting to int is safe. */
14694 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14695 PyErr_SetString(PyExc_ValueError,
14696 "width too big");
14697 return -1;
14698 }
14699 arg->width = arg->width*10 + (arg->ch - '0');
14700 }
14701 }
14702
14703 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014704 if (arg->ch == '.') {
14705 arg->prec = 0;
14706 if (--ctx->fmtcnt >= 0) {
14707 arg->ch = FORMAT_READ(ctx);
14708 ctx->fmtpos++;
14709 }
14710 if (arg->ch == '*') {
14711 v = unicode_format_getnextarg(ctx);
14712 if (v == NULL)
14713 return -1;
14714 if (!PyLong_Check(v)) {
14715 PyErr_SetString(PyExc_TypeError,
14716 "* wants int");
14717 return -1;
14718 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014719 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014720 if (arg->prec == -1 && PyErr_Occurred())
14721 return -1;
14722 if (arg->prec < 0)
14723 arg->prec = 0;
14724 if (--ctx->fmtcnt >= 0) {
14725 arg->ch = FORMAT_READ(ctx);
14726 ctx->fmtpos++;
14727 }
14728 }
14729 else if (arg->ch >= '0' && arg->ch <= '9') {
14730 arg->prec = arg->ch - '0';
14731 while (--ctx->fmtcnt >= 0) {
14732 arg->ch = FORMAT_READ(ctx);
14733 ctx->fmtpos++;
14734 if (arg->ch < '0' || arg->ch > '9')
14735 break;
14736 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14737 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014738 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014739 return -1;
14740 }
14741 arg->prec = arg->prec*10 + (arg->ch - '0');
14742 }
14743 }
14744 }
14745
14746 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14747 if (ctx->fmtcnt >= 0) {
14748 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14749 if (--ctx->fmtcnt >= 0) {
14750 arg->ch = FORMAT_READ(ctx);
14751 ctx->fmtpos++;
14752 }
14753 }
14754 }
14755 if (ctx->fmtcnt < 0) {
14756 PyErr_SetString(PyExc_ValueError,
14757 "incomplete format");
14758 return -1;
14759 }
14760 return 0;
14761
14762#undef FORMAT_READ
14763}
14764
14765/* Format one argument. Supported conversion specifiers:
14766
14767 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014768 - "i", "d", "u": int or float
14769 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014770 - "e", "E", "f", "F", "g", "G": float
14771 - "c": int or str (1 character)
14772
Victor Stinner8dbd4212012-12-04 09:30:24 +010014773 When possible, the output is written directly into the Unicode writer
14774 (ctx->writer). A string is created when padding is required.
14775
Victor Stinnera47082312012-10-04 02:19:54 +020014776 Return 0 if the argument has been formatted into *p_str,
14777 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014778 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014779static int
14780unicode_format_arg_format(struct unicode_formatter_t *ctx,
14781 struct unicode_format_arg_t *arg,
14782 PyObject **p_str)
14783{
14784 PyObject *v;
14785 _PyUnicodeWriter *writer = &ctx->writer;
14786
14787 if (ctx->fmtcnt == 0)
14788 ctx->writer.overallocate = 0;
14789
Victor Stinnera47082312012-10-04 02:19:54 +020014790 v = unicode_format_getnextarg(ctx);
14791 if (v == NULL)
14792 return -1;
14793
Victor Stinnera47082312012-10-04 02:19:54 +020014794
14795 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014796 case 's':
14797 case 'r':
14798 case 'a':
14799 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14800 /* Fast path */
14801 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14802 return -1;
14803 return 1;
14804 }
14805
14806 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14807 *p_str = v;
14808 Py_INCREF(*p_str);
14809 }
14810 else {
14811 if (arg->ch == 's')
14812 *p_str = PyObject_Str(v);
14813 else if (arg->ch == 'r')
14814 *p_str = PyObject_Repr(v);
14815 else
14816 *p_str = PyObject_ASCII(v);
14817 }
14818 break;
14819
14820 case 'i':
14821 case 'd':
14822 case 'u':
14823 case 'o':
14824 case 'x':
14825 case 'X':
14826 {
14827 int ret = mainformatlong(v, arg, p_str, writer);
14828 if (ret != 0)
14829 return ret;
14830 arg->sign = 1;
14831 break;
14832 }
14833
14834 case 'e':
14835 case 'E':
14836 case 'f':
14837 case 'F':
14838 case 'g':
14839 case 'G':
14840 if (arg->width == -1 && arg->prec == -1
14841 && !(arg->flags & (F_SIGN | F_BLANK)))
14842 {
14843 /* Fast path */
14844 if (formatfloat(v, arg, NULL, writer) == -1)
14845 return -1;
14846 return 1;
14847 }
14848
14849 arg->sign = 1;
14850 if (formatfloat(v, arg, p_str, NULL) == -1)
14851 return -1;
14852 break;
14853
14854 case 'c':
14855 {
14856 Py_UCS4 ch = formatchar(v);
14857 if (ch == (Py_UCS4) -1)
14858 return -1;
14859 if (arg->width == -1 && arg->prec == -1) {
14860 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014861 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014862 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014863 return 1;
14864 }
14865 *p_str = PyUnicode_FromOrdinal(ch);
14866 break;
14867 }
14868
14869 default:
14870 PyErr_Format(PyExc_ValueError,
14871 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014872 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014873 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14874 (int)arg->ch,
14875 ctx->fmtpos - 1);
14876 return -1;
14877 }
14878 if (*p_str == NULL)
14879 return -1;
14880 assert (PyUnicode_Check(*p_str));
14881 return 0;
14882}
14883
14884static int
14885unicode_format_arg_output(struct unicode_formatter_t *ctx,
14886 struct unicode_format_arg_t *arg,
14887 PyObject *str)
14888{
14889 Py_ssize_t len;
14890 enum PyUnicode_Kind kind;
14891 void *pbuf;
14892 Py_ssize_t pindex;
14893 Py_UCS4 signchar;
14894 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014895 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014896 Py_ssize_t sublen;
14897 _PyUnicodeWriter *writer = &ctx->writer;
14898 Py_UCS4 fill;
14899
14900 fill = ' ';
14901 if (arg->sign && arg->flags & F_ZERO)
14902 fill = '0';
14903
14904 if (PyUnicode_READY(str) == -1)
14905 return -1;
14906
14907 len = PyUnicode_GET_LENGTH(str);
14908 if ((arg->width == -1 || arg->width <= len)
14909 && (arg->prec == -1 || arg->prec >= len)
14910 && !(arg->flags & (F_SIGN | F_BLANK)))
14911 {
14912 /* Fast path */
14913 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14914 return -1;
14915 return 0;
14916 }
14917
14918 /* Truncate the string for "s", "r" and "a" formats
14919 if the precision is set */
14920 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14921 if (arg->prec >= 0 && len > arg->prec)
14922 len = arg->prec;
14923 }
14924
14925 /* Adjust sign and width */
14926 kind = PyUnicode_KIND(str);
14927 pbuf = PyUnicode_DATA(str);
14928 pindex = 0;
14929 signchar = '\0';
14930 if (arg->sign) {
14931 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14932 if (ch == '-' || ch == '+') {
14933 signchar = ch;
14934 len--;
14935 pindex++;
14936 }
14937 else if (arg->flags & F_SIGN)
14938 signchar = '+';
14939 else if (arg->flags & F_BLANK)
14940 signchar = ' ';
14941 else
14942 arg->sign = 0;
14943 }
14944 if (arg->width < len)
14945 arg->width = len;
14946
14947 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014948 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014949 if (!(arg->flags & F_LJUST)) {
14950 if (arg->sign) {
14951 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014952 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014953 }
14954 else {
14955 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014956 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014957 }
14958 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014959 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14960 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014961 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014962 }
14963
Victor Stinnera47082312012-10-04 02:19:54 +020014964 buflen = arg->width;
14965 if (arg->sign && len == arg->width)
14966 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014967 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014968 return -1;
14969
14970 /* Write the sign if needed */
14971 if (arg->sign) {
14972 if (fill != ' ') {
14973 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14974 writer->pos += 1;
14975 }
14976 if (arg->width > len)
14977 arg->width--;
14978 }
14979
14980 /* Write the numeric prefix for "x", "X" and "o" formats
14981 if the alternate form is used.
14982 For example, write "0x" for the "%#x" format. */
14983 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14984 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14985 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14986 if (fill != ' ') {
14987 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14988 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14989 writer->pos += 2;
14990 pindex += 2;
14991 }
14992 arg->width -= 2;
14993 if (arg->width < 0)
14994 arg->width = 0;
14995 len -= 2;
14996 }
14997
14998 /* Pad left with the fill character if needed */
14999 if (arg->width > len && !(arg->flags & F_LJUST)) {
15000 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015001 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015002 writer->pos += sublen;
15003 arg->width = len;
15004 }
15005
15006 /* If padding with spaces: write sign if needed and/or numeric prefix if
15007 the alternate form is used */
15008 if (fill == ' ') {
15009 if (arg->sign) {
15010 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15011 writer->pos += 1;
15012 }
15013 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15014 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15015 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15016 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15017 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15018 writer->pos += 2;
15019 pindex += 2;
15020 }
15021 }
15022
15023 /* Write characters */
15024 if (len) {
15025 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15026 str, pindex, len);
15027 writer->pos += len;
15028 }
15029
15030 /* Pad right with the fill character if needed */
15031 if (arg->width > len) {
15032 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015033 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015034 writer->pos += sublen;
15035 }
15036 return 0;
15037}
15038
15039/* Helper of PyUnicode_Format(): format one arg.
15040 Return 0 on success, raise an exception and return -1 on error. */
15041static int
15042unicode_format_arg(struct unicode_formatter_t *ctx)
15043{
15044 struct unicode_format_arg_t arg;
15045 PyObject *str;
15046 int ret;
15047
Victor Stinner8dbd4212012-12-04 09:30:24 +010015048 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015049 if (arg.ch == '%') {
15050 ctx->fmtpos++;
15051 ctx->fmtcnt--;
15052 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15053 return -1;
15054 return 0;
15055 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015056 arg.flags = 0;
15057 arg.width = -1;
15058 arg.prec = -1;
15059 arg.sign = 0;
15060 str = NULL;
15061
Victor Stinnera47082312012-10-04 02:19:54 +020015062 ret = unicode_format_arg_parse(ctx, &arg);
15063 if (ret == -1)
15064 return -1;
15065
15066 ret = unicode_format_arg_format(ctx, &arg, &str);
15067 if (ret == -1)
15068 return -1;
15069
15070 if (ret != 1) {
15071 ret = unicode_format_arg_output(ctx, &arg, str);
15072 Py_DECREF(str);
15073 if (ret == -1)
15074 return -1;
15075 }
15076
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015077 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015078 PyErr_SetString(PyExc_TypeError,
15079 "not all arguments converted during string formatting");
15080 return -1;
15081 }
15082 return 0;
15083}
15084
Alexander Belopolsky40018472011-02-26 01:02:56 +000015085PyObject *
15086PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015087{
Victor Stinnera47082312012-10-04 02:19:54 +020015088 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015089
Guido van Rossumd57fd912000-03-10 22:53:23 +000015090 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015091 PyErr_BadInternalCall();
15092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015093 }
Victor Stinnera47082312012-10-04 02:19:54 +020015094
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015095 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015096 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015097
15098 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015099 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15100 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15101 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15102 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015103
Victor Stinner8f674cc2013-04-17 23:02:17 +020015104 _PyUnicodeWriter_Init(&ctx.writer);
15105 ctx.writer.min_length = ctx.fmtcnt + 100;
15106 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015107
Guido van Rossumd57fd912000-03-10 22:53:23 +000015108 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015109 ctx.arglen = PyTuple_Size(args);
15110 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015111 }
15112 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015113 ctx.arglen = -1;
15114 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015115 }
Victor Stinnera47082312012-10-04 02:19:54 +020015116 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015117 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015118 ctx.dict = args;
15119 else
15120 ctx.dict = NULL;
15121 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015122
Victor Stinnera47082312012-10-04 02:19:54 +020015123 while (--ctx.fmtcnt >= 0) {
15124 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015125 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015126
15127 nonfmtpos = ctx.fmtpos++;
15128 while (ctx.fmtcnt >= 0 &&
15129 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15130 ctx.fmtpos++;
15131 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015132 }
Victor Stinnera47082312012-10-04 02:19:54 +020015133 if (ctx.fmtcnt < 0) {
15134 ctx.fmtpos--;
15135 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015136 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015137
Victor Stinnercfc4c132013-04-03 01:48:39 +020015138 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15139 nonfmtpos, ctx.fmtpos) < 0)
15140 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015141 }
15142 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015143 ctx.fmtpos++;
15144 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015145 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015146 }
15147 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015148
Victor Stinnera47082312012-10-04 02:19:54 +020015149 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015150 PyErr_SetString(PyExc_TypeError,
15151 "not all arguments converted during string formatting");
15152 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015153 }
15154
Victor Stinnera47082312012-10-04 02:19:54 +020015155 if (ctx.args_owned) {
15156 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015157 }
Victor Stinnera47082312012-10-04 02:19:54 +020015158 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015159
Benjamin Peterson29060642009-01-31 22:14:21 +000015160 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015161 _PyUnicodeWriter_Dealloc(&ctx.writer);
15162 if (ctx.args_owned) {
15163 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015164 }
15165 return NULL;
15166}
15167
Jeremy Hylton938ace62002-07-17 16:30:39 +000015168static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015169unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15170
Tim Peters6d6c1a32001-08-02 04:15:00 +000015171static PyObject *
15172unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15173{
Benjamin Peterson29060642009-01-31 22:14:21 +000015174 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015175 static char *kwlist[] = {"object", "encoding", "errors", 0};
15176 char *encoding = NULL;
15177 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015178
Benjamin Peterson14339b62009-01-31 16:36:08 +000015179 if (type != &PyUnicode_Type)
15180 return unicode_subtype_new(type, args, kwds);
15181 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015182 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015183 return NULL;
15184 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015185 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015186 if (encoding == NULL && errors == NULL)
15187 return PyObject_Str(x);
15188 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015189 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015190}
15191
Guido van Rossume023fe02001-08-30 03:12:59 +000015192static PyObject *
15193unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15194{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015195 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015196 Py_ssize_t length, char_size;
15197 int share_wstr, share_utf8;
15198 unsigned int kind;
15199 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015200
Benjamin Peterson14339b62009-01-31 16:36:08 +000015201 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015202
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015203 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015204 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015205 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015206 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015207 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015208 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015209 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015210 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015211
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015212 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015213 if (self == NULL) {
15214 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015215 return NULL;
15216 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015217 kind = PyUnicode_KIND(unicode);
15218 length = PyUnicode_GET_LENGTH(unicode);
15219
15220 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015221#ifdef Py_DEBUG
15222 _PyUnicode_HASH(self) = -1;
15223#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015224 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015225#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015226 _PyUnicode_STATE(self).interned = 0;
15227 _PyUnicode_STATE(self).kind = kind;
15228 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015229 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015230 _PyUnicode_STATE(self).ready = 1;
15231 _PyUnicode_WSTR(self) = NULL;
15232 _PyUnicode_UTF8_LENGTH(self) = 0;
15233 _PyUnicode_UTF8(self) = NULL;
15234 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015235 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015236
15237 share_utf8 = 0;
15238 share_wstr = 0;
15239 if (kind == PyUnicode_1BYTE_KIND) {
15240 char_size = 1;
15241 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15242 share_utf8 = 1;
15243 }
15244 else if (kind == PyUnicode_2BYTE_KIND) {
15245 char_size = 2;
15246 if (sizeof(wchar_t) == 2)
15247 share_wstr = 1;
15248 }
15249 else {
15250 assert(kind == PyUnicode_4BYTE_KIND);
15251 char_size = 4;
15252 if (sizeof(wchar_t) == 4)
15253 share_wstr = 1;
15254 }
15255
15256 /* Ensure we won't overflow the length. */
15257 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15258 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015259 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015260 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015261 data = PyObject_MALLOC((length + 1) * char_size);
15262 if (data == NULL) {
15263 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015264 goto onError;
15265 }
15266
Victor Stinnerc3c74152011-10-02 20:39:55 +020015267 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015268 if (share_utf8) {
15269 _PyUnicode_UTF8_LENGTH(self) = length;
15270 _PyUnicode_UTF8(self) = data;
15271 }
15272 if (share_wstr) {
15273 _PyUnicode_WSTR_LENGTH(self) = length;
15274 _PyUnicode_WSTR(self) = (wchar_t *)data;
15275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015276
Christian Heimesf051e432016-09-13 20:22:02 +020015277 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015278 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015279 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015280#ifdef Py_DEBUG
15281 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15282#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015283 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015284 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015285
15286onError:
15287 Py_DECREF(unicode);
15288 Py_DECREF(self);
15289 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015290}
15291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015292PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015293"str(object='') -> str\n\
15294str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015295\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015296Create a new string object from the given object. If encoding or\n\
15297errors is specified, then the object must expose a data buffer\n\
15298that will be decoded using the given encoding and error handler.\n\
15299Otherwise, returns the result of object.__str__() (if defined)\n\
15300or repr(object).\n\
15301encoding defaults to sys.getdefaultencoding().\n\
15302errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015303
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015304static PyObject *unicode_iter(PyObject *seq);
15305
Guido van Rossumd57fd912000-03-10 22:53:23 +000015306PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015307 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015308 "str", /* tp_name */
15309 sizeof(PyUnicodeObject), /* tp_basicsize */
15310 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015311 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015312 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015313 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015314 0, /* tp_getattr */
15315 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015316 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015317 unicode_repr, /* tp_repr */
15318 &unicode_as_number, /* tp_as_number */
15319 &unicode_as_sequence, /* tp_as_sequence */
15320 &unicode_as_mapping, /* tp_as_mapping */
15321 (hashfunc) unicode_hash, /* tp_hash*/
15322 0, /* tp_call*/
15323 (reprfunc) unicode_str, /* tp_str */
15324 PyObject_GenericGetAttr, /* tp_getattro */
15325 0, /* tp_setattro */
15326 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015327 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015328 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15329 unicode_doc, /* tp_doc */
15330 0, /* tp_traverse */
15331 0, /* tp_clear */
15332 PyUnicode_RichCompare, /* tp_richcompare */
15333 0, /* tp_weaklistoffset */
15334 unicode_iter, /* tp_iter */
15335 0, /* tp_iternext */
15336 unicode_methods, /* tp_methods */
15337 0, /* tp_members */
15338 0, /* tp_getset */
15339 &PyBaseObject_Type, /* tp_base */
15340 0, /* tp_dict */
15341 0, /* tp_descr_get */
15342 0, /* tp_descr_set */
15343 0, /* tp_dictoffset */
15344 0, /* tp_init */
15345 0, /* tp_alloc */
15346 unicode_new, /* tp_new */
15347 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015348};
15349
15350/* Initialize the Unicode implementation */
15351
Victor Stinner331a6a52019-05-27 16:39:22 +020015352PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015353_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015354{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015355 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015356 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015357 0x000A, /* LINE FEED */
15358 0x000D, /* CARRIAGE RETURN */
15359 0x001C, /* FILE SEPARATOR */
15360 0x001D, /* GROUP SEPARATOR */
15361 0x001E, /* RECORD SEPARATOR */
15362 0x0085, /* NEXT LINE */
15363 0x2028, /* LINE SEPARATOR */
15364 0x2029, /* PARAGRAPH SEPARATOR */
15365 };
15366
Fred Drakee4315f52000-05-09 19:53:39 +000015367 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015368 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015369 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015370 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015371 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015372 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015373
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015374 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015375 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015376 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015377
15378 /* initialize the linebreak bloom filter */
15379 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015380 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015381 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015382
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015383 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015384 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015385 }
15386 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015387 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015388 }
15389 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015390 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015391 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015392 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015393}
15394
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015395
Walter Dörwald16807132007-05-25 13:52:07 +000015396void
15397PyUnicode_InternInPlace(PyObject **p)
15398{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015399 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015400 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015401#ifdef Py_DEBUG
15402 assert(s != NULL);
15403 assert(_PyUnicode_CHECK(s));
15404#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015405 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015406 return;
15407#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015408 /* If it's a subclass, we don't really know what putting
15409 it in the interned dict might do. */
15410 if (!PyUnicode_CheckExact(s))
15411 return;
15412 if (PyUnicode_CHECK_INTERNED(s))
15413 return;
15414 if (interned == NULL) {
15415 interned = PyDict_New();
15416 if (interned == NULL) {
15417 PyErr_Clear(); /* Don't leave an exception */
15418 return;
15419 }
15420 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015421 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015422 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015423 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015424 if (t == NULL) {
15425 PyErr_Clear();
15426 return;
15427 }
15428 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015429 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015430 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015431 return;
15432 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015433 /* The two references in interned are not counted by refcnt.
15434 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015435 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015436 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015437}
15438
15439void
15440PyUnicode_InternImmortal(PyObject **p)
15441{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015442 PyUnicode_InternInPlace(p);
15443 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015444 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015445 Py_INCREF(*p);
15446 }
Walter Dörwald16807132007-05-25 13:52:07 +000015447}
15448
15449PyObject *
15450PyUnicode_InternFromString(const char *cp)
15451{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015452 PyObject *s = PyUnicode_FromString(cp);
15453 if (s == NULL)
15454 return NULL;
15455 PyUnicode_InternInPlace(&s);
15456 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015457}
15458
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015459
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo string interning so that a leak detector sees consistent
   reference counts.

   Every key of the `interned` dict gets back the references that
   interning did not count (one for immortal strings, two for mortal
   ones) and is flagged as not interned; the dict is then cleared and
   released.  Does nothing if the dict was never created. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        /* Do not leak a non-list result; Py_XDECREF accepts NULL. */
        Py_XDECREF(keys);
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        /* Reference counts are adjusted with Py_SET_REFCNT(), the same
           way PyUnicode_InternInPlace() removes these references, instead
           of assigning through Py_REFCNT(). */
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015520
15521
15522/********************* Unicode Iterator **************************/
15523
15524typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015525 PyObject_HEAD
15526 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015527 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015528} unicodeiterobject;
15529
15530static void
15531unicodeiter_dealloc(unicodeiterobject *it)
15532{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015533 _PyObject_GC_UNTRACK(it);
15534 Py_XDECREF(it->it_seq);
15535 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015536}
15537
15538static int
15539unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15540{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015541 Py_VISIT(it->it_seq);
15542 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015543}
15544
15545static PyObject *
15546unicodeiter_next(unicodeiterobject *it)
15547{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015548 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015549
Benjamin Peterson14339b62009-01-31 16:36:08 +000015550 assert(it != NULL);
15551 seq = it->it_seq;
15552 if (seq == NULL)
15553 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015554 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015556 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15557 int kind = PyUnicode_KIND(seq);
15558 void *data = PyUnicode_DATA(seq);
15559 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15560 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015561 if (item != NULL)
15562 ++it->it_index;
15563 return item;
15564 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015565
Benjamin Peterson14339b62009-01-31 16:36:08 +000015566 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015567 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015568 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015569}
15570
15571static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015572unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015573{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015574 Py_ssize_t len = 0;
15575 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015576 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015577 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015578}
15579
15580PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15581
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015582static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015583unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015584{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015585 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015586 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015587 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015588 it->it_seq, it->it_index);
15589 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015590 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015591 if (u == NULL)
15592 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015593 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015594 }
15595}
15596
15597PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15598
15599static PyObject *
15600unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15601{
15602 Py_ssize_t index = PyLong_AsSsize_t(state);
15603 if (index == -1 && PyErr_Occurred())
15604 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015605 if (it->it_seq != NULL) {
15606 if (index < 0)
15607 index = 0;
15608 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15609 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15610 it->it_index = index;
15611 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015612 Py_RETURN_NONE;
15613}
15614
15615PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15616
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015617static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015618 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015619 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015620 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15621 reduce_doc},
15622 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15623 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015624 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015625};
15626
15627PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015628 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15629 "str_iterator", /* tp_name */
15630 sizeof(unicodeiterobject), /* tp_basicsize */
15631 0, /* tp_itemsize */
15632 /* methods */
15633 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015634 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015635 0, /* tp_getattr */
15636 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015637 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015638 0, /* tp_repr */
15639 0, /* tp_as_number */
15640 0, /* tp_as_sequence */
15641 0, /* tp_as_mapping */
15642 0, /* tp_hash */
15643 0, /* tp_call */
15644 0, /* tp_str */
15645 PyObject_GenericGetAttr, /* tp_getattro */
15646 0, /* tp_setattro */
15647 0, /* tp_as_buffer */
15648 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15649 0, /* tp_doc */
15650 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15651 0, /* tp_clear */
15652 0, /* tp_richcompare */
15653 0, /* tp_weaklistoffset */
15654 PyObject_SelfIter, /* tp_iter */
15655 (iternextfunc)unicodeiter_next, /* tp_iternext */
15656 unicodeiter_methods, /* tp_methods */
15657 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015658};
15659
15660static PyObject *
15661unicode_iter(PyObject *seq)
15662{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015663 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015664
Benjamin Peterson14339b62009-01-31 16:36:08 +000015665 if (!PyUnicode_Check(seq)) {
15666 PyErr_BadInternalCall();
15667 return NULL;
15668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015669 if (PyUnicode_READY(seq) == -1)
15670 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015671 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15672 if (it == NULL)
15673 return NULL;
15674 it->it_index = 0;
15675 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015676 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015677 _PyObject_GC_TRACK(it);
15678 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015679}
15680
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015681
15682size_t
15683Py_UNICODE_strlen(const Py_UNICODE *u)
15684{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015685 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015686}
15687
15688Py_UNICODE*
15689Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15690{
15691 Py_UNICODE *u = s1;
15692 while ((*u++ = *s2++));
15693 return s1;
15694}
15695
15696Py_UNICODE*
15697Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15698{
15699 Py_UNICODE *u = s1;
15700 while ((*u++ = *s2++))
15701 if (n-- == 0)
15702 break;
15703 return s1;
15704}
15705
15706Py_UNICODE*
15707Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15708{
15709 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015710 u1 += wcslen(u1);
15711 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015712 return s1;
15713}
15714
15715int
15716Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15717{
15718 while (*s1 && *s2 && *s1 == *s2)
15719 s1++, s2++;
15720 if (*s1 && *s2)
15721 return (*s1 < *s2) ? -1 : +1;
15722 if (*s1)
15723 return 1;
15724 if (*s2)
15725 return -1;
15726 return 0;
15727}
15728
15729int
15730Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15731{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015732 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015733 for (; n != 0; n--) {
15734 u1 = *s1;
15735 u2 = *s2;
15736 if (u1 != u2)
15737 return (u1 < u2) ? -1 : +1;
15738 if (u1 == '\0')
15739 return 0;
15740 s1++;
15741 s2++;
15742 }
15743 return 0;
15744}
15745
15746Py_UNICODE*
15747Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15748{
15749 const Py_UNICODE *p;
15750 for (p = s; *p; p++)
15751 if (*p == c)
15752 return (Py_UNICODE*)p;
15753 return NULL;
15754}
15755
15756Py_UNICODE*
15757Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15758{
15759 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015760 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015761 while (p != s) {
15762 p--;
15763 if (*p == c)
15764 return (Py_UNICODE*)p;
15765 }
15766 return NULL;
15767}
Victor Stinner331ea922010-08-10 16:37:20 +000015768
Victor Stinner71133ff2010-09-01 23:43:53 +000015769Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015770PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015771{
Victor Stinner577db2c2011-10-11 22:12:48 +020015772 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015773 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015775 if (!PyUnicode_Check(unicode)) {
15776 PyErr_BadArgument();
15777 return NULL;
15778 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015779 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015780 if (u == NULL)
15781 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015782 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015783 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015784 PyErr_NoMemory();
15785 return NULL;
15786 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015787 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015788 size *= sizeof(Py_UNICODE);
15789 copy = PyMem_Malloc(size);
15790 if (copy == NULL) {
15791 PyErr_NoMemory();
15792 return NULL;
15793 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015794 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015795 return copy;
15796}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015797
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015798
Victor Stinner709d23d2019-05-02 14:56:30 -040015799static int
15800encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015801{
Victor Stinner709d23d2019-05-02 14:56:30 -040015802 int res;
15803 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15804 if (res == -2) {
15805 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15806 return -1;
15807 }
15808 if (res < 0) {
15809 PyErr_NoMemory();
15810 return -1;
15811 }
15812 return 0;
15813}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015814
Victor Stinner709d23d2019-05-02 14:56:30 -040015815
15816static int
15817config_get_codec_name(wchar_t **config_encoding)
15818{
15819 char *encoding;
15820 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15821 return -1;
15822 }
15823
15824 PyObject *name_obj = NULL;
15825 PyObject *codec = _PyCodec_Lookup(encoding);
15826 PyMem_RawFree(encoding);
15827
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015828 if (!codec)
15829 goto error;
15830
15831 name_obj = PyObject_GetAttrString(codec, "name");
15832 Py_CLEAR(codec);
15833 if (!name_obj) {
15834 goto error;
15835 }
15836
Victor Stinner709d23d2019-05-02 14:56:30 -040015837 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15838 Py_DECREF(name_obj);
15839 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015840 goto error;
15841 }
15842
Victor Stinner709d23d2019-05-02 14:56:30 -040015843 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15844 if (raw_wname == NULL) {
15845 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015846 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015847 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015848 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015849
15850 PyMem_RawFree(*config_encoding);
15851 *config_encoding = raw_wname;
15852
15853 PyMem_Free(wname);
15854 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015855
15856error:
15857 Py_XDECREF(codec);
15858 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015859 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015860}
15861
15862
Victor Stinner331a6a52019-05-27 16:39:22 +020015863static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015864init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015865{
Victor Stinner709d23d2019-05-02 14:56:30 -040015866 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015867 PyConfig *config = &tstate->interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015868 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015869 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015870 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015871 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015872 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015873}
15874
15875
Victor Stinner709d23d2019-05-02 14:56:30 -040015876static int
15877init_fs_codec(PyInterpreterState *interp)
15878{
Victor Stinner331a6a52019-05-27 16:39:22 +020015879 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015880
15881 _Py_error_handler error_handler;
15882 error_handler = get_error_handler_wide(config->filesystem_errors);
15883 if (error_handler == _Py_ERROR_UNKNOWN) {
15884 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15885 return -1;
15886 }
15887
15888 char *encoding, *errors;
15889 if (encode_wstr_utf8(config->filesystem_encoding,
15890 &encoding,
15891 "filesystem_encoding") < 0) {
15892 return -1;
15893 }
15894
15895 if (encode_wstr_utf8(config->filesystem_errors,
15896 &errors,
15897 "filesystem_errors") < 0) {
15898 PyMem_RawFree(encoding);
15899 return -1;
15900 }
15901
15902 PyMem_RawFree(interp->fs_codec.encoding);
15903 interp->fs_codec.encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015904 /* encoding has been normalized by init_fs_encoding() */
15905 interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0);
Victor Stinner709d23d2019-05-02 14:56:30 -040015906 PyMem_RawFree(interp->fs_codec.errors);
15907 interp->fs_codec.errors = errors;
15908 interp->fs_codec.error_handler = error_handler;
15909
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015910#ifdef _Py_FORCE_UTF8_FS_ENCODING
15911 assert(interp->fs_codec.utf8 == 1);
15912#endif
15913
Victor Stinner709d23d2019-05-02 14:56:30 -040015914 /* At this point, PyUnicode_EncodeFSDefault() and
15915 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15916 the C implementation of the filesystem encoding. */
15917
15918 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15919 global configuration variables. */
15920 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15921 interp->fs_codec.errors) < 0) {
15922 PyErr_NoMemory();
15923 return -1;
15924 }
15925 return 0;
15926}
15927
15928
Victor Stinner331a6a52019-05-27 16:39:22 +020015929static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015930init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015931{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015932 PyInterpreterState *interp = tstate->interp;
15933
Victor Stinner709d23d2019-05-02 14:56:30 -040015934 /* Update the filesystem encoding to the normalized Python codec name.
15935 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15936 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015937 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015938 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015939 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015940 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015941 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015942 }
15943
Victor Stinner709d23d2019-05-02 14:56:30 -040015944 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015945 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015946 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015947 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015948}
15949
15950
Victor Stinner331a6a52019-05-27 16:39:22 +020015951PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015952_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015953{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015954 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015955 if (_PyStatus_EXCEPTION(status)) {
15956 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015957 }
15958
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015959 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015960}
15961
15962
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015963static void
15964_PyUnicode_FiniEncodings(PyThreadState *tstate)
15965{
15966 PyInterpreterState *interp = tstate->interp;
15967 PyMem_RawFree(interp->fs_codec.encoding);
15968 interp->fs_codec.encoding = NULL;
15969 interp->fs_codec.utf8 = 0;
15970 PyMem_RawFree(interp->fs_codec.errors);
15971 interp->fs_codec.errors = NULL;
15972 interp->fs_codec.error_handler = _Py_ERROR_UNKNOWN;
15973}
15974
15975
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and reinitialize the filesystem codec.

   Return the result of init_fs_codec() on success.  If a string cannot be
   allocated, set MemoryError and return -1 without modifying the
   configuration. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Allocate both strings before touching the configuration, so that a
       failure leaves it unchanged. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed: release whichever succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16001
16002
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016003void
Victor Stinner3d483342019-11-22 12:27:50 +010016004_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016005{
Victor Stinner3d483342019-11-22 12:27:50 +010016006 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016007#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016008 /* Insure++ is a memory analysis tool that aids in discovering
16009 * memory leaks and other memory problems. On Python exit, the
16010 * interned string dictionaries are flagged as being in use at exit
16011 * (which it is). Under normal circumstances, this is fine because
16012 * the memory will be automatically reclaimed by the system. Under
16013 * memory debugging, it's a huge source of useless noise, so we
16014 * trade off slower shutdown for less distraction in the memory
16015 * reports. -baw
16016 */
16017 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016018#endif /* __INSURE__ */
16019
Victor Stinner3d483342019-11-22 12:27:50 +010016020 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016021
Victor Stinner3d483342019-11-22 12:27:50 +010016022 for (Py_ssize_t i = 0; i < 256; i++) {
16023 Py_CLEAR(unicode_latin1[i]);
16024 }
16025 _PyUnicode_ClearStaticStrings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016026 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016027
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016028 _PyUnicode_FiniEncodings(tstate);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016029}
16030
16031
Georg Brandl66c221e2010-10-14 07:04:07 +000016032/* A _string module, to export formatter_parser and formatter_field_name_split
16033 to the string.Formatter class implemented in Python. */
16034
16035static PyMethodDef _string_methods[] = {
16036 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16037 METH_O, PyDoc_STR("split the argument as a field name")},
16038 {"formatter_parser", (PyCFunction) formatter_parser,
16039 METH_O, PyDoc_STR("parse the argument as a format string")},
16040 {NULL, NULL}
16041};
16042
16043static struct PyModuleDef _string_module = {
16044 PyModuleDef_HEAD_INIT,
16045 "_string",
16046 PyDoc_STR("string helper module"),
16047 0,
16048 _string_methods,
16049 NULL,
16050 NULL,
16051 NULL,
16052 NULL
16053};
16054
16055PyMODINIT_FUNC
16056PyInit__string(void)
16057{
16058 return PyModule_Create(&_string_module);
16059}
16060
16061
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016062#ifdef __cplusplus
16063}
16064#endif