blob: 9d51c8a685ebed23d75e94b0f86d328658af24c0 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner45876a92020-02-12 22:32:34 +010043#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020045#include "pycore_initconfig.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010046#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020047#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040048#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010049#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000050#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070051#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000053#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000054#include <windows.h>
55#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000056
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
59/* #define INTERNED_STATS 1 */
60
61
Larry Hastings61272b72014-01-07 12:41:53 -080062/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090063class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080064[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090065/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
66
67/*[python input]
68class Py_UCS4_converter(CConverter):
69 type = 'Py_UCS4'
70 converter = 'convert_uc'
71
72 def converter_init(self):
73 if self.default is not unspecified:
74 self.c_default = ascii(self.default)
75 if len(self.c_default) > 4 or self.c_default[0] != "'":
76 self.c_default = hex(ord(self.default))
77
78[python start generated code]*/
79/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080080
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
Serhiy Storchaka05997252013-01-26 12:14:02 +020083NOTE: In the interpreter's initialization phase, some globals are currently
84 initialized dynamically as needed. In the process Unicode objects may
85 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086
87*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000089
90#ifdef __cplusplus
91extern "C" {
92#endif
93
Victor Stinner8faf8212011-12-08 22:14:11 +010094/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
95#define MAX_UNICODE 0x10ffff
96
/* Sanity check used inside assert() by the helper macros of this file.
   Release builds only test the type; debug builds additionally run the
   structural consistency check (without scanning the string content). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200102
/* Raw field accessors.  The underscore-prefixed macros without assert()
   expand to the struct member itself (no checks are performed); the
   variants containing assert() validate the object first. */

/* utf8 member of the object viewed as a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for a compact ASCII string this is the
   memory located directly after the PyASCIIObject header, otherwise it is
   the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* utf8_length member of the object viewed as a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Length of the UTF-8 buffer of a ready string: the string length itself
   for a compact ASCII string, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr member (PyASCIIObject) and wstr_length member
   (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash members of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to state.kind and to length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* data.any member of the object viewed as a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200137
/* Replace the PyUnicode_READY() definition in effect up to this point with
   one that validates the object through _PyUnicode_CHECK() first.
   Evaluates to 0 when the string is already ready, otherwise to the result
   of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     (PyUnicode_IS_READY(op)                    \
          ? 0                                   \
          : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200144
/* True if the utf8 pointer of the (non compact-ASCII) string refers to the
   very same memory as its character data. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the string refers to the very same memory as
   its character data.

   The macro argument is used consistently: the expansion must only refer
   to `op`, never to a variable that happens to be named `unicode` in the
   caller's scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* True if the Unicode object owns a separate UTF-8 memory block: the
   string is not compact ASCII, its utf8 pointer is set, and that pointer
   is not shared with the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
159
/* True if the Unicode object owns a separate wstr memory block: the wstr
   pointer is set and either the string is not ready yet or wstr is not
   shared with the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
166
/* Generic helper macro converting a run of characters from one storage
   type to another.

   from_type, to_type: valid type names of the source and destination
                       characters.
   begin, end:         pointers (convertible to "const from_type *")
                       delimiting the source characters; end is exclusive.
   to:                 pointer (convertible to "to_type *") to the buffer
                       receiving the converted characters.

   Each source character is cast to to_type.  The bulk of the input is
   processed four characters per iteration; the remaining 0..3 characters
   are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t _count = (_end) - (_iter);                           \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                     \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {         \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < (_end); ++_iter, ++_to) {                        \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200190
/* Platform-specific over-allocation factor.  Going by the percentages
   below, a factor of N stands for over-allocating by 1/N. */
#ifndef MS_WINDOWS
   /* On Linux, over-allocating by 25% works best. */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, over-allocating by 50% works best. */
#  define OVERALLOCATE_FACTOR 2
#endif
198
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0, the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference count
   of a string is: s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200207static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000208
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000209/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211
/* Take a new reference to the shared empty string, creating it on first
   use.  When the singleton has to be created, the extra Py_INCREF() leaves
   one reference with the unicode_empty global in addition to the one taken
   for the caller.  If the creation fails, unicode_empty stays NULL and no
   reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000224
/* Return the shared empty string from the enclosing function after taking
   a reference to it with _Py_INCREF_UNICODE_EMPTY().  The returned value is
   whatever unicode_empty holds afterwards, i.e. NULL if the singleton could
   not be created. */
#define _Py_RETURN_UNICODE_EMPTY()              \
    do {                                        \
        _Py_INCREF_UNICODE_EMPTY();             \
        return unicode_empty;                   \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
Victor Stinner59423e32018-11-26 13:40:01 +0100231static inline void
232unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
233 Py_ssize_t start, Py_ssize_t length)
234{
235 assert(0 <= start);
236 assert(kind != PyUnicode_WCHAR_KIND);
237 switch (kind) {
238 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100239 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100240 Py_UCS1 ch = (unsigned char)value;
241 Py_UCS1 *to = (Py_UCS1 *)data + start;
242 memset(to, ch, length);
243 break;
244 }
245 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100246 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100247 Py_UCS2 ch = (Py_UCS2)value;
248 Py_UCS2 *to = (Py_UCS2 *)data + start;
249 const Py_UCS2 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100254 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100255 Py_UCS4 ch = value;
256 Py_UCS4 * to = (Py_UCS4 *)data + start;
257 const Py_UCS4 *end = to + length;
258 for (; to < end; ++to) *to = ch;
259 break;
260 }
261 default: Py_UNREACHABLE();
262 }
263}
264
265
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200266/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700267static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200268_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900269static inline void
270_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400271static PyObject *
272unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
273 const char *errors);
274static PyObject *
275unicode_decode_utf8(const char *s, Py_ssize_t size,
276 _Py_error_handler error_handler, const char *errors,
277 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200278
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200279/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200280static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200281
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000282/* Single character Unicode strings in the Latin-1 range are being
283 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200284static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285
/* Fast detection of the most frequent whitespace characters: lookup table
   indexed by a 7-bit code (0x00..0x7F); an entry is 1 for whitespace and
   0 otherwise.  Sixteen entries per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
316
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200317/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200318static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200319static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100320static int unicode_modifiable(PyObject *unicode);
321
Victor Stinnerfe226c02011-10-03 03:52:20 +0200322
Alexander Belopolsky40018472011-02-26 01:02:56 +0000323static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100324_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200325static PyObject *
326_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
327static PyObject *
328_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
329
330static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000331unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000332 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100333 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000334 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
335
Alexander Belopolsky40018472011-02-26 01:02:56 +0000336static void
337raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300338 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100339 PyObject *unicode,
340 Py_ssize_t startpos, Py_ssize_t endpos,
341 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000342
/* Fast detection of the most frequent line break characters: lookup table
   indexed by a 7-bit code (0x00..0x7F); an entry is 1 for a line break
   and 0 otherwise.  Sixteen entries per row. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
370
INADA Naoki3ae20562017-01-16 20:41:20 +0900371static int convert_uc(PyObject *obj, void *addr);
372
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300373#include "clinic/unicodeobject.c.h"
374
Victor Stinner3d4226a2018-08-29 22:21:32 +0200375_Py_error_handler
376_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200377{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200379 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200385 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
387 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200388 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200391 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200394 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
396 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200397 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200398 }
Victor Stinner50149202015-09-22 00:26:54 +0200399 return _Py_ERROR_OTHER;
400}
401
Victor Stinner709d23d2019-05-02 14:56:30 -0400402
403static _Py_error_handler
404get_error_handler_wide(const wchar_t *errors)
405{
406 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
407 return _Py_ERROR_STRICT;
408 }
409 if (wcscmp(errors, L"surrogateescape") == 0) {
410 return _Py_ERROR_SURROGATEESCAPE;
411 }
412 if (wcscmp(errors, L"replace") == 0) {
413 return _Py_ERROR_REPLACE;
414 }
415 if (wcscmp(errors, L"ignore") == 0) {
416 return _Py_ERROR_IGNORE;
417 }
418 if (wcscmp(errors, L"backslashreplace") == 0) {
419 return _Py_ERROR_BACKSLASHREPLACE;
420 }
421 if (wcscmp(errors, L"surrogatepass") == 0) {
422 return _Py_ERROR_SURROGATEPASS;
423 }
424 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
425 return _Py_ERROR_XMLCHARREFREPLACE;
426 }
427 return _Py_ERROR_OTHER;
428}
429
430
Victor Stinner22eb6892019-06-26 00:51:05 +0200431static inline int
432unicode_check_encoding_errors(const char *encoding, const char *errors)
433{
434 if (encoding == NULL && errors == NULL) {
435 return 0;
436 }
437
438 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
439#ifndef Py_DEBUG
440 /* In release mode, only check in development mode (-X dev) */
441 if (!interp->config.dev_mode) {
442 return 0;
443 }
444#else
445 /* Always check in debug mode */
446#endif
447
448 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
449 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
450 if (!interp->fs_codec.encoding) {
451 return 0;
452 }
453
454 if (encoding != NULL) {
455 PyObject *handler = _PyCodec_Lookup(encoding);
456 if (handler == NULL) {
457 return -1;
458 }
459 Py_DECREF(handler);
460 }
461
462 if (errors != NULL) {
463 PyObject *handler = PyCodec_LookupError(errors);
464 if (handler == NULL) {
465 return -1;
466 }
467 Py_DECREF(handler);
468 }
469 return 0;
470}
471
472
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300473/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
474 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000475Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000476PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000477{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000478#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000479 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000480#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000481 /* This is actually an illegal character, so it should
482 not be passed to unichr. */
483 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000484#endif
485}
486
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200487int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100488_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200489{
Victor Stinner68762572019-10-07 18:42:01 +0200490#define CHECK(expr) \
491 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
492
Victor Stinner910337b2011-10-03 03:20:16 +0200493 PyASCIIObject *ascii;
494 unsigned int kind;
495
Victor Stinner68762572019-10-07 18:42:01 +0200496 assert(op != NULL);
497 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200498
499 ascii = (PyASCIIObject *)op;
500 kind = ascii->state.kind;
501
Victor Stinnera3b334d2011-10-03 13:53:37 +0200502 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200503 CHECK(kind == PyUnicode_1BYTE_KIND);
504 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200505 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200506 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200507 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200508 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200509
Victor Stinnera41463c2011-10-04 01:05:08 +0200510 if (ascii->state.compact == 1) {
511 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200512 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200513 || kind == PyUnicode_2BYTE_KIND
514 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200515 CHECK(ascii->state.ascii == 0);
516 CHECK(ascii->state.ready == 1);
517 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100518 }
519 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200520 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
521
522 data = unicode->data.any;
523 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200524 CHECK(ascii->length == 0);
525 CHECK(ascii->hash == -1);
526 CHECK(ascii->state.compact == 0);
527 CHECK(ascii->state.ascii == 0);
528 CHECK(ascii->state.ready == 0);
529 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
530 CHECK(ascii->wstr != NULL);
531 CHECK(data == NULL);
532 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 }
534 else {
Victor Stinner68762572019-10-07 18:42:01 +0200535 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200536 || kind == PyUnicode_2BYTE_KIND
537 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200538 CHECK(ascii->state.compact == 0);
539 CHECK(ascii->state.ready == 1);
540 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200541 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200542 CHECK(compact->utf8 == data);
543 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200544 }
545 else
Victor Stinner68762572019-10-07 18:42:01 +0200546 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200547 }
548 }
549 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200550 if (
551#if SIZEOF_WCHAR_T == 2
552 kind == PyUnicode_2BYTE_KIND
553#else
554 kind == PyUnicode_4BYTE_KIND
555#endif
556 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 {
Victor Stinner68762572019-10-07 18:42:01 +0200558 CHECK(ascii->wstr == data);
559 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 } else
Victor Stinner68762572019-10-07 18:42:01 +0200561 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200562 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200563
564 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200565 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200566 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200567 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200568 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200569
570 /* check that the best kind is used: O(n) operation */
571 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200572 Py_ssize_t i;
573 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200574 void *data;
575 Py_UCS4 ch;
576
577 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200578 for (i=0; i < ascii->length; i++)
579 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200580 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200581 if (ch > maxchar)
582 maxchar = ch;
583 }
584 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100585 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200586 CHECK(maxchar >= 128);
587 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100588 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200589 else
Victor Stinner68762572019-10-07 18:42:01 +0200590 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 }
Victor Stinner77faf692011-11-20 18:56:05 +0100592 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200593 CHECK(maxchar >= 0x100);
594 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100595 }
596 else {
Victor Stinner68762572019-10-07 18:42:01 +0200597 CHECK(maxchar >= 0x10000);
598 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100599 }
Victor Stinner68762572019-10-07 18:42:01 +0200600 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200601 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400602 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200603
604#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400605}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200606
Victor Stinner910337b2011-10-03 03:20:16 +0200607
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100608static PyObject*
609unicode_result_wchar(PyObject *unicode)
610{
611#ifndef Py_DEBUG
612 Py_ssize_t len;
613
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100614 len = _PyUnicode_WSTR_LENGTH(unicode);
615 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100616 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200617 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100618 }
619
620 if (len == 1) {
621 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100622 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100623 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
624 Py_DECREF(unicode);
625 return latin1_char;
626 }
627 }
628
629 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200630 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 return NULL;
632 }
633#else
Victor Stinneraa771272012-10-04 02:32:58 +0200634 assert(Py_REFCNT(unicode) == 1);
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 /* don't make the result ready in debug mode to ensure that the caller
637 makes the string ready before using it */
638 assert(_PyUnicode_CheckConsistency(unicode, 1));
639#endif
640 return unicode;
641}
642
643static PyObject*
644unicode_result_ready(PyObject *unicode)
645{
646 Py_ssize_t length;
647
648 length = PyUnicode_GET_LENGTH(unicode);
649 if (length == 0) {
650 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100651 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200652 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100653 }
654 return unicode_empty;
655 }
656
657 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200658 void *data = PyUnicode_DATA(unicode);
659 int kind = PyUnicode_KIND(unicode);
660 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100661 if (ch < 256) {
662 PyObject *latin1_char = unicode_latin1[ch];
663 if (latin1_char != NULL) {
664 if (unicode != latin1_char) {
665 Py_INCREF(latin1_char);
666 Py_DECREF(unicode);
667 }
668 return latin1_char;
669 }
670 else {
671 assert(_PyUnicode_CheckConsistency(unicode, 1));
672 Py_INCREF(unicode);
673 unicode_latin1[ch] = unicode;
674 return unicode;
675 }
676 }
677 }
678
679 assert(_PyUnicode_CheckConsistency(unicode, 1));
680 return unicode;
681}
682
683static PyObject*
684unicode_result(PyObject *unicode)
685{
686 assert(_PyUnicode_CHECK(unicode));
687 if (PyUnicode_IS_READY(unicode))
688 return unicode_result_ready(unicode);
689 else
690 return unicode_result_wchar(unicode);
691}
692
Victor Stinnerc4b49542011-12-11 22:44:26 +0100693static PyObject*
694unicode_result_unchanged(PyObject *unicode)
695{
696 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500697 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698 return NULL;
699 Py_INCREF(unicode);
700 return unicode;
701 }
702 else
703 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100704 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100705}
706
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200707/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
708 ASCII, Latin1, UTF-8, etc. */
709static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200710backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200711 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
712{
Victor Stinnerad771582015-10-09 12:38:53 +0200713 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200714 Py_UCS4 ch;
715 enum PyUnicode_Kind kind;
716 void *data;
717
718 assert(PyUnicode_IS_READY(unicode));
719 kind = PyUnicode_KIND(unicode);
720 data = PyUnicode_DATA(unicode);
721
722 size = 0;
723 /* determine replacement size */
724 for (i = collstart; i < collend; ++i) {
725 Py_ssize_t incr;
726
727 ch = PyUnicode_READ(kind, data, i);
728 if (ch < 0x100)
729 incr = 2+2;
730 else if (ch < 0x10000)
731 incr = 2+4;
732 else {
733 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200734 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 }
736 if (size > PY_SSIZE_T_MAX - incr) {
737 PyErr_SetString(PyExc_OverflowError,
738 "encoded result is too long for a Python string");
739 return NULL;
740 }
741 size += incr;
742 }
743
Victor Stinnerad771582015-10-09 12:38:53 +0200744 str = _PyBytesWriter_Prepare(writer, str, size);
745 if (str == NULL)
746 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200747
748 /* generate replacement */
749 for (i = collstart; i < collend; ++i) {
750 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200751 *str++ = '\\';
752 if (ch >= 0x00010000) {
753 *str++ = 'U';
754 *str++ = Py_hexdigits[(ch>>28)&0xf];
755 *str++ = Py_hexdigits[(ch>>24)&0xf];
756 *str++ = Py_hexdigits[(ch>>20)&0xf];
757 *str++ = Py_hexdigits[(ch>>16)&0xf];
758 *str++ = Py_hexdigits[(ch>>12)&0xf];
759 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200760 }
Victor Stinner797485e2015-10-09 03:17:30 +0200761 else if (ch >= 0x100) {
762 *str++ = 'u';
763 *str++ = Py_hexdigits[(ch>>12)&0xf];
764 *str++ = Py_hexdigits[(ch>>8)&0xf];
765 }
766 else
767 *str++ = 'x';
768 *str++ = Py_hexdigits[(ch>>4)&0xf];
769 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200770 }
771 return str;
772}
773
774/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
775 ASCII, Latin1, UTF-8, etc. */
776static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200777xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200778 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
779{
Victor Stinnerad771582015-10-09 12:38:53 +0200780 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200781 Py_UCS4 ch;
782 enum PyUnicode_Kind kind;
783 void *data;
784
785 assert(PyUnicode_IS_READY(unicode));
786 kind = PyUnicode_KIND(unicode);
787 data = PyUnicode_DATA(unicode);
788
789 size = 0;
790 /* determine replacement size */
791 for (i = collstart; i < collend; ++i) {
792 Py_ssize_t incr;
793
794 ch = PyUnicode_READ(kind, data, i);
795 if (ch < 10)
796 incr = 2+1+1;
797 else if (ch < 100)
798 incr = 2+2+1;
799 else if (ch < 1000)
800 incr = 2+3+1;
801 else if (ch < 10000)
802 incr = 2+4+1;
803 else if (ch < 100000)
804 incr = 2+5+1;
805 else if (ch < 1000000)
806 incr = 2+6+1;
807 else {
808 assert(ch <= MAX_UNICODE);
809 incr = 2+7+1;
810 }
811 if (size > PY_SSIZE_T_MAX - incr) {
812 PyErr_SetString(PyExc_OverflowError,
813 "encoded result is too long for a Python string");
814 return NULL;
815 }
816 size += incr;
817 }
818
Victor Stinnerad771582015-10-09 12:38:53 +0200819 str = _PyBytesWriter_Prepare(writer, str, size);
820 if (str == NULL)
821 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200822
823 /* generate replacement */
824 for (i = collstart; i < collend; ++i) {
825 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
826 }
827 return str;
828}
829
Thomas Wouters477c8d52006-05-27 19:21:47 +0000830/* --- Bloom Filters ----------------------------------------------------- */
831
832/* stuff to implement simple "bloom filters" for Unicode characters.
833 to keep things simple, we use a single bitmask, using the least 5
834 bits from each unicode characters as the bit index. */
835
836/* the linebreak mask is set up by Unicode_Init below */
837
Antoine Pitrouf068f942010-01-13 14:19:12 +0000838#if LONG_BIT >= 128
839#define BLOOM_WIDTH 128
840#elif LONG_BIT >= 64
841#define BLOOM_WIDTH 64
842#elif LONG_BIT >= 32
843#define BLOOM_WIDTH 32
844#else
845#error "LONG_BIT is smaller than 32"
846#endif
847
Thomas Wouters477c8d52006-05-27 19:21:47 +0000848#define BLOOM_MASK unsigned long
849
Serhiy Storchaka05997252013-01-26 12:14:02 +0200850static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000851
Antoine Pitrouf068f942010-01-13 14:19:12 +0000852#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000853
Benjamin Peterson29060642009-01-31 22:14:21 +0000854#define BLOOM_LINEBREAK(ch) \
855 ((ch) < 128U ? ascii_linebreak[(ch)] : \
856 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000857
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700858static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000860{
Victor Stinnera85af502013-04-09 21:53:54 +0200861#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
862 do { \
863 TYPE *data = (TYPE *)PTR; \
864 TYPE *end = data + LEN; \
865 Py_UCS4 ch; \
866 for (; data != end; data++) { \
867 ch = *data; \
868 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
869 } \
870 break; \
871 } while (0)
872
Thomas Wouters477c8d52006-05-27 19:21:47 +0000873 /* calculate simple bloom-style bitmask for a given unicode string */
874
Antoine Pitrouf068f942010-01-13 14:19:12 +0000875 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000876
877 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200878 switch (kind) {
879 case PyUnicode_1BYTE_KIND:
880 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
881 break;
882 case PyUnicode_2BYTE_KIND:
883 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
884 break;
885 case PyUnicode_4BYTE_KIND:
886 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
887 break;
888 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700889 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200890 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000891 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200892
893#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000894}
895
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300896static int
897ensure_unicode(PyObject *obj)
898{
899 if (!PyUnicode_Check(obj)) {
900 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200901 "must be str, not %.100s",
902 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300903 return -1;
904 }
905 return PyUnicode_READY(obj);
906}
907
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200908/* Compilation of templated routines */
909
910#include "stringlib/asciilib.h"
911#include "stringlib/fastsearch.h"
912#include "stringlib/partition.h"
913#include "stringlib/split.h"
914#include "stringlib/count.h"
915#include "stringlib/find.h"
916#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200917#include "stringlib/undef.h"
918
919#include "stringlib/ucs1lib.h"
920#include "stringlib/fastsearch.h"
921#include "stringlib/partition.h"
922#include "stringlib/split.h"
923#include "stringlib/count.h"
924#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300925#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200926#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200927#include "stringlib/undef.h"
928
929#include "stringlib/ucs2lib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300935#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200937#include "stringlib/undef.h"
938
939#include "stringlib/ucs4lib.h"
940#include "stringlib/fastsearch.h"
941#include "stringlib/partition.h"
942#include "stringlib/split.h"
943#include "stringlib/count.h"
944#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300945#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200947#include "stringlib/undef.h"
948
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200949#include "stringlib/unicodedefs.h"
950#include "stringlib/fastsearch.h"
951#include "stringlib/count.h"
952#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100953#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200954
Guido van Rossumd57fd912000-03-10 22:53:23 +0000955/* --- Unicode Object ----------------------------------------------------- */
956
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700957static inline Py_ssize_t
958findchar(const void *s, int kind,
959 Py_ssize_t size, Py_UCS4 ch,
960 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200962 switch (kind) {
963 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200964 if ((Py_UCS1) ch != ch)
965 return -1;
966 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600967 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200968 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600969 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200971 if ((Py_UCS2) ch != ch)
972 return -1;
973 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600974 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200975 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600976 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200977 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200978 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600979 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200980 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600981 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200982 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700983 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985}
986
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Only the characters at indexes [old_length, length) are overwritten, where
   `length` is the current length of `unicode`; nothing is done when the
   string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* `kind` is used below as the number of bytes per character. */
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
    if (length <= old_length)
        return;
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
}
#endif
1005
Victor Stinnerfe226c02011-10-03 03:52:20 +02001006static PyObject*
1007resize_compact(PyObject *unicode, Py_ssize_t length)
1008{
1009 Py_ssize_t char_size;
1010 Py_ssize_t struct_size;
1011 Py_ssize_t new_size;
1012 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001013 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001014#ifdef Py_DEBUG
1015 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1016#endif
1017
Victor Stinner79891572012-05-03 13:43:07 +02001018 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001020 assert(PyUnicode_IS_COMPACT(unicode));
1021
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001022 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001023 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001024 struct_size = sizeof(PyASCIIObject);
1025 else
1026 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001027 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1030 PyErr_NoMemory();
1031 return NULL;
1032 }
1033 new_size = (struct_size + (length + 1) * char_size);
1034
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001035 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1036 PyObject_DEL(_PyUnicode_UTF8(unicode));
1037 _PyUnicode_UTF8(unicode) = NULL;
1038 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1039 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001040#ifdef Py_REF_DEBUG
1041 _Py_RefTotal--;
1042#endif
1043#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001044 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001045#endif
Victor Stinner84def372011-12-11 20:04:56 +01001046
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001047 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001048 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001049 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001050 PyErr_NoMemory();
1051 return NULL;
1052 }
Victor Stinner84def372011-12-11 20:04:56 +01001053 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001055
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001057 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001058 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001059 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001060 _PyUnicode_WSTR_LENGTH(unicode) = length;
1061 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001062 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1063 PyObject_DEL(_PyUnicode_WSTR(unicode));
1064 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001065 if (!PyUnicode_IS_ASCII(unicode))
1066 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001067 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001068#ifdef Py_DEBUG
1069 unicode_fill_invalid(unicode, old_length);
1070#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001071 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1072 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001073 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 return unicode;
1075}
1076
Alexander Belopolsky40018472011-02-26 01:02:56 +00001077static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001078resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079{
Victor Stinner95663112011-10-04 01:03:50 +02001080 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001081 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001083 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001084
Victor Stinnerfe226c02011-10-03 03:52:20 +02001085 if (PyUnicode_IS_READY(unicode)) {
1086 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001087 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001088 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001089#ifdef Py_DEBUG
1090 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1091#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001092
1093 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001094 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001095 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1096 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001097
1098 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1099 PyErr_NoMemory();
1100 return -1;
1101 }
1102 new_size = (length + 1) * char_size;
1103
Victor Stinner7a9105a2011-12-12 00:13:42 +01001104 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1105 {
1106 PyObject_DEL(_PyUnicode_UTF8(unicode));
1107 _PyUnicode_UTF8(unicode) = NULL;
1108 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1109 }
1110
Victor Stinnerfe226c02011-10-03 03:52:20 +02001111 data = (PyObject *)PyObject_REALLOC(data, new_size);
1112 if (data == NULL) {
1113 PyErr_NoMemory();
1114 return -1;
1115 }
1116 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001117 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001118 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 _PyUnicode_WSTR_LENGTH(unicode) = length;
1120 }
1121 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001122 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001123 _PyUnicode_UTF8_LENGTH(unicode) = length;
1124 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001125 _PyUnicode_LENGTH(unicode) = length;
1126 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001127#ifdef Py_DEBUG
1128 unicode_fill_invalid(unicode, old_length);
1129#endif
Victor Stinner95663112011-10-04 01:03:50 +02001130 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001131 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001132 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001133 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001134 }
Victor Stinner95663112011-10-04 01:03:50 +02001135 assert(_PyUnicode_WSTR(unicode) != NULL);
1136
1137 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001138 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001139 PyErr_NoMemory();
1140 return -1;
1141 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001142 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001143 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001144 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001145 if (!wstr) {
1146 PyErr_NoMemory();
1147 return -1;
1148 }
1149 _PyUnicode_WSTR(unicode) = wstr;
1150 _PyUnicode_WSTR(unicode)[length] = 0;
1151 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001152 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 return 0;
1154}
1155
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156static PyObject*
1157resize_copy(PyObject *unicode, Py_ssize_t length)
1158{
1159 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001160 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001161 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001162
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001163 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001164
1165 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1166 if (copy == NULL)
1167 return NULL;
1168
1169 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001170 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001171 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001172 }
1173 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001174 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001175
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001176 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001177 if (w == NULL)
1178 return NULL;
1179 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1180 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001181 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001182 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001183 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001184 }
1185}
1186
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001188 Ux0000 terminated; some code (e.g. new_identifier)
1189 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
1191 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001192 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193
1194*/
1195
Alexander Belopolsky40018472011-02-26 01:02:56 +00001196static PyUnicodeObject *
1197_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001199 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
Thomas Wouters477c8d52006-05-27 19:21:47 +00001202 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 if (length == 0 && unicode_empty != NULL) {
1204 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001205 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206 }
1207
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001208 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001209 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001210 return (PyUnicodeObject *)PyErr_NoMemory();
1211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 if (length < 0) {
1213 PyErr_SetString(PyExc_SystemError,
1214 "Negative size passed to _PyUnicode_New");
1215 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 }
1217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1219 if (unicode == NULL)
1220 return NULL;
1221 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001222
1223 _PyUnicode_WSTR_LENGTH(unicode) = length;
1224 _PyUnicode_HASH(unicode) = -1;
1225 _PyUnicode_STATE(unicode).interned = 0;
1226 _PyUnicode_STATE(unicode).kind = 0;
1227 _PyUnicode_STATE(unicode).compact = 0;
1228 _PyUnicode_STATE(unicode).ready = 0;
1229 _PyUnicode_STATE(unicode).ascii = 0;
1230 _PyUnicode_DATA_ANY(unicode) = NULL;
1231 _PyUnicode_LENGTH(unicode) = 0;
1232 _PyUnicode_UTF8(unicode) = NULL;
1233 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1236 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001237 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001238 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001239 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001240 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241
Jeremy Hyltond8082792003-09-16 19:41:39 +00001242 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001243 * the caller fails before initializing str -- unicode_resize()
1244 * reads str[0], and the Keep-Alive optimization can keep memory
1245 * allocated for str alive across a call to unicode_dealloc(unicode).
1246 * We don't want unicode_resize to read uninitialized memory in
1247 * that case.
1248 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249 _PyUnicode_WSTR(unicode)[0] = 0;
1250 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001251
Victor Stinner7931d9a2011-11-04 00:22:48 +01001252 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 return unicode;
1254}
1255
Victor Stinnerf42dc442011-10-02 23:33:16 +02001256static const char*
1257unicode_kind_name(PyObject *unicode)
1258{
Victor Stinner42dfd712011-10-03 14:41:45 +02001259 /* don't check consistency: unicode_kind_name() is called from
1260 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001261 if (!PyUnicode_IS_COMPACT(unicode))
1262 {
1263 if (!PyUnicode_IS_READY(unicode))
1264 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001265 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001266 {
1267 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001268 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001269 return "legacy ascii";
1270 else
1271 return "legacy latin1";
1272 case PyUnicode_2BYTE_KIND:
1273 return "legacy UCS2";
1274 case PyUnicode_4BYTE_KIND:
1275 return "legacy UCS4";
1276 default:
1277 return "<legacy invalid kind>";
1278 }
1279 }
1280 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001281 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001282 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001283 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 return "ascii";
1285 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001286 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001287 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001288 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001290 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001291 default:
1292 return "<invalid compact kind>";
1293 }
1294}
1295
#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Return PyUnicode_UTF8() of the object at `unicode_raw`. */
char *_PyUnicode_utf8(void *unicode_raw){
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    return PyUnicode_UTF8(unicode);
}

/* Return _PyUnicode_COMPACT_DATA() of the object at `unicode_raw`. */
void *_PyUnicode_compact_data(void *unicode_raw) {
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact flags and the candidate data
   addresses to stdout, then return PyUnicode_DATA() of the object. */
void *_PyUnicode_data(void *unicode_raw) {
    PyObject *unicode = _PyObject_CAST(unicode_raw);
    printf("obj %p\n", (void*)unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of `op` to stdout: representation name, length,
   the wstr pointer (flagged "shared" when it equals the data pointer), and,
   for anything other than compact ASCII, the wstr length and the UTF-8
   pointer and length, followed by the data pointer. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Compact objects store their characters right after the header
       struct; others keep a separate pointer. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
           unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* The fields read through `compact` below are skipped for compact
       ASCII objects. */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
               (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
1352
1353PyObject *
1354PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1355{
1356 PyObject *obj;
1357 PyCompactUnicodeObject *unicode;
1358 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001359 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001360 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361 Py_ssize_t char_size;
1362 Py_ssize_t struct_size;
1363
1364 /* Optimization for empty strings */
1365 if (size == 0 && unicode_empty != NULL) {
1366 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001367 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 }
1369
Victor Stinner9e9d6892011-10-04 01:02:02 +02001370 is_ascii = 0;
1371 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 struct_size = sizeof(PyCompactUnicodeObject);
1373 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001374 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 char_size = 1;
1376 is_ascii = 1;
1377 struct_size = sizeof(PyASCIIObject);
1378 }
1379 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001380 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 char_size = 1;
1382 }
1383 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001384 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 char_size = 2;
1386 if (sizeof(wchar_t) == 2)
1387 is_sharing = 1;
1388 }
1389 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001390 if (maxchar > MAX_UNICODE) {
1391 PyErr_SetString(PyExc_SystemError,
1392 "invalid maximum character passed to PyUnicode_New");
1393 return NULL;
1394 }
Victor Stinner8f825062012-04-27 13:55:39 +02001395 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 char_size = 4;
1397 if (sizeof(wchar_t) == 4)
1398 is_sharing = 1;
1399 }
1400
1401 /* Ensure we won't overflow the size. */
1402 if (size < 0) {
1403 PyErr_SetString(PyExc_SystemError,
1404 "Negative size passed to PyUnicode_New");
1405 return NULL;
1406 }
1407 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1408 return PyErr_NoMemory();
1409
1410 /* Duplicated allocation code from _PyObject_New() instead of a call to
1411 * PyObject_New() so we are able to allocate space for the object and
1412 * it's data buffer.
1413 */
1414 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1415 if (obj == NULL)
1416 return PyErr_NoMemory();
1417 obj = PyObject_INIT(obj, &PyUnicode_Type);
1418 if (obj == NULL)
1419 return NULL;
1420
1421 unicode = (PyCompactUnicodeObject *)obj;
1422 if (is_ascii)
1423 data = ((PyASCIIObject*)obj) + 1;
1424 else
1425 data = unicode + 1;
1426 _PyUnicode_LENGTH(unicode) = size;
1427 _PyUnicode_HASH(unicode) = -1;
1428 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001429 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 _PyUnicode_STATE(unicode).compact = 1;
1431 _PyUnicode_STATE(unicode).ready = 1;
1432 _PyUnicode_STATE(unicode).ascii = is_ascii;
1433 if (is_ascii) {
1434 ((char*)data)[size] = 0;
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 }
Victor Stinner8f825062012-04-27 13:55:39 +02001437 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 ((char*)data)[size] = 0;
1439 _PyUnicode_WSTR(unicode) = NULL;
1440 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001442 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 else {
1445 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001446 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001447 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001449 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 ((Py_UCS4*)data)[size] = 0;
1451 if (is_sharing) {
1452 _PyUnicode_WSTR_LENGTH(unicode) = size;
1453 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1454 }
1455 else {
1456 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1457 _PyUnicode_WSTR(unicode) = NULL;
1458 }
1459 }
Victor Stinner8f825062012-04-27 13:55:39 +02001460#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001461 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001462#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001463 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 return obj;
1465}
1466
1467#if SIZEOF_WCHAR_T == 2
1468/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1469 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001470 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471
1472 This function assumes that unicode can hold one more code point than wstr
1473 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001474static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001476 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477{
1478 const wchar_t *iter;
1479 Py_UCS4 *ucs4_out;
1480
Victor Stinner910337b2011-10-03 03:20:16 +02001481 assert(unicode != NULL);
1482 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1484 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1485
1486 for (iter = begin; iter < end; ) {
1487 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1488 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001489 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1490 && (iter+1) < end
1491 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 {
Victor Stinner551ac952011-11-29 22:58:13 +01001493 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494 iter += 2;
1495 }
1496 else {
1497 *ucs4_out++ = *iter;
1498 iter++;
1499 }
1500 }
1501 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1502 _PyUnicode_GET_LENGTH(unicode)));
1503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504}
1505#endif
1506
Victor Stinnercd9950f2011-10-02 00:34:53 +02001507static int
Victor Stinner488fa492011-12-12 00:01:39 +01001508unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001509{
Victor Stinner488fa492011-12-12 00:01:39 +01001510 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001511 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001512 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001513 return -1;
1514 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001515 return 0;
1516}
1517
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001518static int
1519_copy_characters(PyObject *to, Py_ssize_t to_start,
1520 PyObject *from, Py_ssize_t from_start,
1521 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001523 unsigned int from_kind, to_kind;
1524 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525
Victor Stinneree4544c2012-05-09 22:24:08 +02001526 assert(0 <= how_many);
1527 assert(0 <= from_start);
1528 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001529 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001530 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001531 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532
Victor Stinnerd3f08822012-05-29 12:57:52 +02001533 assert(PyUnicode_Check(to));
1534 assert(PyUnicode_IS_READY(to));
1535 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1536
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001537 if (how_many == 0)
1538 return 0;
1539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001541 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001543 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544
Victor Stinnerf1852262012-06-16 16:38:26 +02001545#ifdef Py_DEBUG
1546 if (!check_maxchar
1547 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1548 {
1549 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1550 Py_UCS4 ch;
1551 Py_ssize_t i;
1552 for (i=0; i < how_many; i++) {
1553 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1554 assert(ch <= to_maxchar);
1555 }
1556 }
1557#endif
1558
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001559 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001560 if (check_maxchar
1561 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1562 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001563 /* Writing Latin-1 characters into an ASCII string requires to
1564 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001565 Py_UCS4 max_char;
1566 max_char = ucs1lib_find_max_char(from_data,
1567 (Py_UCS1*)from_data + how_many);
1568 if (max_char >= 128)
1569 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001570 }
Christian Heimesf051e432016-09-13 20:22:02 +02001571 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001572 (char*)from_data + from_kind * from_start,
1573 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001575 else if (from_kind == PyUnicode_1BYTE_KIND
1576 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001577 {
1578 _PyUnicode_CONVERT_BYTES(
1579 Py_UCS1, Py_UCS2,
1580 PyUnicode_1BYTE_DATA(from) + from_start,
1581 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1582 PyUnicode_2BYTE_DATA(to) + to_start
1583 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001584 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001585 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001586 && to_kind == PyUnicode_4BYTE_KIND)
1587 {
1588 _PyUnicode_CONVERT_BYTES(
1589 Py_UCS1, Py_UCS4,
1590 PyUnicode_1BYTE_DATA(from) + from_start,
1591 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1592 PyUnicode_4BYTE_DATA(to) + to_start
1593 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001594 }
1595 else if (from_kind == PyUnicode_2BYTE_KIND
1596 && to_kind == PyUnicode_4BYTE_KIND)
1597 {
1598 _PyUnicode_CONVERT_BYTES(
1599 Py_UCS2, Py_UCS4,
1600 PyUnicode_2BYTE_DATA(from) + from_start,
1601 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1602 PyUnicode_4BYTE_DATA(to) + to_start
1603 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001604 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001605 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001606 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1607
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001608 if (!check_maxchar) {
1609 if (from_kind == PyUnicode_2BYTE_KIND
1610 && to_kind == PyUnicode_1BYTE_KIND)
1611 {
1612 _PyUnicode_CONVERT_BYTES(
1613 Py_UCS2, Py_UCS1,
1614 PyUnicode_2BYTE_DATA(from) + from_start,
1615 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1616 PyUnicode_1BYTE_DATA(to) + to_start
1617 );
1618 }
1619 else if (from_kind == PyUnicode_4BYTE_KIND
1620 && to_kind == PyUnicode_1BYTE_KIND)
1621 {
1622 _PyUnicode_CONVERT_BYTES(
1623 Py_UCS4, Py_UCS1,
1624 PyUnicode_4BYTE_DATA(from) + from_start,
1625 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1626 PyUnicode_1BYTE_DATA(to) + to_start
1627 );
1628 }
1629 else if (from_kind == PyUnicode_4BYTE_KIND
1630 && to_kind == PyUnicode_2BYTE_KIND)
1631 {
1632 _PyUnicode_CONVERT_BYTES(
1633 Py_UCS4, Py_UCS2,
1634 PyUnicode_4BYTE_DATA(from) + from_start,
1635 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1636 PyUnicode_2BYTE_DATA(to) + to_start
1637 );
1638 }
1639 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001640 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001641 }
1642 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001643 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001644 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001645 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001646 Py_ssize_t i;
1647
Victor Stinnera0702ab2011-09-29 14:14:38 +02001648 for (i=0; i < how_many; i++) {
1649 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001650 if (ch > to_maxchar)
1651 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001652 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1653 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001654 }
1655 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001656 return 0;
1657}
1658
Victor Stinnerd3f08822012-05-29 12:57:52 +02001659void
1660_PyUnicode_FastCopyCharacters(
1661 PyObject *to, Py_ssize_t to_start,
1662 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001663{
1664 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1665}
1666
1667Py_ssize_t
1668PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1669 PyObject *from, Py_ssize_t from_start,
1670 Py_ssize_t how_many)
1671{
1672 int err;
1673
1674 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1675 PyErr_BadInternalCall();
1676 return -1;
1677 }
1678
Benjamin Petersonbac79492012-01-14 13:34:47 -05001679 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001680 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001681 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001682 return -1;
1683
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001684 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001685 PyErr_SetString(PyExc_IndexError, "string index out of range");
1686 return -1;
1687 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001688 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001689 PyErr_SetString(PyExc_IndexError, "string index out of range");
1690 return -1;
1691 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001692 if (how_many < 0) {
1693 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1694 return -1;
1695 }
1696 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1698 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001699 "Cannot write %zi characters at %zi "
1700 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001701 how_many, to_start, PyUnicode_GET_LENGTH(to));
1702 return -1;
1703 }
1704
1705 if (how_many == 0)
1706 return 0;
1707
Victor Stinner488fa492011-12-12 00:01:39 +01001708 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001709 return -1;
1710
1711 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1712 if (err) {
1713 PyErr_Format(PyExc_SystemError,
1714 "Cannot copy %s characters "
1715 "into a string of %s characters",
1716 unicode_kind_name(from),
1717 unicode_kind_name(to));
1718 return -1;
1719 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001720 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721}
1722
Victor Stinner17222162011-09-28 22:15:37 +02001723/* Find the maximum code point and count the number of surrogate pairs so a
1724 correct string length can be computed before converting a string to UCS4.
1725 This function counts single surrogates as a character and not as a pair.
1726
1727 Return 0 on success, or -1 on error. */
1728static int
1729find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1730 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731{
1732 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001733 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734
Victor Stinnerc53be962011-10-02 21:33:54 +02001735 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 *num_surrogates = 0;
1737 *maxchar = 0;
1738
1739 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001741 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1742 && (iter+1) < end
1743 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1744 {
1745 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1746 ++(*num_surrogates);
1747 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 }
1749 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001751 {
1752 ch = *iter;
1753 iter++;
1754 }
1755 if (ch > *maxchar) {
1756 *maxchar = ch;
1757 if (*maxchar > MAX_UNICODE) {
1758 PyErr_Format(PyExc_ValueError,
1759 "character U+%x is not in range [U+0000; U+10ffff]",
1760 ch);
1761 return -1;
1762 }
1763 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 }
1765 return 0;
1766}
1767
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001768int
1769_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770{
1771 wchar_t *end;
1772 Py_UCS4 maxchar = 0;
1773 Py_ssize_t num_surrogates;
1774#if SIZEOF_WCHAR_T == 2
1775 Py_ssize_t length_wo_surrogates;
1776#endif
1777
Georg Brandl7597add2011-10-05 16:36:47 +02001778 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001779 strings were created using _PyObject_New() and where no canonical
1780 representation (the str field) has been set yet aka strings
1781 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001782 assert(_PyUnicode_CHECK(unicode));
1783 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001785 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001786 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001787 /* Actually, it should neither be interned nor be anything else: */
1788 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001791 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001792 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794
1795 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001796 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1797 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 PyErr_NoMemory();
1799 return -1;
1800 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001801 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 _PyUnicode_WSTR(unicode), end,
1803 PyUnicode_1BYTE_DATA(unicode));
1804 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1805 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1806 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1807 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001808 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001809 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001810 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 }
1812 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001813 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001814 _PyUnicode_UTF8(unicode) = NULL;
1815 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 }
1817 PyObject_FREE(_PyUnicode_WSTR(unicode));
1818 _PyUnicode_WSTR(unicode) = NULL;
1819 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1820 }
1821 /* In this case we might have to convert down from 4-byte native
1822 wchar_t to 2-byte unicode. */
1823 else if (maxchar < 65536) {
1824 assert(num_surrogates == 0 &&
1825 "FindMaxCharAndNumSurrogatePairs() messed up");
1826
Victor Stinner506f5922011-09-28 22:34:18 +02001827#if SIZEOF_WCHAR_T == 2
1828 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001829 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001830 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1831 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1832 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001833 _PyUnicode_UTF8(unicode) = NULL;
1834 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001835#else
1836 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001838 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001839 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001840 PyErr_NoMemory();
1841 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 }
Victor Stinner506f5922011-09-28 22:34:18 +02001843 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1844 _PyUnicode_WSTR(unicode), end,
1845 PyUnicode_2BYTE_DATA(unicode));
1846 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1847 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1848 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001849 _PyUnicode_UTF8(unicode) = NULL;
1850 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001851 PyObject_FREE(_PyUnicode_WSTR(unicode));
1852 _PyUnicode_WSTR(unicode) = NULL;
1853 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1854#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 }
1856 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1857 else {
1858#if SIZEOF_WCHAR_T == 2
1859 /* in case the native representation is 2-bytes, we need to allocate a
1860 new normalized 4-byte version. */
1861 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001862 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1863 PyErr_NoMemory();
1864 return -1;
1865 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001866 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1867 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 PyErr_NoMemory();
1869 return -1;
1870 }
1871 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1872 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001873 _PyUnicode_UTF8(unicode) = NULL;
1874 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001875 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1876 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001877 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878 PyObject_FREE(_PyUnicode_WSTR(unicode));
1879 _PyUnicode_WSTR(unicode) = NULL;
1880 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1881#else
1882 assert(num_surrogates == 0);
1883
Victor Stinnerc3c74152011-10-02 20:39:55 +02001884 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001886 _PyUnicode_UTF8(unicode) = NULL;
1887 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1889#endif
1890 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1891 }
1892 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001893 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 return 0;
1895}
1896
Alexander Belopolsky40018472011-02-26 01:02:56 +00001897static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001898unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899{
Walter Dörwald16807132007-05-25 13:52:07 +00001900 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001901 case SSTATE_NOT_INTERNED:
1902 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001903
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 case SSTATE_INTERNED_MORTAL:
1905 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001906 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001907 if (PyDict_DelItem(interned, unicode) != 0) {
1908 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1909 NULL);
1910 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001911 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001912
Benjamin Peterson29060642009-01-31 22:14:21 +00001913 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001914 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1915 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001916
Benjamin Peterson29060642009-01-31 22:14:21 +00001917 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001918 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001919 }
1920
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001921 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001923 }
1924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001925 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001926 }
1927 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001928 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001931 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932}
1933
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001934#ifdef Py_DEBUG
1935static int
1936unicode_is_singleton(PyObject *unicode)
1937{
1938 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1939 if (unicode == unicode_empty)
1940 return 1;
1941 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1942 {
1943 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1944 if (ch < 256 && unicode_latin1[ch] == unicode)
1945 return 1;
1946 }
1947 return 0;
1948}
1949#endif
1950
Alexander Belopolsky40018472011-02-26 01:02:56 +00001951static int
Victor Stinner488fa492011-12-12 00:01:39 +01001952unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001953{
Victor Stinner488fa492011-12-12 00:01:39 +01001954 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001955 if (Py_REFCNT(unicode) != 1)
1956 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001957 if (_PyUnicode_HASH(unicode) != -1)
1958 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001959 if (PyUnicode_CHECK_INTERNED(unicode))
1960 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001961 if (!PyUnicode_CheckExact(unicode))
1962 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001963#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001964 /* singleton refcount is greater than 1 */
1965 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001966#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001967 return 1;
1968}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001969
Victor Stinnerfe226c02011-10-03 03:52:20 +02001970static int
1971unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1972{
1973 PyObject *unicode;
1974 Py_ssize_t old_length;
1975
1976 assert(p_unicode != NULL);
1977 unicode = *p_unicode;
1978
1979 assert(unicode != NULL);
1980 assert(PyUnicode_Check(unicode));
1981 assert(0 <= length);
1982
Victor Stinner910337b2011-10-03 03:20:16 +02001983 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001984 old_length = PyUnicode_WSTR_LENGTH(unicode);
1985 else
1986 old_length = PyUnicode_GET_LENGTH(unicode);
1987 if (old_length == length)
1988 return 0;
1989
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001990 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001991 _Py_INCREF_UNICODE_EMPTY();
1992 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001993 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001994 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001995 return 0;
1996 }
1997
Victor Stinner488fa492011-12-12 00:01:39 +01001998 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001999 PyObject *copy = resize_copy(unicode, length);
2000 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002001 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002002 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002003 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002004 }
2005
Victor Stinnerfe226c02011-10-03 03:52:20 +02002006 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002007 PyObject *new_unicode = resize_compact(unicode, length);
2008 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002009 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002010 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002011 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002012 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002013 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002014}
2015
Alexander Belopolsky40018472011-02-26 01:02:56 +00002016int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002017PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002018{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002019 PyObject *unicode;
2020 if (p_unicode == NULL) {
2021 PyErr_BadInternalCall();
2022 return -1;
2023 }
2024 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002025 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002026 {
2027 PyErr_BadInternalCall();
2028 return -1;
2029 }
2030 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002031}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002032
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002033/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002034
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002035 WARNING: The function doesn't copy the terminating null character and
2036 doesn't check the maximum character (may write a latin1 character in an
2037 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002038static void
2039unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2040 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002041{
2042 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2043 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002044 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002045
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002046 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002047 switch (kind) {
2048 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002049#ifdef Py_DEBUG
2050 if (PyUnicode_IS_ASCII(unicode)) {
2051 Py_UCS4 maxchar = ucs1lib_find_max_char(
2052 (const Py_UCS1*)str,
2053 (const Py_UCS1*)str + len);
2054 assert(maxchar < 128);
2055 }
2056#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002057 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002058 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002059 }
2060 case PyUnicode_2BYTE_KIND: {
2061 Py_UCS2 *start = (Py_UCS2 *)data + index;
2062 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002063
Victor Stinner184252a2012-06-16 02:57:41 +02002064 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002065 *ucs2 = (Py_UCS2)*str;
2066
2067 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002068 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002069 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002070 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002071 Py_UCS4 *start = (Py_UCS4 *)data + index;
2072 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002073
Victor Stinner184252a2012-06-16 02:57:41 +02002074 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002075 *ucs4 = (Py_UCS4)*str;
2076
2077 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002078 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002079 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002080 default:
2081 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002082 }
2083}
2084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085static PyObject*
2086get_latin1_char(unsigned char ch)
2087{
Victor Stinnera464fc12011-10-02 20:39:30 +02002088 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002090 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 if (!unicode)
2092 return NULL;
2093 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002094 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 unicode_latin1[ch] = unicode;
2096 }
2097 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002098 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099}
2100
Victor Stinner985a82a2014-01-03 12:53:47 +01002101static PyObject*
2102unicode_char(Py_UCS4 ch)
2103{
2104 PyObject *unicode;
2105
2106 assert(ch <= MAX_UNICODE);
2107
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002108 if (ch < 256)
2109 return get_latin1_char(ch);
2110
Victor Stinner985a82a2014-01-03 12:53:47 +01002111 unicode = PyUnicode_New(1, ch);
2112 if (unicode == NULL)
2113 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002114
2115 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2116 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002117 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002118 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002119 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2120 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2121 }
2122 assert(_PyUnicode_CheckConsistency(unicode, 1));
2123 return unicode;
2124}
2125
Alexander Belopolsky40018472011-02-26 01:02:56 +00002126PyObject *
2127PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002129 if (u == NULL)
2130 return (PyObject*)_PyUnicode_New(size);
2131
2132 if (size < 0) {
2133 PyErr_BadInternalCall();
2134 return NULL;
2135 }
2136
2137 return PyUnicode_FromWideChar(u, size);
2138}
2139
2140PyObject *
2141PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2142{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002143 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 Py_UCS4 maxchar = 0;
2145 Py_ssize_t num_surrogates;
2146
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002147 if (u == NULL && size != 0) {
2148 PyErr_BadInternalCall();
2149 return NULL;
2150 }
2151
2152 if (size == -1) {
2153 size = wcslen(u);
2154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002156 /* If the Unicode data is known at construction time, we can apply
2157 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002160 if (size == 0)
2161 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002163 /* Single character Unicode objects in the Latin-1 range are
2164 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002165 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 return get_latin1_char((unsigned char)*u);
2167
2168 /* If not empty and not single character, copy the Unicode data
2169 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002170 if (find_maxchar_surrogates(u, u + size,
2171 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 return NULL;
2173
Victor Stinner8faf8212011-12-08 22:14:11 +01002174 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 if (!unicode)
2176 return NULL;
2177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 switch (PyUnicode_KIND(unicode)) {
2179 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002180 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2182 break;
2183 case PyUnicode_2BYTE_KIND:
2184#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002185 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002187 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2189#endif
2190 break;
2191 case PyUnicode_4BYTE_KIND:
2192#if SIZEOF_WCHAR_T == 2
2193 /* This is the only case which has to process surrogates, thus
2194 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002195 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196#else
2197 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002198 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199#endif
2200 break;
2201 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002202 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206}
2207
Alexander Belopolsky40018472011-02-26 01:02:56 +00002208PyObject *
2209PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002210{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002211 if (size < 0) {
2212 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002213 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002214 return NULL;
2215 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002216 if (u != NULL)
2217 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2218 else
2219 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002220}
2221
Alexander Belopolsky40018472011-02-26 01:02:56 +00002222PyObject *
2223PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002224{
2225 size_t size = strlen(u);
2226 if (size > PY_SSIZE_T_MAX) {
2227 PyErr_SetString(PyExc_OverflowError, "input too long");
2228 return NULL;
2229 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002230 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002231}
2232
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002233PyObject *
2234_PyUnicode_FromId(_Py_Identifier *id)
2235{
2236 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002237 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2238 strlen(id->string),
2239 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002240 if (!id->object)
2241 return NULL;
2242 PyUnicode_InternInPlace(&id->object);
2243 assert(!id->next);
2244 id->next = static_strings;
2245 static_strings = id;
2246 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002247 return id->object;
2248}
2249
2250void
2251_PyUnicode_ClearStaticStrings()
2252{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002253 _Py_Identifier *tmp, *s = static_strings;
2254 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002255 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002256 tmp = s->next;
2257 s->next = NULL;
2258 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002259 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002260 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002261}
2262
Benjamin Peterson0df54292012-03-26 14:50:32 -04002263/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002264
Victor Stinnerd3f08822012-05-29 12:57:52 +02002265PyObject*
2266_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002267{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002268 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002269 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002270 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002271#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002272 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002273#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002274 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002275 }
Victor Stinner785938e2011-12-11 20:09:03 +01002276 unicode = PyUnicode_New(size, 127);
2277 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002278 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002279 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2280 assert(_PyUnicode_CheckConsistency(unicode, 1));
2281 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002282}
2283
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002284static Py_UCS4
2285kind_maxchar_limit(unsigned int kind)
2286{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002287 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002288 case PyUnicode_1BYTE_KIND:
2289 return 0x80;
2290 case PyUnicode_2BYTE_KIND:
2291 return 0x100;
2292 case PyUnicode_4BYTE_KIND:
2293 return 0x10000;
2294 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002295 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002296 }
2297}
2298
Victor Stinner702c7342011-10-05 13:50:52 +02002299static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002300_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002303 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002304
Serhiy Storchaka678db842013-01-26 12:16:36 +02002305 if (size == 0)
2306 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002307 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002308 if (size == 1)
2309 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002310
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002311 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002312 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313 if (!res)
2314 return NULL;
2315 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002316 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002318}
2319
Victor Stinnere57b1c02011-09-28 22:20:48 +02002320static PyObject*
2321_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322{
2323 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002324 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002325
Serhiy Storchaka678db842013-01-26 12:16:36 +02002326 if (size == 0)
2327 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002328 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002329 if (size == 1)
2330 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002331
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002332 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002333 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002334 if (!res)
2335 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002336 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002338 else {
2339 _PyUnicode_CONVERT_BYTES(
2340 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2341 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002342 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 return res;
2344}
2345
Victor Stinnere57b1c02011-09-28 22:20:48 +02002346static PyObject*
2347_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348{
2349 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002350 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002351
Serhiy Storchaka678db842013-01-26 12:16:36 +02002352 if (size == 0)
2353 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002354 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002355 if (size == 1)
2356 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002357
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002358 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002359 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 if (!res)
2361 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002362 if (max_char < 256)
2363 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2364 PyUnicode_1BYTE_DATA(res));
2365 else if (max_char < 0x10000)
2366 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2367 PyUnicode_2BYTE_DATA(res));
2368 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002370 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 return res;
2372}
2373
2374PyObject*
2375PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2376{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002377 if (size < 0) {
2378 PyErr_SetString(PyExc_ValueError, "size must be positive");
2379 return NULL;
2380 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002381 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002383 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002385 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002387 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002388 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002389 PyErr_SetString(PyExc_SystemError, "invalid kind");
2390 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392}
2393
Victor Stinnerece58de2012-04-23 23:36:38 +02002394Py_UCS4
2395_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2396{
2397 enum PyUnicode_Kind kind;
2398 void *startptr, *endptr;
2399
2400 assert(PyUnicode_IS_READY(unicode));
2401 assert(0 <= start);
2402 assert(end <= PyUnicode_GET_LENGTH(unicode));
2403 assert(start <= end);
2404
2405 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2406 return PyUnicode_MAX_CHAR_VALUE(unicode);
2407
2408 if (start == end)
2409 return 127;
2410
Victor Stinner94d558b2012-04-27 22:26:58 +02002411 if (PyUnicode_IS_ASCII(unicode))
2412 return 127;
2413
Victor Stinnerece58de2012-04-23 23:36:38 +02002414 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002415 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002416 endptr = (char *)startptr + end * kind;
2417 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002418 switch(kind) {
2419 case PyUnicode_1BYTE_KIND:
2420 return ucs1lib_find_max_char(startptr, endptr);
2421 case PyUnicode_2BYTE_KIND:
2422 return ucs2lib_find_max_char(startptr, endptr);
2423 case PyUnicode_4BYTE_KIND:
2424 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002425 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002426 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002427 }
2428}
2429
Victor Stinner25a4b292011-10-06 12:31:55 +02002430/* Ensure that a string uses the most efficient storage, if it is not the
2431 case: create a new string with of the right kind. Write NULL into *p_unicode
2432 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002433static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002434unicode_adjust_maxchar(PyObject **p_unicode)
2435{
2436 PyObject *unicode, *copy;
2437 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002438 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002439 unsigned int kind;
2440
2441 assert(p_unicode != NULL);
2442 unicode = *p_unicode;
2443 assert(PyUnicode_IS_READY(unicode));
2444 if (PyUnicode_IS_ASCII(unicode))
2445 return;
2446
2447 len = PyUnicode_GET_LENGTH(unicode);
2448 kind = PyUnicode_KIND(unicode);
2449 if (kind == PyUnicode_1BYTE_KIND) {
2450 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002451 max_char = ucs1lib_find_max_char(u, u + len);
2452 if (max_char >= 128)
2453 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002454 }
2455 else if (kind == PyUnicode_2BYTE_KIND) {
2456 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002457 max_char = ucs2lib_find_max_char(u, u + len);
2458 if (max_char >= 256)
2459 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002460 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002461 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002462 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002463 max_char = ucs4lib_find_max_char(u, u + len);
2464 if (max_char >= 0x10000)
2465 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002466 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002467 else
2468 Py_UNREACHABLE();
2469
Victor Stinner25a4b292011-10-06 12:31:55 +02002470 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002471 if (copy != NULL)
2472 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002473 Py_DECREF(unicode);
2474 *p_unicode = copy;
2475}
2476
Victor Stinner034f6cf2011-09-30 02:26:44 +02002477PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002478_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002479{
Victor Stinner87af4f22011-11-21 23:03:47 +01002480 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002481 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002482
Victor Stinner034f6cf2011-09-30 02:26:44 +02002483 if (!PyUnicode_Check(unicode)) {
2484 PyErr_BadInternalCall();
2485 return NULL;
2486 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002487 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002488 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002489
Victor Stinner87af4f22011-11-21 23:03:47 +01002490 length = PyUnicode_GET_LENGTH(unicode);
2491 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002492 if (!copy)
2493 return NULL;
2494 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2495
Christian Heimesf051e432016-09-13 20:22:02 +02002496 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002497 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002498 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002499 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002500}
2501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502
Victor Stinnerbc603d12011-10-02 01:00:40 +02002503/* Widen Unicode objects to larger buffers. Don't write terminating null
2504 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002506static void*
2507unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002509 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002510
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002511 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002512 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002513 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002514 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002515 if (!result)
2516 return PyErr_NoMemory();
2517 assert(skind == PyUnicode_1BYTE_KIND);
2518 _PyUnicode_CONVERT_BYTES(
2519 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002520 (const Py_UCS1 *)data,
2521 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002522 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002524 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002525 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002526 if (!result)
2527 return PyErr_NoMemory();
2528 if (skind == PyUnicode_2BYTE_KIND) {
2529 _PyUnicode_CONVERT_BYTES(
2530 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002531 (const Py_UCS2 *)data,
2532 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002533 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002535 else {
2536 assert(skind == PyUnicode_1BYTE_KIND);
2537 _PyUnicode_CONVERT_BYTES(
2538 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002539 (const Py_UCS1 *)data,
2540 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002541 result);
2542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002544 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002545 Py_UNREACHABLE();
2546 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548}
2549
2550static Py_UCS4*
2551as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2552 int copy_null)
2553{
2554 int kind;
2555 void *data;
2556 Py_ssize_t len, targetlen;
2557 if (PyUnicode_READY(string) == -1)
2558 return NULL;
2559 kind = PyUnicode_KIND(string);
2560 data = PyUnicode_DATA(string);
2561 len = PyUnicode_GET_LENGTH(string);
2562 targetlen = len;
2563 if (copy_null)
2564 targetlen++;
2565 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002566 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 if (!target) {
2568 PyErr_NoMemory();
2569 return NULL;
2570 }
2571 }
2572 else {
2573 if (targetsize < targetlen) {
2574 PyErr_Format(PyExc_SystemError,
2575 "string is longer than the buffer");
2576 if (copy_null && 0 < targetsize)
2577 target[0] = 0;
2578 return NULL;
2579 }
2580 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002581 if (kind == PyUnicode_1BYTE_KIND) {
2582 Py_UCS1 *start = (Py_UCS1 *) data;
2583 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002585 else if (kind == PyUnicode_2BYTE_KIND) {
2586 Py_UCS2 *start = (Py_UCS2 *) data;
2587 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2588 }
2589 else {
2590 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002591 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002592 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 if (copy_null)
2594 target[len] = 0;
2595 return target;
2596}
2597
2598Py_UCS4*
2599PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2600 int copy_null)
2601{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002602 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 PyErr_BadInternalCall();
2604 return NULL;
2605 }
2606 return as_ucs4(string, target, targetsize, copy_null);
2607}
2608
2609Py_UCS4*
2610PyUnicode_AsUCS4Copy(PyObject *string)
2611{
2612 return as_ucs4(string, NULL, 0, 1);
2613}
2614
/* Maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   In integer arithmetic, 1 + (SIZEOF_LONG_LONG*53-1)/22 equals
   ceil(SIZEOF_LONG_LONG*53/22); the other 1 in the leading 2 is the
   sign. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002619
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002620static int
2621unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2622 Py_ssize_t width, Py_ssize_t precision)
2623{
2624 Py_ssize_t length, fill, arglen;
2625 Py_UCS4 maxchar;
2626
2627 if (PyUnicode_READY(str) == -1)
2628 return -1;
2629
2630 length = PyUnicode_GET_LENGTH(str);
2631 if ((precision == -1 || precision >= length)
2632 && width <= length)
2633 return _PyUnicodeWriter_WriteStr(writer, str);
2634
2635 if (precision != -1)
2636 length = Py_MIN(precision, length);
2637
2638 arglen = Py_MAX(length, width);
2639 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2640 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2641 else
2642 maxchar = writer->maxchar;
2643
2644 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2645 return -1;
2646
2647 if (width > length) {
2648 fill = width - length;
2649 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2650 return -1;
2651 writer->pos += fill;
2652 }
2653
2654 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2655 str, 0, length);
2656 writer->pos += length;
2657 return 0;
2658}
2659
2660static int
Victor Stinner998b8062018-09-12 00:23:25 +02002661unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002662 Py_ssize_t width, Py_ssize_t precision)
2663{
2664 /* UTF-8 */
2665 Py_ssize_t length;
2666 PyObject *unicode;
2667 int res;
2668
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002669 if (precision == -1) {
2670 length = strlen(str);
2671 }
2672 else {
2673 length = 0;
2674 while (length < precision && str[length]) {
2675 length++;
2676 }
2677 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002678 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2679 if (unicode == NULL)
2680 return -1;
2681
2682 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2683 Py_DECREF(unicode);
2684 return res;
2685}
2686
Victor Stinner96865452011-03-01 23:44:09 +00002687static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002688unicode_fromformat_arg(_PyUnicodeWriter *writer,
2689 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002690{
Victor Stinnere215d962012-10-06 23:03:36 +02002691 const char *p;
2692 Py_ssize_t len;
2693 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002694 Py_ssize_t width;
2695 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002696 int longflag;
2697 int longlongflag;
2698 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002699 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002700
2701 p = f;
2702 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002703 zeropad = 0;
2704 if (*f == '0') {
2705 zeropad = 1;
2706 f++;
2707 }
Victor Stinner96865452011-03-01 23:44:09 +00002708
2709 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002710 width = -1;
2711 if (Py_ISDIGIT((unsigned)*f)) {
2712 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002713 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002714 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002715 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002716 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002718 return NULL;
2719 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002720 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002721 f++;
2722 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002723 }
2724 precision = -1;
2725 if (*f == '.') {
2726 f++;
2727 if (Py_ISDIGIT((unsigned)*f)) {
2728 precision = (*f - '0');
2729 f++;
2730 while (Py_ISDIGIT((unsigned)*f)) {
2731 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2732 PyErr_SetString(PyExc_ValueError,
2733 "precision too big");
2734 return NULL;
2735 }
2736 precision = (precision * 10) + (*f - '0');
2737 f++;
2738 }
2739 }
Victor Stinner96865452011-03-01 23:44:09 +00002740 if (*f == '%') {
2741 /* "%.3%s" => f points to "3" */
2742 f--;
2743 }
2744 }
2745 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002746 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002747 f--;
2748 }
Victor Stinner96865452011-03-01 23:44:09 +00002749
2750 /* Handle %ld, %lu, %lld and %llu. */
2751 longflag = 0;
2752 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002753 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002754 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002755 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002756 longflag = 1;
2757 ++f;
2758 }
Victor Stinner96865452011-03-01 23:44:09 +00002759 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002760 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002761 longlongflag = 1;
2762 f += 2;
2763 }
Victor Stinner96865452011-03-01 23:44:09 +00002764 }
2765 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002766 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002767 size_tflag = 1;
2768 ++f;
2769 }
Victor Stinnere215d962012-10-06 23:03:36 +02002770
2771 if (f[1] == '\0')
2772 writer->overallocate = 0;
2773
2774 switch (*f) {
2775 case 'c':
2776 {
2777 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002778 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002779 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002780 "character argument not in range(0x110000)");
2781 return NULL;
2782 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002783 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002784 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002785 break;
2786 }
2787
2788 case 'i':
2789 case 'd':
2790 case 'u':
2791 case 'x':
2792 {
2793 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002794 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002795 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002796
2797 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002798 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002799 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002800 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002801 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002802 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002803 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002804 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002805 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002806 va_arg(*vargs, size_t));
2807 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002808 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002809 va_arg(*vargs, unsigned int));
2810 }
2811 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002812 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002813 }
2814 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002815 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002816 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002817 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002818 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002819 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002820 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002821 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002822 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002823 va_arg(*vargs, Py_ssize_t));
2824 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002825 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002826 va_arg(*vargs, int));
2827 }
2828 assert(len >= 0);
2829
Victor Stinnere215d962012-10-06 23:03:36 +02002830 if (precision < len)
2831 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002832
2833 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002834 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2835 return NULL;
2836
Victor Stinnere215d962012-10-06 23:03:36 +02002837 if (width > precision) {
2838 Py_UCS4 fillchar;
2839 fill = width - precision;
2840 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002841 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2842 return NULL;
2843 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002844 }
Victor Stinner15a11362012-10-06 23:48:20 +02002845 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002846 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002847 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2848 return NULL;
2849 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002850 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002851
Victor Stinner4a587072013-11-19 12:54:53 +01002852 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2853 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002854 break;
2855 }
2856
2857 case 'p':
2858 {
2859 char number[MAX_LONG_LONG_CHARS];
2860
2861 len = sprintf(number, "%p", va_arg(*vargs, void*));
2862 assert(len >= 0);
2863
2864 /* %p is ill-defined: ensure leading 0x. */
2865 if (number[1] == 'X')
2866 number[1] = 'x';
2867 else if (number[1] != 'x') {
2868 memmove(number + 2, number,
2869 strlen(number) + 1);
2870 number[0] = '0';
2871 number[1] = 'x';
2872 len += 2;
2873 }
2874
Victor Stinner4a587072013-11-19 12:54:53 +01002875 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002876 return NULL;
2877 break;
2878 }
2879
2880 case 's':
2881 {
2882 /* UTF-8 */
2883 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002884 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002885 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002886 break;
2887 }
2888
2889 case 'U':
2890 {
2891 PyObject *obj = va_arg(*vargs, PyObject *);
2892 assert(obj && _PyUnicode_CHECK(obj));
2893
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002894 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002895 return NULL;
2896 break;
2897 }
2898
2899 case 'V':
2900 {
2901 PyObject *obj = va_arg(*vargs, PyObject *);
2902 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002903 if (obj) {
2904 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002905 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002906 return NULL;
2907 }
2908 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002909 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002910 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002911 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002912 }
2913 break;
2914 }
2915
2916 case 'S':
2917 {
2918 PyObject *obj = va_arg(*vargs, PyObject *);
2919 PyObject *str;
2920 assert(obj);
2921 str = PyObject_Str(obj);
2922 if (!str)
2923 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002924 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002925 Py_DECREF(str);
2926 return NULL;
2927 }
2928 Py_DECREF(str);
2929 break;
2930 }
2931
2932 case 'R':
2933 {
2934 PyObject *obj = va_arg(*vargs, PyObject *);
2935 PyObject *repr;
2936 assert(obj);
2937 repr = PyObject_Repr(obj);
2938 if (!repr)
2939 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002940 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002941 Py_DECREF(repr);
2942 return NULL;
2943 }
2944 Py_DECREF(repr);
2945 break;
2946 }
2947
2948 case 'A':
2949 {
2950 PyObject *obj = va_arg(*vargs, PyObject *);
2951 PyObject *ascii;
2952 assert(obj);
2953 ascii = PyObject_ASCII(obj);
2954 if (!ascii)
2955 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002956 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002957 Py_DECREF(ascii);
2958 return NULL;
2959 }
2960 Py_DECREF(ascii);
2961 break;
2962 }
2963
2964 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002965 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002966 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002967 break;
2968
2969 default:
2970 /* if we stumble upon an unknown formatting code, copy the rest
2971 of the format string to the output string. (we cannot just
2972 skip the code, since there's no way to know what's in the
2973 argument list) */
2974 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002975 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002976 return NULL;
2977 f = p+len;
2978 return f;
2979 }
2980
2981 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002982 return f;
2983}
2984
Walter Dörwaldd2034312007-05-18 16:29:38 +00002985PyObject *
2986PyUnicode_FromFormatV(const char *format, va_list vargs)
2987{
Victor Stinnere215d962012-10-06 23:03:36 +02002988 va_list vargs2;
2989 const char *f;
2990 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002991
Victor Stinner8f674cc2013-04-17 23:02:17 +02002992 _PyUnicodeWriter_Init(&writer);
2993 writer.min_length = strlen(format) + 100;
2994 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002995
Benjamin Peterson0c212142016-09-20 20:39:33 -07002996 // Copy varags to be able to pass a reference to a subfunction.
2997 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002998
2999 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003000 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003001 f = unicode_fromformat_arg(&writer, f, &vargs2);
3002 if (f == NULL)
3003 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003005 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003006 const char *p;
3007 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003008
Victor Stinnere215d962012-10-06 23:03:36 +02003009 p = f;
3010 do
3011 {
3012 if ((unsigned char)*p > 127) {
3013 PyErr_Format(PyExc_ValueError,
3014 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3015 "string, got a non-ASCII byte: 0x%02x",
3016 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003017 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003018 }
3019 p++;
3020 }
3021 while (*p != '\0' && *p != '%');
3022 len = p - f;
3023
3024 if (*p == '\0')
3025 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003026
3027 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003028 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003029
3030 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003031 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003032 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003033 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003034 return _PyUnicodeWriter_Finish(&writer);
3035
3036 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003037 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003038 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003039 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003040}
3041
Walter Dörwaldd2034312007-05-18 16:29:38 +00003042PyObject *
3043PyUnicode_FromFormat(const char *format, ...)
3044{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003045 PyObject* ret;
3046 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003047
3048#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003049 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003050#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003051 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003052#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003053 ret = PyUnicode_FromFormatV(format, vargs);
3054 va_end(vargs);
3055 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003056}
3057
Serhiy Storchakac46db922018-10-23 22:58:24 +03003058static Py_ssize_t
3059unicode_get_widechar_size(PyObject *unicode)
3060{
3061 Py_ssize_t res;
3062
3063 assert(unicode != NULL);
3064 assert(_PyUnicode_CHECK(unicode));
3065
3066 if (_PyUnicode_WSTR(unicode) != NULL) {
3067 return PyUnicode_WSTR_LENGTH(unicode);
3068 }
3069 assert(PyUnicode_IS_READY(unicode));
3070
3071 res = _PyUnicode_LENGTH(unicode);
3072#if SIZEOF_WCHAR_T == 2
3073 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3074 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3075 const Py_UCS4 *end = s + res;
3076 for (; s < end; ++s) {
3077 if (*s > 0xFFFF) {
3078 ++res;
3079 }
3080 }
3081 }
3082#endif
3083 return res;
3084}
3085
3086static void
3087unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3088{
3089 const wchar_t *wstr;
3090
3091 assert(unicode != NULL);
3092 assert(_PyUnicode_CHECK(unicode));
3093
3094 wstr = _PyUnicode_WSTR(unicode);
3095 if (wstr != NULL) {
3096 memcpy(w, wstr, size * sizeof(wchar_t));
3097 return;
3098 }
3099 assert(PyUnicode_IS_READY(unicode));
3100
3101 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3102 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3103 for (; size--; ++s, ++w) {
3104 *w = *s;
3105 }
3106 }
3107 else {
3108#if SIZEOF_WCHAR_T == 4
3109 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3110 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3111 for (; size--; ++s, ++w) {
3112 *w = *s;
3113 }
3114#else
3115 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3116 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3117 for (; size--; ++s, ++w) {
3118 Py_UCS4 ch = *s;
3119 if (ch > 0xFFFF) {
3120 assert(ch <= MAX_UNICODE);
3121 /* encode surrogate pair in this case */
3122 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3123 if (!size--)
3124 break;
3125 *w = Py_UNICODE_LOW_SURROGATE(ch);
3126 }
3127 else {
3128 *w = ch;
3129 }
3130 }
3131#endif
3132 }
3133}
3134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003135#ifdef HAVE_WCHAR_H
3136
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003137/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003138
Victor Stinnerd88d9832011-09-06 02:00:05 +02003139 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003140 character) required to convert the unicode object. Ignore size argument.
3141
Victor Stinnerd88d9832011-09-06 02:00:05 +02003142 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003143 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003144 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003145Py_ssize_t
3146PyUnicode_AsWideChar(PyObject *unicode,
3147 wchar_t *w,
3148 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003149{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003150 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003151
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003152 if (unicode == NULL) {
3153 PyErr_BadInternalCall();
3154 return -1;
3155 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003156 if (!PyUnicode_Check(unicode)) {
3157 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003158 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003159 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003160
3161 res = unicode_get_widechar_size(unicode);
3162 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003163 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003164 }
3165
3166 if (size > res) {
3167 size = res + 1;
3168 }
3169 else {
3170 res = size;
3171 }
3172 unicode_copy_as_widechar(unicode, w, size);
3173 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003174}
3175
Victor Stinner137c34c2010-09-29 10:25:54 +00003176wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003177PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003178 Py_ssize_t *size)
3179{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003180 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003181 Py_ssize_t buflen;
3182
3183 if (unicode == NULL) {
3184 PyErr_BadInternalCall();
3185 return NULL;
3186 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003187 if (!PyUnicode_Check(unicode)) {
3188 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003189 return NULL;
3190 }
3191
Serhiy Storchakac46db922018-10-23 22:58:24 +03003192 buflen = unicode_get_widechar_size(unicode);
3193 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003194 if (buffer == NULL) {
3195 PyErr_NoMemory();
3196 return NULL;
3197 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003198 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3199 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003200 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003201 }
3202 else if (wcslen(buffer) != (size_t)buflen) {
3203 PyMem_FREE(buffer);
3204 PyErr_SetString(PyExc_ValueError,
3205 "embedded null character");
3206 return NULL;
3207 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003208 return buffer;
3209}
3210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003211#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212
Alexander Belopolsky40018472011-02-26 01:02:56 +00003213PyObject *
3214PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003215{
Victor Stinner8faf8212011-12-08 22:14:11 +01003216 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 PyErr_SetString(PyExc_ValueError,
3218 "chr() arg not in range(0x110000)");
3219 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003220 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003221
Victor Stinner985a82a2014-01-03 12:53:47 +01003222 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003223}
3224
Alexander Belopolsky40018472011-02-26 01:02:56 +00003225PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003226PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003228 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003230 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003231 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003232 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 Py_INCREF(obj);
3234 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003235 }
3236 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003237 /* For a Unicode subtype that's not a Unicode object,
3238 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003239 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003240 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003241 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003242 "Can't convert '%.100s' object to str implicitly",
3243 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003244 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003245}
3246
Alexander Belopolsky40018472011-02-26 01:02:56 +00003247PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003248PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003249 const char *encoding,
3250 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003251{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003252 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003253 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003254
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003256 PyErr_BadInternalCall();
3257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003259
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003260 /* Decoding bytes objects is the most common case and should be fast */
3261 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003262 if (PyBytes_GET_SIZE(obj) == 0) {
3263 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3264 return NULL;
3265 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003266 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003267 }
3268 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003269 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3270 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003271 }
3272
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003273 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 PyErr_SetString(PyExc_TypeError,
3275 "decoding str is not supported");
3276 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003277 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003278
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003279 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3280 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3281 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003282 "decoding to str: need a bytes-like object, %.80s found",
3283 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003284 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003285 }
Tim Petersced69f82003-09-16 20:30:58 +00003286
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003287 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003288 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003289 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3290 return NULL;
3291 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003292 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003294
Serhiy Storchaka05997252013-01-26 12:14:02 +02003295 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003296 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003297 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298}
3299
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) into lower; every
   run of other characters between two copied characters is collapsed into a
   single '_', and such runs at the start or the end of the name are dropped.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters; lower is not null-terminated in that case). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    char *dst_end = &lower[lower_len - 1];  /* reserved for the final '\0' */
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* separator: remember it, emitted lazily as a single '_' */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == dst_end) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_end) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3348
Alexander Belopolsky40018472011-02-26 01:02:56 +00003349PyObject *
3350PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003351 Py_ssize_t size,
3352 const char *encoding,
3353 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003354{
3355 PyObject *buffer = NULL, *unicode;
3356 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003357 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3358
Victor Stinner22eb6892019-06-26 00:51:05 +02003359 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3360 return NULL;
3361 }
3362
Victor Stinnered076ed2019-06-26 01:49:32 +02003363 if (size == 0) {
3364 _Py_RETURN_UNICODE_EMPTY();
3365 }
3366
Victor Stinner942889a2016-09-05 15:40:10 -07003367 if (encoding == NULL) {
3368 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3369 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003370
Fred Drakee4315f52000-05-09 19:53:39 +00003371 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003372 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3373 char *lower = buflower;
3374
3375 /* Fast paths */
3376 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3377 lower += 3;
3378 if (*lower == '_') {
3379 /* Match "utf8" and "utf_8" */
3380 lower++;
3381 }
3382
3383 if (lower[0] == '8' && lower[1] == 0) {
3384 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3385 }
3386 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3387 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3388 }
3389 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3390 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3391 }
3392 }
3393 else {
3394 if (strcmp(lower, "ascii") == 0
3395 || strcmp(lower, "us_ascii") == 0) {
3396 return PyUnicode_DecodeASCII(s, size, errors);
3397 }
Steve Dowercc16be82016-09-08 10:35:16 -07003398 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003399 else if (strcmp(lower, "mbcs") == 0) {
3400 return PyUnicode_DecodeMBCS(s, size, errors);
3401 }
3402 #endif
3403 else if (strcmp(lower, "latin1") == 0
3404 || strcmp(lower, "latin_1") == 0
3405 || strcmp(lower, "iso_8859_1") == 0
3406 || strcmp(lower, "iso8859_1") == 0) {
3407 return PyUnicode_DecodeLatin1(s, size, errors);
3408 }
3409 }
Victor Stinner37296e82010-06-10 13:36:23 +00003410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411
3412 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003413 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003414 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003415 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003416 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417 if (buffer == NULL)
3418 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003419 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420 if (unicode == NULL)
3421 goto onError;
3422 if (!PyUnicode_Check(unicode)) {
3423 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003424 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003425 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003426 encoding,
3427 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 Py_DECREF(unicode);
3429 goto onError;
3430 }
3431 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003432 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003433
Benjamin Peterson29060642009-01-31 22:14:21 +00003434 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435 Py_XDECREF(buffer);
3436 return NULL;
3437}
3438
Alexander Belopolsky40018472011-02-26 01:02:56 +00003439PyObject *
3440PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003441 const char *encoding,
3442 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003443{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003444 if (!PyUnicode_Check(unicode)) {
3445 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003446 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003447 }
3448
Serhiy Storchaka00939072016-10-27 21:05:49 +03003449 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3450 "PyUnicode_AsDecodedObject() is deprecated; "
3451 "use PyCodec_Decode() to decode from str", 1) < 0)
3452 return NULL;
3453
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003454 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003456
3457 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003458 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003459}
3460
Alexander Belopolsky40018472011-02-26 01:02:56 +00003461PyObject *
3462PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003463 const char *encoding,
3464 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003465{
3466 PyObject *v;
3467
3468 if (!PyUnicode_Check(unicode)) {
3469 PyErr_BadArgument();
3470 goto onError;
3471 }
3472
Serhiy Storchaka00939072016-10-27 21:05:49 +03003473 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3474 "PyUnicode_AsDecodedUnicode() is deprecated; "
3475 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3476 return NULL;
3477
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003478 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003479 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003480
3481 /* Decode via the codec registry */
3482 v = PyCodec_Decode(unicode, encoding, errors);
3483 if (v == NULL)
3484 goto onError;
3485 if (!PyUnicode_Check(v)) {
3486 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003487 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003488 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003489 encoding,
3490 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003491 Py_DECREF(v);
3492 goto onError;
3493 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003494 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003495
Benjamin Peterson29060642009-01-31 22:14:21 +00003496 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003497 return NULL;
3498}
3499
Alexander Belopolsky40018472011-02-26 01:02:56 +00003500PyObject *
3501PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003502 Py_ssize_t size,
3503 const char *encoding,
3504 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505{
3506 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003507
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003508 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003509 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003510 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3512 Py_DECREF(unicode);
3513 return v;
3514}
3515
Alexander Belopolsky40018472011-02-26 01:02:56 +00003516PyObject *
3517PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003518 const char *encoding,
3519 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003520{
3521 PyObject *v;
3522
3523 if (!PyUnicode_Check(unicode)) {
3524 PyErr_BadArgument();
3525 goto onError;
3526 }
3527
Serhiy Storchaka00939072016-10-27 21:05:49 +03003528 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3529 "PyUnicode_AsEncodedObject() is deprecated; "
3530 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3531 "or PyCodec_Encode() for generic encoding", 1) < 0)
3532 return NULL;
3533
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003534 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003536
3537 /* Encode via the codec registry */
3538 v = PyCodec_Encode(unicode, encoding, errors);
3539 if (v == NULL)
3540 goto onError;
3541 return v;
3542
Benjamin Peterson29060642009-01-31 22:14:21 +00003543 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003544 return NULL;
3545}
3546
Victor Stinner1b579672011-12-17 05:47:23 +01003547
Victor Stinner2cba6b82018-01-10 22:46:15 +01003548static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003549unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003550 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003551{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003552 Py_ssize_t wlen;
3553 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3554 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003555 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003556 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003557
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003558 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003559 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003560 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003561 return NULL;
3562 }
3563
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003564 char *str;
3565 size_t error_pos;
3566 const char *reason;
3567 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003568 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003569 PyMem_Free(wstr);
3570
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003571 if (res != 0) {
3572 if (res == -2) {
3573 PyObject *exc;
3574 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3575 "locale", unicode,
3576 (Py_ssize_t)error_pos,
3577 (Py_ssize_t)(error_pos+1),
3578 reason);
3579 if (exc != NULL) {
3580 PyCodec_StrictErrors(exc);
3581 Py_DECREF(exc);
3582 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003583 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003584 else if (res == -3) {
3585 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3586 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003587 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003588 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003589 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003590 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003591 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003592
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003593 PyObject *bytes = PyBytes_FromString(str);
3594 PyMem_RawFree(str);
3595 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003596}
3597
Victor Stinnerad158722010-10-27 00:25:46 +00003598PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003599PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3600{
Victor Stinner709d23d2019-05-02 14:56:30 -04003601 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3602 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003603}
3604
3605PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003606PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003607{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003608 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003609 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003610 return unicode_encode_utf8(unicode,
3611 interp->fs_codec.error_handler,
3612 interp->fs_codec.errors);
3613 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003614#ifndef _Py_FORCE_UTF8_FS_ENCODING
3615 else if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003616 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003617 interp->fs_codec.encoding,
3618 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003619 }
Victor Stinnerad158722010-10-27 00:25:46 +00003620#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003621 else {
3622 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3623 machinery is not ready and so cannot be used:
3624 use wcstombs() in this case. */
3625 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3626 assert(filesystem_errors != NULL);
3627 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3628 assert(errors != _Py_ERROR_UNKNOWN);
3629#ifdef _Py_FORCE_UTF8_FS_ENCODING
3630 return unicode_encode_utf8(unicode, errors, NULL);
3631#else
3632 return unicode_encode_locale(unicode, errors, 0);
3633#endif
3634 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003635}
3636
Alexander Belopolsky40018472011-02-26 01:02:56 +00003637PyObject *
3638PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003639 const char *encoding,
3640 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641{
3642 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003643 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003644
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 if (!PyUnicode_Check(unicode)) {
3646 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648 }
Fred Drakee4315f52000-05-09 19:53:39 +00003649
Victor Stinner22eb6892019-06-26 00:51:05 +02003650 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3651 return NULL;
3652 }
3653
Victor Stinner942889a2016-09-05 15:40:10 -07003654 if (encoding == NULL) {
3655 return _PyUnicode_AsUTF8String(unicode, errors);
3656 }
3657
Fred Drakee4315f52000-05-09 19:53:39 +00003658 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003659 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3660 char *lower = buflower;
3661
3662 /* Fast paths */
3663 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3664 lower += 3;
3665 if (*lower == '_') {
3666 /* Match "utf8" and "utf_8" */
3667 lower++;
3668 }
3669
3670 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003671 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003672 }
3673 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3674 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3675 }
3676 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3677 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3678 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003679 }
Victor Stinner942889a2016-09-05 15:40:10 -07003680 else {
3681 if (strcmp(lower, "ascii") == 0
3682 || strcmp(lower, "us_ascii") == 0) {
3683 return _PyUnicode_AsASCIIString(unicode, errors);
3684 }
Steve Dowercc16be82016-09-08 10:35:16 -07003685#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003686 else if (strcmp(lower, "mbcs") == 0) {
3687 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3688 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003689#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003690 else if (strcmp(lower, "latin1") == 0 ||
3691 strcmp(lower, "latin_1") == 0 ||
3692 strcmp(lower, "iso_8859_1") == 0 ||
3693 strcmp(lower, "iso8859_1") == 0) {
3694 return _PyUnicode_AsLatin1String(unicode, errors);
3695 }
3696 }
Victor Stinner37296e82010-06-10 13:36:23 +00003697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698
3699 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003700 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003702 return NULL;
3703
3704 /* The normal path */
3705 if (PyBytes_Check(v))
3706 return v;
3707
3708 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003709 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003710 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003711 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003712
3713 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003714 "encoder %s returned bytearray instead of bytes; "
3715 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003716 encoding);
3717 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003718 Py_DECREF(v);
3719 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003720 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003721
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003722 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3723 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003724 Py_DECREF(v);
3725 return b;
3726 }
3727
3728 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003729 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003730 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003731 encoding,
3732 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003733 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003734 return NULL;
3735}
3736
Alexander Belopolsky40018472011-02-26 01:02:56 +00003737PyObject *
3738PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003739 const char *encoding,
3740 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003741{
3742 PyObject *v;
3743
3744 if (!PyUnicode_Check(unicode)) {
3745 PyErr_BadArgument();
3746 goto onError;
3747 }
3748
Serhiy Storchaka00939072016-10-27 21:05:49 +03003749 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3750 "PyUnicode_AsEncodedUnicode() is deprecated; "
3751 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3752 return NULL;
3753
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003754 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003755 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003756
3757 /* Encode via the codec registry */
3758 v = PyCodec_Encode(unicode, encoding, errors);
3759 if (v == NULL)
3760 goto onError;
3761 if (!PyUnicode_Check(v)) {
3762 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003763 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003764 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003765 encoding,
3766 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003767 Py_DECREF(v);
3768 goto onError;
3769 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003771
Benjamin Peterson29060642009-01-31 22:14:21 +00003772 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 return NULL;
3774}
3775
Victor Stinner2cba6b82018-01-10 22:46:15 +01003776static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003777unicode_decode_locale(const char *str, Py_ssize_t len,
3778 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003779{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003780 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3781 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003782 return NULL;
3783 }
3784
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003785 wchar_t *wstr;
3786 size_t wlen;
3787 const char *reason;
3788 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003789 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003790 if (res != 0) {
3791 if (res == -2) {
3792 PyObject *exc;
3793 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3794 "locale", str, len,
3795 (Py_ssize_t)wlen,
3796 (Py_ssize_t)(wlen + 1),
3797 reason);
3798 if (exc != NULL) {
3799 PyCodec_StrictErrors(exc);
3800 Py_DECREF(exc);
3801 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003802 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003803 else if (res == -3) {
3804 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3805 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003806 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003807 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003808 }
Victor Stinner2f197072011-12-17 07:08:30 +01003809 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003810 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003811
3812 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3813 PyMem_RawFree(wstr);
3814 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003815}
3816
3817PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003818PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3819 const char *errors)
3820{
Victor Stinner709d23d2019-05-02 14:56:30 -04003821 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3822 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003823}
3824
3825PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003826PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003827{
3828 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003829 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3830 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003831}
3832
3833
3834PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003835PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003836 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003837 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3838}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003839
Christian Heimes5894ba72007-11-04 11:43:14 +00003840PyObject*
3841PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3842{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003843 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003844 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003845 return unicode_decode_utf8(s, size,
3846 interp->fs_codec.error_handler,
3847 interp->fs_codec.errors,
3848 NULL);
3849 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003850#ifndef _Py_FORCE_UTF8_FS_ENCODING
3851 else if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003852 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003853 interp->fs_codec.encoding,
3854 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003855 }
Victor Stinnerad158722010-10-27 00:25:46 +00003856#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003857 else {
3858 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3859 machinery is not ready and so cannot be used:
3860 use mbstowcs() in this case. */
3861 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3862 assert(filesystem_errors != NULL);
3863 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3864 assert(errors != _Py_ERROR_UNKNOWN);
3865#ifdef _Py_FORCE_UTF8_FS_ENCODING
3866 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3867#else
3868 return unicode_decode_locale(s, size, errors, 0);
3869#endif
3870 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003871}
3872
Martin v. Löwis011e8422009-05-05 04:43:17 +00003873
3874int
3875PyUnicode_FSConverter(PyObject* arg, void* addr)
3876{
Brett Cannonec6ce872016-09-06 15:50:29 -07003877 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003878 PyObject *output = NULL;
3879 Py_ssize_t size;
3880 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003881 if (arg == NULL) {
3882 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003883 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003884 return 1;
3885 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003886 path = PyOS_FSPath(arg);
3887 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003888 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003889 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003890 if (PyBytes_Check(path)) {
3891 output = path;
3892 }
3893 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3894 output = PyUnicode_EncodeFSDefault(path);
3895 Py_DECREF(path);
3896 if (!output) {
3897 return 0;
3898 }
3899 assert(PyBytes_Check(output));
3900 }
3901
Victor Stinner0ea2a462010-04-30 00:22:08 +00003902 size = PyBytes_GET_SIZE(output);
3903 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003904 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003905 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003906 Py_DECREF(output);
3907 return 0;
3908 }
3909 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003910 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003911}
3912
3913
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003914int
3915PyUnicode_FSDecoder(PyObject* arg, void* addr)
3916{
Brett Cannona5711202016-09-06 19:36:01 -07003917 int is_buffer = 0;
3918 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003919 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003920 if (arg == NULL) {
3921 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003922 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003923 return 1;
3924 }
Brett Cannona5711202016-09-06 19:36:01 -07003925
3926 is_buffer = PyObject_CheckBuffer(arg);
3927 if (!is_buffer) {
3928 path = PyOS_FSPath(arg);
3929 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003930 return 0;
3931 }
Brett Cannona5711202016-09-06 19:36:01 -07003932 }
3933 else {
3934 path = arg;
3935 Py_INCREF(arg);
3936 }
3937
3938 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003939 output = path;
3940 }
3941 else if (PyBytes_Check(path) || is_buffer) {
3942 PyObject *path_bytes = NULL;
3943
3944 if (!PyBytes_Check(path) &&
3945 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003946 "path should be string, bytes, or os.PathLike, not %.200s",
3947 Py_TYPE(arg)->tp_name)) {
3948 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003949 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003950 }
3951 path_bytes = PyBytes_FromObject(path);
3952 Py_DECREF(path);
3953 if (!path_bytes) {
3954 return 0;
3955 }
3956 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3957 PyBytes_GET_SIZE(path_bytes));
3958 Py_DECREF(path_bytes);
3959 if (!output) {
3960 return 0;
3961 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003962 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003963 else {
3964 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003965 "path should be string, bytes, or os.PathLike, not %.200s",
3966 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003967 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003968 return 0;
3969 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003970 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003971 Py_DECREF(output);
3972 return 0;
3973 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003975 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003976 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003977 Py_DECREF(output);
3978 return 0;
3979 }
3980 *(PyObject**)addr = output;
3981 return Py_CLEANUP_SUPPORTED;
3982}
3983
3984
Inada Naoki02a4d572020-02-27 13:48:59 +09003985static int unicode_fill_utf8(PyObject *unicode);
3986
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003987const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003988PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003989{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003990 if (!PyUnicode_Check(unicode)) {
3991 PyErr_BadArgument();
3992 return NULL;
3993 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003994 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003995 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003997 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09003998 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999 return NULL;
4000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 }
4002
4003 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004004 *psize = PyUnicode_UTF8_LENGTH(unicode);
4005 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004006}
4007
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004008const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4012}
4013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014Py_UNICODE *
4015PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4016{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004017 if (!PyUnicode_Check(unicode)) {
4018 PyErr_BadArgument();
4019 return NULL;
4020 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004021 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4022 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004024 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004025 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004026
Serhiy Storchakac46db922018-10-23 22:58:24 +03004027 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4028 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4029 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004031 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004032 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4033 if (w == NULL) {
4034 PyErr_NoMemory();
4035 return NULL;
4036 }
4037 unicode_copy_as_widechar(unicode, w, wlen + 1);
4038 _PyUnicode_WSTR(unicode) = w;
4039 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4040 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041 }
4042 }
4043 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004044 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004045 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004046}
4047
Alexander Belopolsky40018472011-02-26 01:02:56 +00004048Py_UNICODE *
4049PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052}
4053
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004054const Py_UNICODE *
4055_PyUnicode_AsUnicode(PyObject *unicode)
4056{
4057 Py_ssize_t size;
4058 const Py_UNICODE *wstr;
4059
4060 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4061 if (wstr && wcslen(wstr) != (size_t)size) {
4062 PyErr_SetString(PyExc_ValueError, "embedded null character");
4063 return NULL;
4064 }
4065 return wstr;
4066}
4067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068
Alexander Belopolsky40018472011-02-26 01:02:56 +00004069Py_ssize_t
4070PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071{
4072 if (!PyUnicode_Check(unicode)) {
4073 PyErr_BadArgument();
4074 goto onError;
4075 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004076 if (_PyUnicode_WSTR(unicode) == NULL) {
4077 if (PyUnicode_AsUnicode(unicode) == NULL)
4078 goto onError;
4079 }
4080 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081
Benjamin Peterson29060642009-01-31 22:14:21 +00004082 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083 return -1;
4084}
4085
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004086Py_ssize_t
4087PyUnicode_GetLength(PyObject *unicode)
4088{
Victor Stinner07621332012-06-16 04:53:46 +02004089 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004090 PyErr_BadArgument();
4091 return -1;
4092 }
Victor Stinner07621332012-06-16 04:53:46 +02004093 if (PyUnicode_READY(unicode) == -1)
4094 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004095 return PyUnicode_GET_LENGTH(unicode);
4096}
4097
4098Py_UCS4
4099PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4100{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004101 void *data;
4102 int kind;
4103
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004104 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004105 PyErr_BadArgument();
4106 return (Py_UCS4)-1;
4107 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004108 if (PyUnicode_READY(unicode) == -1) {
4109 return (Py_UCS4)-1;
4110 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004111 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004112 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004113 return (Py_UCS4)-1;
4114 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004115 data = PyUnicode_DATA(unicode);
4116 kind = PyUnicode_KIND(unicode);
4117 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118}
4119
4120int
4121PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4122{
4123 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004124 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004125 return -1;
4126 }
Victor Stinner488fa492011-12-12 00:01:39 +01004127 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004128 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004129 PyErr_SetString(PyExc_IndexError, "string index out of range");
4130 return -1;
4131 }
Victor Stinner488fa492011-12-12 00:01:39 +01004132 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004133 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004134 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4135 PyErr_SetString(PyExc_ValueError, "character out of range");
4136 return -1;
4137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004138 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4139 index, ch);
4140 return 0;
4141}
4142
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4148
Victor Stinner554f3f02010-06-16 23:33:54 +00004149/* create or adjust a UnicodeDecodeError */
4150static void
4151make_decode_exception(PyObject **exceptionObject,
4152 const char *encoding,
4153 const char *input, Py_ssize_t length,
4154 Py_ssize_t startpos, Py_ssize_t endpos,
4155 const char *reason)
4156{
4157 if (*exceptionObject == NULL) {
4158 *exceptionObject = PyUnicodeDecodeError_Create(
4159 encoding, input, length, startpos, endpos, reason);
4160 }
4161 else {
4162 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4163 goto onError;
4164 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4165 goto onError;
4166 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4167 goto onError;
4168 }
4169 return;
4170
4171onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004172 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004173}
4174
Steve Dowercc16be82016-09-08 10:35:16 -07004175#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004176static int
4177widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4178{
4179 if (newsize > *size) {
4180 wchar_t *newbuf = *buf;
4181 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4182 PyErr_NoMemory();
4183 return -1;
4184 }
4185 *buf = newbuf;
4186 }
4187 *size = newsize;
4188 return 0;
4189}
4190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191/* error handling callback helper:
4192 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004193 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 and adjust various state variables.
4195 return 0 on success, -1 on error
4196*/
4197
Alexander Belopolsky40018472011-02-26 01:02:56 +00004198static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004199unicode_decode_call_errorhandler_wchar(
4200 const char *errors, PyObject **errorHandler,
4201 const char *encoding, const char *reason,
4202 const char **input, const char **inend, Py_ssize_t *startinpos,
4203 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004204 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004206 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207
4208 PyObject *restuple = NULL;
4209 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004210 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004211 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004212 Py_ssize_t requiredsize;
4213 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004214 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004215 wchar_t *repwstr;
4216 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217
4218 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004219 *errorHandler = PyCodec_LookupError(errors);
4220 if (*errorHandler == NULL)
4221 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004222 }
4223
Victor Stinner554f3f02010-06-16 23:33:54 +00004224 make_decode_exception(exceptionObject,
4225 encoding,
4226 *input, *inend - *input,
4227 *startinpos, *endinpos,
4228 reason);
4229 if (*exceptionObject == NULL)
4230 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231
Petr Viktorinffd97532020-02-11 17:46:57 +01004232 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004234 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004236 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004237 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004239 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004241
4242 /* Copy back the bytes variables, which might have been modified by the
4243 callback */
4244 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4245 if (!inputobj)
4246 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004247 *input = PyBytes_AS_STRING(inputobj);
4248 insize = PyBytes_GET_SIZE(inputobj);
4249 *inend = *input + insize;
4250 /* we can DECREF safely, as the exception has another reference,
4251 so the object won't go away. */
4252 Py_DECREF(inputobj);
4253
4254 if (newpos<0)
4255 newpos = insize+newpos;
4256 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004257 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004258 goto onError;
4259 }
4260
4261 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4262 if (repwstr == NULL)
4263 goto onError;
4264 /* need more space? (at least enough for what we
4265 have+the replacement+the rest of the string (starting
4266 at the new input position), so we won't have to check space
4267 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004268 requiredsize = *outpos;
4269 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4270 goto overflow;
4271 requiredsize += repwlen;
4272 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4273 goto overflow;
4274 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004275 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004276 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004277 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004278 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004279 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004280 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004281 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004282 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004283 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004284 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004285 *endinpos = newpos;
4286 *inptr = *input + newpos;
4287
4288 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004289 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004290 return 0;
4291
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004292 overflow:
4293 PyErr_SetString(PyExc_OverflowError,
4294 "decoded result is too long for a Python string");
4295
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004296 onError:
4297 Py_XDECREF(restuple);
4298 return -1;
4299}
Steve Dowercc16be82016-09-08 10:35:16 -07004300#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004301
4302static int
4303unicode_decode_call_errorhandler_writer(
4304 const char *errors, PyObject **errorHandler,
4305 const char *encoding, const char *reason,
4306 const char **input, const char **inend, Py_ssize_t *startinpos,
4307 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4308 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4309{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004310 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004311
4312 PyObject *restuple = NULL;
4313 PyObject *repunicode = NULL;
4314 Py_ssize_t insize;
4315 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004316 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004317 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004318 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004319 int need_to_grow = 0;
4320 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004321
4322 if (*errorHandler == NULL) {
4323 *errorHandler = PyCodec_LookupError(errors);
4324 if (*errorHandler == NULL)
4325 goto onError;
4326 }
4327
4328 make_decode_exception(exceptionObject,
4329 encoding,
4330 *input, *inend - *input,
4331 *startinpos, *endinpos,
4332 reason);
4333 if (*exceptionObject == NULL)
4334 goto onError;
4335
Petr Viktorinffd97532020-02-11 17:46:57 +01004336 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004337 if (restuple == NULL)
4338 goto onError;
4339 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004340 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004341 goto onError;
4342 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004343 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004344 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004345
4346 /* Copy back the bytes variables, which might have been modified by the
4347 callback */
4348 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4349 if (!inputobj)
4350 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004351 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004352 *input = PyBytes_AS_STRING(inputobj);
4353 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004354 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004355 /* we can DECREF safely, as the exception has another reference,
4356 so the object won't go away. */
4357 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004358
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004359 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004360 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004361 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004362 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004364 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004365
Victor Stinner170ca6f2013-04-18 00:25:28 +02004366 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004367 if (replen > 1) {
4368 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004369 need_to_grow = 1;
4370 }
4371 new_inptr = *input + newpos;
4372 if (*inend - new_inptr > remain) {
4373 /* We don't know the decoding algorithm here so we make the worst
4374 assumption that one byte decodes to one unicode character.
4375 If unfortunately one byte could decode to more unicode characters,
4376 the decoder may write out-of-bound then. Is it possible for the
4377 algorithms using this function? */
4378 writer->min_length += *inend - new_inptr - remain;
4379 need_to_grow = 1;
4380 }
4381 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004382 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004383 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004384 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4385 goto onError;
4386 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004387 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004388 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004389
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004390 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004391 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004394 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004395 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004399 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400}
4401
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004402/* --- UTF-7 Codec -------------------------------------------------------- */
4403
Antoine Pitrou244651a2009-05-04 18:56:13 +00004404/* See RFC2152 for details. We encode conservatively and decode liberally. */
4405
/* Base-64 helper macros.  Note that each of them may evaluate its
   argument more than once, so arguments must be free of side effects. */

/* True if c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z'))

/* Map a base-64 character to its 6-bit value.  The caller must have
   checked IS_BASE64(c); any other input yields 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    (*("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" + ((n) & 0x3f)))

/* DECODE_DIRECT: true if this byte, found outside a shift sequence in a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which introduces a base64 section. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
4436
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times and must have no side effects.  The
 * directO / directWS arguments are parenthesized in the expansion so
 * that arbitrary expressions (e.g. "a || b") bind as the caller intends.
 * The range check on c runs first, so the table is never indexed out of
 * bounds; NUL is never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004482
Alexander Belopolsky40018472011-02-26 01:02:56 +00004483PyObject *
4484PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004485 Py_ssize_t size,
4486 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004487{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004488 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4489}
4490
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491/* The decoder. The only state we preserve is our read position,
4492 * i.e. how many characters we have consumed. So if we end in the
4493 * middle of a shift sequence we have to back off the read position
4494 * and the output to the beginning of the sequence, otherwise we lose
4495 * all the shift state (seen bits, number of bits seen, high
4496 * surrogate). */
4497
Alexander Belopolsky40018472011-02-26 01:02:56 +00004498PyObject *
4499PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004500 Py_ssize_t size,
4501 const char *errors,
4502 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004503{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004505 Py_ssize_t startinpos;
4506 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004507 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004508 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004509 const char *errmsg = "";
4510 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004511 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004512 unsigned int base64bits = 0;
4513 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004514 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 PyObject *errorHandler = NULL;
4516 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004517
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004518 if (size == 0) {
4519 if (consumed)
4520 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004521 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004522 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004523
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004524 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004525 _PyUnicodeWriter_Init(&writer);
4526 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004527
4528 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004529 e = s + size;
4530
4531 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004532 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004534 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 if (inShift) { /* in a base-64 section */
4537 if (IS_BASE64(ch)) { /* consume a base-64 character */
4538 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4539 base64bits += 6;
4540 s++;
4541 if (base64bits >= 16) {
4542 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004543 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004544 base64bits -= 16;
4545 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004546 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004547 if (surrogate) {
4548 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004549 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4550 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004551 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004552 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004554 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004555 }
4556 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004557 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004558 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560 }
4561 }
Victor Stinner551ac952011-11-29 22:58:13 +01004562 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004563 /* first surrogate */
4564 surrogate = outCh;
4565 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004567 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004568 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 }
4570 }
4571 }
4572 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004573 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 if (base64bits > 0) { /* left-over bits */
4575 if (base64bits >= 6) {
4576 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004577 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 errmsg = "partial character in shift sequence";
4579 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004580 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 else {
4582 /* Some bits remain; they should be zero */
4583 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004584 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 errmsg = "non-zero padding bits in shift sequence";
4586 goto utf7Error;
4587 }
4588 }
4589 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004590 if (surrogate && DECODE_DIRECT(ch)) {
4591 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4592 goto onError;
4593 }
4594 surrogate = 0;
4595 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596 /* '-' is absorbed; other terminating
4597 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004598 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004599 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004600 }
4601 }
4602 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004604 s++; /* consume '+' */
4605 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004606 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004607 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004608 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004610 else if (s < e && !IS_BASE64(*s)) {
4611 s++;
4612 errmsg = "ill-formed sequence";
4613 goto utf7Error;
4614 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004616 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004617 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004618 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004620 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004621 }
4622 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004623 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004624 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004625 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004626 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004627 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 else {
4629 startinpos = s-starts;
4630 s++;
4631 errmsg = "unexpected special character";
4632 goto utf7Error;
4633 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004634 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004635utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004637 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004638 errors, &errorHandler,
4639 "utf7", errmsg,
4640 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004641 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004643 }
4644
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 /* end of string */
4646
4647 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4648 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004649 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004650 if (surrogate ||
4651 (base64bits >= 6) ||
4652 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004653 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004654 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004655 errors, &errorHandler,
4656 "utf7", "unterminated shift sequence",
4657 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004658 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004659 goto onError;
4660 if (s < e)
4661 goto restart;
4662 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004663 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664
4665 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004666 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004667 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004668 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004669 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004670 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004671 writer.kind, writer.data, shiftOutStart);
4672 Py_XDECREF(errorHandler);
4673 Py_XDECREF(exc);
4674 _PyUnicodeWriter_Dealloc(&writer);
4675 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004676 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004677 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 }
4679 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004680 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004681 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004682 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004683
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004684 Py_XDECREF(errorHandler);
4685 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004686 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004687
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 Py_XDECREF(errorHandler);
4690 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004691 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004692 return NULL;
4693}
4694
4695
Alexander Belopolsky40018472011-02-26 01:02:56 +00004696PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004697_PyUnicode_EncodeUTF7(PyObject *str,
4698 int base64SetO,
4699 int base64WhiteSpace,
4700 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004701{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004702 int kind;
4703 void *data;
4704 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004705 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004706 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004707 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708 unsigned int base64bits = 0;
4709 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004710 char * out;
4711 char * start;
4712
Benjamin Petersonbac79492012-01-14 13:34:47 -05004713 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004714 return NULL;
4715 kind = PyUnicode_KIND(str);
4716 data = PyUnicode_DATA(str);
4717 len = PyUnicode_GET_LENGTH(str);
4718
4719 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004721
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004722 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004723 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004724 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004725 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004726 if (v == NULL)
4727 return NULL;
4728
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004729 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004730 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004731 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004732
Antoine Pitrou244651a2009-05-04 18:56:13 +00004733 if (inShift) {
4734 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4735 /* shifting out */
4736 if (base64bits) { /* output remaining bits */
4737 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4738 base64buffer = 0;
4739 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004740 }
4741 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004742 /* Characters not in the BASE64 set implicitly unshift the sequence
4743 so no '-' is required, except if the character is itself a '-' */
4744 if (IS_BASE64(ch) || ch == '-') {
4745 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004746 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004747 *out++ = (char) ch;
4748 }
4749 else {
4750 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004751 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004752 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004753 else { /* not in a shift sequence */
4754 if (ch == '+') {
4755 *out++ = '+';
4756 *out++ = '-';
4757 }
4758 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4759 *out++ = (char) ch;
4760 }
4761 else {
4762 *out++ = '+';
4763 inShift = 1;
4764 goto encode_char;
4765 }
4766 }
4767 continue;
4768encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004769 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004770 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004771
Antoine Pitrou244651a2009-05-04 18:56:13 +00004772 /* code first surrogate */
4773 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004774 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004775 while (base64bits >= 6) {
4776 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4777 base64bits -= 6;
4778 }
4779 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004780 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004782 base64bits += 16;
4783 base64buffer = (base64buffer << 16) | ch;
4784 while (base64bits >= 6) {
4785 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4786 base64bits -= 6;
4787 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004788 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004789 if (base64bits)
4790 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4791 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004792 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004793 if (_PyBytes_Resize(&v, out - start) < 0)
4794 return NULL;
4795 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004796}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004797PyObject *
4798PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4799 Py_ssize_t size,
4800 int base64SetO,
4801 int base64WhiteSpace,
4802 const char *errors)
4803{
4804 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004805 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004806 if (tmp == NULL)
4807 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004808 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004809 base64WhiteSpace, errors);
4810 Py_DECREF(tmp);
4811 return result;
4812}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004813
Antoine Pitrou244651a2009-05-04 18:56:13 +00004814#undef IS_BASE64
4815#undef FROM_BASE64
4816#undef TO_BASE64
4817#undef DECODE_DIRECT
4818#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004819
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820/* --- UTF-8 Codec -------------------------------------------------------- */
4821
Alexander Belopolsky40018472011-02-26 01:02:56 +00004822PyObject *
4823PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004824 Py_ssize_t size,
4825 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826{
Walter Dörwald69652032004-09-07 20:24:22 +00004827 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4828}
4829
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004830#include "stringlib/asciilib.h"
4831#include "stringlib/codecs.h"
4832#include "stringlib/undef.h"
4833
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004834#include "stringlib/ucs1lib.h"
4835#include "stringlib/codecs.h"
4836#include "stringlib/undef.h"
4837
4838#include "stringlib/ucs2lib.h"
4839#include "stringlib/codecs.h"
4840#include "stringlib/undef.h"
4841
4842#include "stringlib/ucs4lib.h"
4843#include "stringlib/codecs.h"
4844#include "stringlib/undef.h"
4845
Antoine Pitrouab868312009-01-10 15:40:25 +00004846/* Mask to quickly check whether a C 'long' contains a
4847 non-ASCII, UTF8-encoded char. */
4848#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004849# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004850#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004851# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004852#else
4853# error C 'long' size should be either 4 or 8!
4854#endif
4855
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004856static Py_ssize_t
4857ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004858{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004859 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004860 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004861
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004862 /*
4863 * Issue #17237: m68k is a bit different from most architectures in
4864 * that objects do not use "natural alignment" - for example, int and
4865 * long are only aligned at 2-byte boundaries. Therefore the assert()
4866 * won't work; also, tests have shown that skipping the "optimised
4867 * version" will even speed up m68k.
4868 */
4869#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004870#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004871 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4872 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004873 /* Fast path, see in STRINGLIB(utf8_decode) for
4874 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004875 /* Help allocation */
4876 const char *_p = p;
4877 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004878 while (_p < aligned_end) {
4879 unsigned long value = *(const unsigned long *) _p;
4880 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004881 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882 *((unsigned long *)q) = value;
4883 _p += SIZEOF_LONG;
4884 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004885 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004886 p = _p;
4887 while (p < end) {
4888 if ((unsigned char)*p & 0x80)
4889 break;
4890 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004892 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004894#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004895#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004896 while (p < end) {
4897 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4898 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004899 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004900 /* Help allocation */
4901 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004903 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004904 if (value & ASCII_CHAR_MASK)
4905 break;
4906 _p += SIZEOF_LONG;
4907 }
4908 p = _p;
4909 if (_p == end)
4910 break;
4911 }
4912 if ((unsigned char)*p & 0x80)
4913 break;
4914 ++p;
4915 }
4916 memcpy(dest, start, p - start);
4917 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918}
Antoine Pitrouab868312009-01-10 15:40:25 +00004919
Victor Stinner709d23d2019-05-02 14:56:30 -04004920static PyObject *
4921unicode_decode_utf8(const char *s, Py_ssize_t size,
4922 _Py_error_handler error_handler, const char *errors,
4923 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004924{
Victor Stinner785938e2011-12-11 20:09:03 +01004925 if (size == 0) {
4926 if (consumed)
4927 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004928 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004929 }
4930
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004931 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4932 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004933 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004934 *consumed = 1;
4935 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004936 }
4937
Inada Naoki770847a2019-06-24 12:30:24 +09004938 const char *starts = s;
4939 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004940
Inada Naoki770847a2019-06-24 12:30:24 +09004941 // fast path: try ASCII string.
4942 PyObject *u = PyUnicode_New(size, 127);
4943 if (u == NULL) {
4944 return NULL;
4945 }
4946 s += ascii_decode(s, end, PyUnicode_DATA(u));
4947 if (s == end) {
4948 return u;
4949 }
4950
4951 // Use _PyUnicodeWriter after fast path is failed.
4952 _PyUnicodeWriter writer;
4953 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4954 writer.pos = s - starts;
4955
4956 Py_ssize_t startinpos, endinpos;
4957 const char *errmsg = "";
4958 PyObject *error_handler_obj = NULL;
4959 PyObject *exc = NULL;
4960
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961 while (s < end) {
4962 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004963 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004964
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004965 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004966 if (PyUnicode_IS_ASCII(writer.buffer))
4967 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004968 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004969 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004970 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004971 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004972 } else {
4973 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004974 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 }
4976
4977 switch (ch) {
4978 case 0:
4979 if (s == end || consumed)
4980 goto End;
4981 errmsg = "unexpected end of data";
4982 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004983 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984 break;
4985 case 1:
4986 errmsg = "invalid start byte";
4987 startinpos = s - starts;
4988 endinpos = startinpos + 1;
4989 break;
4990 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03004991 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
4992 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
4993 {
4994 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02004995 goto End;
4996 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03004997 /* fall through */
4998 case 3:
4999 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 errmsg = "invalid continuation byte";
5001 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005002 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 break;
5004 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005005 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005006 goto onError;
5007 continue;
5008 }
5009
Victor Stinner1d65d912015-10-05 13:43:50 +02005010 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005011 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005012
5013 switch (error_handler) {
5014 case _Py_ERROR_IGNORE:
5015 s += (endinpos - startinpos);
5016 break;
5017
5018 case _Py_ERROR_REPLACE:
5019 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5020 goto onError;
5021 s += (endinpos - startinpos);
5022 break;
5023
5024 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005025 {
5026 Py_ssize_t i;
5027
Victor Stinner1d65d912015-10-05 13:43:50 +02005028 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5029 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005030 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005031 ch = (Py_UCS4)(unsigned char)(starts[i]);
5032 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5033 ch + 0xdc00);
5034 writer.pos++;
5035 }
5036 s += (endinpos - startinpos);
5037 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005038 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005039
5040 default:
5041 if (unicode_decode_call_errorhandler_writer(
5042 errors, &error_handler_obj,
5043 "utf-8", errmsg,
5044 &starts, &end, &startinpos, &endinpos, &exc, &s,
5045 &writer))
5046 goto onError;
5047 }
Victor Stinner785938e2011-12-11 20:09:03 +01005048 }
5049
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005050End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005051 if (consumed)
5052 *consumed = s - starts;
5053
Victor Stinner1d65d912015-10-05 13:43:50 +02005054 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005055 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005056 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005057
5058onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005059 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005060 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005061 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005062 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005063}
5064
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005065
Victor Stinner709d23d2019-05-02 14:56:30 -04005066PyObject *
5067PyUnicode_DecodeUTF8Stateful(const char *s,
5068 Py_ssize_t size,
5069 const char *errors,
5070 Py_ssize_t *consumed)
5071{
5072 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5073}
5074
5075
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005076/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5077 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005078
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005079 On success, write a pointer to a newly allocated wide character string into
5080 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5081 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005082
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005083 On memory allocation failure, return -1.
5084
5085 On decoding error (if surrogateescape is zero), return -2. If wlen is
5086 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5087 is not NULL, write the decoding error message into *reason. */
5088int
5089_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005090 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005091{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005092 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005093 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005094 wchar_t *unicode;
5095 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005096
Victor Stinner3d4226a2018-08-29 22:21:32 +02005097 int surrogateescape = 0;
5098 int surrogatepass = 0;
5099 switch (errors)
5100 {
5101 case _Py_ERROR_STRICT:
5102 break;
5103 case _Py_ERROR_SURROGATEESCAPE:
5104 surrogateescape = 1;
5105 break;
5106 case _Py_ERROR_SURROGATEPASS:
5107 surrogatepass = 1;
5108 break;
5109 default:
5110 return -3;
5111 }
5112
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005113 /* Note: size will always be longer than the resulting Unicode
5114 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005115 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005116 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005117 }
5118
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005119 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005120 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005121 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005122 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005123
5124 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005125 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005126 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005127 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005128 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005129#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005130 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005131#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005132 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005133#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005134 if (ch > 0xFF) {
5135#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005136 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005137#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005138 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005139 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005140 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5141 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5142#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005143 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005144 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005145 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005146 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005147 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005148
5149 if (surrogateescape) {
5150 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5151 }
5152 else {
5153 /* Is it a valid three-byte code? */
5154 if (surrogatepass
5155 && (e - s) >= 3
5156 && (s[0] & 0xf0) == 0xe0
5157 && (s[1] & 0xc0) == 0x80
5158 && (s[2] & 0xc0) == 0x80)
5159 {
5160 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5161 s += 3;
5162 unicode[outpos++] = ch;
5163 }
5164 else {
5165 PyMem_RawFree(unicode );
5166 if (reason != NULL) {
5167 switch (ch) {
5168 case 0:
5169 *reason = "unexpected end of data";
5170 break;
5171 case 1:
5172 *reason = "invalid start byte";
5173 break;
5174 /* 2, 3, 4 */
5175 default:
5176 *reason = "invalid continuation byte";
5177 break;
5178 }
5179 }
5180 if (wlen != NULL) {
5181 *wlen = s - orig_s;
5182 }
5183 return -2;
5184 }
5185 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005186 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005187 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005188 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005189 if (wlen) {
5190 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005191 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005192 *wstr = unicode;
5193 return 0;
5194}
5195
Victor Stinner5f9cf232019-03-19 01:46:25 +01005196
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005197wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005198_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5199 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005200{
5201 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005202 int res = _Py_DecodeUTF8Ex(arg, arglen,
5203 &wstr, wlen,
5204 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005205 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005206 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5207 assert(res != -3);
5208 if (wlen) {
5209 *wlen = (size_t)res;
5210 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005211 return NULL;
5212 }
5213 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005214}
5215
Antoine Pitrouab868312009-01-10 15:40:25 +00005216
Victor Stinnere47e6982017-12-21 15:45:16 +01005217/* UTF-8 encoder using the surrogateescape error handler .
5218
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005219 On success, return 0 and write the newly allocated character string (use
5220 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005221
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005222 On encoding failure, return -2 and write the position of the invalid
5223 surrogate character into *error_pos (if error_pos is set) and the decoding
5224 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005225
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005226 On memory allocation failure, return -1. */
5227int
5228_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005229 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005230{
5231 const Py_ssize_t max_char_size = 4;
5232 Py_ssize_t len = wcslen(text);
5233
5234 assert(len >= 0);
5235
Victor Stinner3d4226a2018-08-29 22:21:32 +02005236 int surrogateescape = 0;
5237 int surrogatepass = 0;
5238 switch (errors)
5239 {
5240 case _Py_ERROR_STRICT:
5241 break;
5242 case _Py_ERROR_SURROGATEESCAPE:
5243 surrogateescape = 1;
5244 break;
5245 case _Py_ERROR_SURROGATEPASS:
5246 surrogatepass = 1;
5247 break;
5248 default:
5249 return -3;
5250 }
5251
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005252 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5253 return -1;
5254 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005255 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005256 if (raw_malloc) {
5257 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005258 }
5259 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005260 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005261 }
5262 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005263 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005264 }
5265
5266 char *p = bytes;
5267 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005268 for (i = 0; i < len; ) {
5269 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005270 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005271 i++;
5272#if Py_UNICODE_SIZE == 2
5273 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5274 && i < len
5275 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5276 {
5277 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5278 i++;
5279 }
5280#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005281
5282 if (ch < 0x80) {
5283 /* Encode ASCII */
5284 *p++ = (char) ch;
5285
5286 }
5287 else if (ch < 0x0800) {
5288 /* Encode Latin-1 */
5289 *p++ = (char)(0xc0 | (ch >> 6));
5290 *p++ = (char)(0x80 | (ch & 0x3f));
5291 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005292 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005293 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005294 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005295 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005296 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005297 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005298 if (reason != NULL) {
5299 *reason = "encoding error";
5300 }
5301 if (raw_malloc) {
5302 PyMem_RawFree(bytes);
5303 }
5304 else {
5305 PyMem_Free(bytes);
5306 }
5307 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005308 }
5309 *p++ = (char)(ch & 0xff);
5310 }
5311 else if (ch < 0x10000) {
5312 *p++ = (char)(0xe0 | (ch >> 12));
5313 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5314 *p++ = (char)(0x80 | (ch & 0x3f));
5315 }
5316 else { /* ch >= 0x10000 */
5317 assert(ch <= MAX_UNICODE);
5318 /* Encode UCS4 Unicode ordinals */
5319 *p++ = (char)(0xf0 | (ch >> 18));
5320 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5321 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5322 *p++ = (char)(0x80 | (ch & 0x3f));
5323 }
5324 }
5325 *p++ = '\0';
5326
5327 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005328 char *bytes2;
5329 if (raw_malloc) {
5330 bytes2 = PyMem_RawRealloc(bytes, final_size);
5331 }
5332 else {
5333 bytes2 = PyMem_Realloc(bytes, final_size);
5334 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005335 if (bytes2 == NULL) {
5336 if (error_pos != NULL) {
5337 *error_pos = (size_t)-1;
5338 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005339 if (raw_malloc) {
5340 PyMem_RawFree(bytes);
5341 }
5342 else {
5343 PyMem_Free(bytes);
5344 }
5345 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005346 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005347 *str = bytes2;
5348 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005349}
5350
5351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005352/* Primary internal function which creates utf8 encoded bytes objects.
5353
5354 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005355 and allocate exactly as much space needed at the end. Else allocate the
5356 maximum possible needed (4 result bytes per Unicode character), and return
5357 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005358*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005359static PyObject *
5360unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5361 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005363 if (!PyUnicode_Check(unicode)) {
5364 PyErr_BadArgument();
5365 return NULL;
5366 }
5367
5368 if (PyUnicode_READY(unicode) == -1)
5369 return NULL;
5370
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005371 if (PyUnicode_UTF8(unicode))
5372 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5373 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005374
Inada Naoki02a4d572020-02-27 13:48:59 +09005375 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5376 void *data = PyUnicode_DATA(unicode);
5377 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5378
5379 _PyBytesWriter writer;
5380 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005381
Benjamin Petersonead6b532011-12-20 17:23:42 -06005382 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005383 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005384 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005385 case PyUnicode_1BYTE_KIND:
5386 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5387 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005388 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5389 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005390 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005391 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5392 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005393 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005394 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5395 break;
Tim Peters602f7402002-04-27 18:03:26 +00005396 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005397
5398 if (end == NULL) {
5399 _PyBytesWriter_Dealloc(&writer);
5400 return NULL;
5401 }
5402 return _PyBytesWriter_Finish(&writer, end);
5403}
5404
5405static int
5406unicode_fill_utf8(PyObject *unicode)
5407{
5408 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5409 assert(!PyUnicode_IS_ASCII(unicode));
5410
5411 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5412 void *data = PyUnicode_DATA(unicode);
5413 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5414
5415 _PyBytesWriter writer;
5416 char *end;
5417
5418 switch (kind) {
5419 default:
5420 Py_UNREACHABLE();
5421 case PyUnicode_1BYTE_KIND:
5422 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5423 _Py_ERROR_STRICT, NULL);
5424 break;
5425 case PyUnicode_2BYTE_KIND:
5426 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5427 _Py_ERROR_STRICT, NULL);
5428 break;
5429 case PyUnicode_4BYTE_KIND:
5430 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5431 _Py_ERROR_STRICT, NULL);
5432 break;
5433 }
5434 if (end == NULL) {
5435 _PyBytesWriter_Dealloc(&writer);
5436 return -1;
5437 }
5438
5439 char *start = writer.use_small_buffer ? writer.small_buffer :
5440 PyBytes_AS_STRING(writer.buffer);
5441 Py_ssize_t len = end - start;
5442
5443 char *cache = PyObject_MALLOC(len + 1);
5444 if (cache == NULL) {
5445 _PyBytesWriter_Dealloc(&writer);
5446 PyErr_NoMemory();
5447 return -1;
5448 }
5449 _PyUnicode_UTF8(unicode) = cache;
5450 _PyUnicode_UTF8_LENGTH(unicode) = len;
5451 memcpy(cache, start, len);
5452 cache[len] = '\0';
5453 _PyBytesWriter_Dealloc(&writer);
5454 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455}
5456
Alexander Belopolsky40018472011-02-26 01:02:56 +00005457PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005458_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5459{
5460 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5461}
5462
5463
5464PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005465PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5466 Py_ssize_t size,
5467 const char *errors)
5468{
5469 PyObject *v, *unicode;
5470
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005471 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005472 if (unicode == NULL)
5473 return NULL;
5474 v = _PyUnicode_AsUTF8String(unicode, errors);
5475 Py_DECREF(unicode);
5476 return v;
5477}
5478
5479PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005480PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005482 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483}
5484
Walter Dörwald41980ca2007-08-16 21:55:45 +00005485/* --- UTF-32 Codec ------------------------------------------------------- */
5486
5487PyObject *
5488PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 Py_ssize_t size,
5490 const char *errors,
5491 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005492{
5493 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5494}
5495
5496PyObject *
5497PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 Py_ssize_t size,
5499 const char *errors,
5500 int *byteorder,
5501 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005502{
5503 const char *starts = s;
5504 Py_ssize_t startinpos;
5505 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005506 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005507 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005508 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005509 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005510 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005511 PyObject *errorHandler = NULL;
5512 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005513
Andy Lestere6be9b52020-02-11 20:28:35 -06005514 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005515 e = q + size;
5516
5517 if (byteorder)
5518 bo = *byteorder;
5519
5520 /* Check for BOM marks (U+FEFF) in the input and adjust current
5521 byte order setting accordingly. In native mode, the leading BOM
5522 mark is skipped, in all other modes, it is copied to the output
5523 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005524 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005525 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005526 if (bom == 0x0000FEFF) {
5527 bo = -1;
5528 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005530 else if (bom == 0xFFFE0000) {
5531 bo = 1;
5532 q += 4;
5533 }
5534 if (byteorder)
5535 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005536 }
5537
Victor Stinnere64322e2012-10-30 23:12:47 +01005538 if (q == e) {
5539 if (consumed)
5540 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005541 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005542 }
5543
Victor Stinnere64322e2012-10-30 23:12:47 +01005544#ifdef WORDS_BIGENDIAN
5545 le = bo < 0;
5546#else
5547 le = bo <= 0;
5548#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005549 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005550
Victor Stinner8f674cc2013-04-17 23:02:17 +02005551 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005552 writer.min_length = (e - q + 3) / 4;
5553 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005554 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005555
Victor Stinnere64322e2012-10-30 23:12:47 +01005556 while (1) {
5557 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005558 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005559
Victor Stinnere64322e2012-10-30 23:12:47 +01005560 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005561 enum PyUnicode_Kind kind = writer.kind;
5562 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005563 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005564 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005565 if (le) {
5566 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005567 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005568 if (ch > maxch)
5569 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005570 if (kind != PyUnicode_1BYTE_KIND &&
5571 Py_UNICODE_IS_SURROGATE(ch))
5572 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005573 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005574 q += 4;
5575 } while (q <= last);
5576 }
5577 else {
5578 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005579 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005580 if (ch > maxch)
5581 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005582 if (kind != PyUnicode_1BYTE_KIND &&
5583 Py_UNICODE_IS_SURROGATE(ch))
5584 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005585 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005586 q += 4;
5587 } while (q <= last);
5588 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005589 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005590 }
5591
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005592 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005593 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005594 startinpos = ((const char *)q) - starts;
5595 endinpos = startinpos + 4;
5596 }
5597 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005598 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005600 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005602 startinpos = ((const char *)q) - starts;
5603 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005605 else {
5606 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005607 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005608 goto onError;
5609 q += 4;
5610 continue;
5611 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005612 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005613 startinpos = ((const char *)q) - starts;
5614 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005615 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005616
5617 /* The remaining input chars are ignored if the callback
5618 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005619 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005621 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005623 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005625 }
5626
Walter Dörwald41980ca2007-08-16 21:55:45 +00005627 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005629
Walter Dörwald41980ca2007-08-16 21:55:45 +00005630 Py_XDECREF(errorHandler);
5631 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005632 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005633
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005635 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005636 Py_XDECREF(errorHandler);
5637 Py_XDECREF(exc);
5638 return NULL;
5639}
5640
5641PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005642_PyUnicode_EncodeUTF32(PyObject *str,
5643 const char *errors,
5644 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005645{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005646 enum PyUnicode_Kind kind;
5647 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005648 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005649 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005650 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005651#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005652 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005653#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005654 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005655#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005656 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005657 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005658 PyObject *errorHandler = NULL;
5659 PyObject *exc = NULL;
5660 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005661
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005662 if (!PyUnicode_Check(str)) {
5663 PyErr_BadArgument();
5664 return NULL;
5665 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005666 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005667 return NULL;
5668 kind = PyUnicode_KIND(str);
5669 data = PyUnicode_DATA(str);
5670 len = PyUnicode_GET_LENGTH(str);
5671
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005672 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005673 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005674 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005675 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005676 if (v == NULL)
5677 return NULL;
5678
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005679 /* output buffer is 4-bytes aligned */
5680 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005681 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005682 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005683 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005684 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005685 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005686
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005687 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005688 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005689 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005690 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005691 else
5692 encoding = "utf-32";
5693
5694 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005695 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5696 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005697 }
5698
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005699 pos = 0;
5700 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005701 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005702
5703 if (kind == PyUnicode_2BYTE_KIND) {
5704 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5705 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005706 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005707 else {
5708 assert(kind == PyUnicode_4BYTE_KIND);
5709 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5710 &out, native_ordering);
5711 }
5712 if (pos == len)
5713 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005714
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005715 rep = unicode_encode_call_errorhandler(
5716 errors, &errorHandler,
5717 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005718 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005719 if (!rep)
5720 goto error;
5721
5722 if (PyBytes_Check(rep)) {
5723 repsize = PyBytes_GET_SIZE(rep);
5724 if (repsize & 3) {
5725 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005726 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005727 "surrogates not allowed");
5728 goto error;
5729 }
5730 moreunits = repsize / 4;
5731 }
5732 else {
5733 assert(PyUnicode_Check(rep));
5734 if (PyUnicode_READY(rep) < 0)
5735 goto error;
5736 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5737 if (!PyUnicode_IS_ASCII(rep)) {
5738 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005739 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005740 "surrogates not allowed");
5741 goto error;
5742 }
5743 }
5744
5745 /* four bytes are reserved for each surrogate */
5746 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005747 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005748 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005749 /* integer overflow */
5750 PyErr_NoMemory();
5751 goto error;
5752 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005753 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005754 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005755 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005756 }
5757
5758 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005759 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005760 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005761 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005762 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005763 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5764 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005765 }
5766
5767 Py_CLEAR(rep);
5768 }
5769
5770 /* Cut back to size actually needed. This is necessary for, for example,
5771 encoding of a string containing isolated surrogates and the 'ignore'
5772 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005773 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005774 if (nsize != PyBytes_GET_SIZE(v))
5775 _PyBytes_Resize(&v, nsize);
5776 Py_XDECREF(errorHandler);
5777 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005778 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005779 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005780 error:
5781 Py_XDECREF(rep);
5782 Py_XDECREF(errorHandler);
5783 Py_XDECREF(exc);
5784 Py_XDECREF(v);
5785 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005786}
5787
Alexander Belopolsky40018472011-02-26 01:02:56 +00005788PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005789PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5790 Py_ssize_t size,
5791 const char *errors,
5792 int byteorder)
5793{
5794 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005795 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005796 if (tmp == NULL)
5797 return NULL;
5798 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5799 Py_DECREF(tmp);
5800 return result;
5801}
5802
5803PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005804PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005805{
Victor Stinnerb960b342011-11-20 19:12:52 +01005806 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005807}
5808
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809/* --- UTF-16 Codec ------------------------------------------------------- */
5810
Tim Peters772747b2001-08-09 22:21:55 +00005811PyObject *
5812PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 Py_ssize_t size,
5814 const char *errors,
5815 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816{
Walter Dörwald69652032004-09-07 20:24:22 +00005817 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5818}
5819
5820PyObject *
5821PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 Py_ssize_t size,
5823 const char *errors,
5824 int *byteorder,
5825 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005826{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005827 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005828 Py_ssize_t startinpos;
5829 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005830 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005831 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005832 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005833 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005834 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005835 PyObject *errorHandler = NULL;
5836 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005837 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838
Andy Lestere6be9b52020-02-11 20:28:35 -06005839 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005840 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841
5842 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005843 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005845 /* Check for BOM marks (U+FEFF) in the input and adjust current
5846 byte order setting accordingly. In native mode, the leading BOM
5847 mark is skipped, in all other modes, it is copied to the output
5848 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005849 if (bo == 0 && size >= 2) {
5850 const Py_UCS4 bom = (q[1] << 8) | q[0];
5851 if (bom == 0xFEFF) {
5852 q += 2;
5853 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005855 else if (bom == 0xFFFE) {
5856 q += 2;
5857 bo = 1;
5858 }
5859 if (byteorder)
5860 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862
Antoine Pitrou63065d72012-05-15 23:48:04 +02005863 if (q == e) {
5864 if (consumed)
5865 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005866 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005867 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005868
Christian Heimes743e0cd2012-10-17 23:52:17 +02005869#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005870 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005871 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005872#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005873 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005874 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005875#endif
Tim Peters772747b2001-08-09 22:21:55 +00005876
Antoine Pitrou63065d72012-05-15 23:48:04 +02005877 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005878 character count normally. Error handler will take care of
5879 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005880 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005881 writer.min_length = (e - q + 1) / 2;
5882 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005883 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005884
Antoine Pitrou63065d72012-05-15 23:48:04 +02005885 while (1) {
5886 Py_UCS4 ch = 0;
5887 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005888 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005889 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005890 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005891 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005892 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005893 native_ordering);
5894 else
5895 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005896 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005897 native_ordering);
5898 } else if (kind == PyUnicode_2BYTE_KIND) {
5899 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005900 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005901 native_ordering);
5902 } else {
5903 assert(kind == PyUnicode_4BYTE_KIND);
5904 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005905 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005906 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005907 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909
Antoine Pitrou63065d72012-05-15 23:48:04 +02005910 switch (ch)
5911 {
5912 case 0:
5913 /* remaining byte at the end? (size should be even) */
5914 if (q == e || consumed)
5915 goto End;
5916 errmsg = "truncated data";
5917 startinpos = ((const char *)q) - starts;
5918 endinpos = ((const char *)e) - starts;
5919 break;
5920 /* The remaining input chars are ignored if the callback
5921 chooses to skip the input */
5922 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005923 q -= 2;
5924 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005925 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005926 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005927 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005928 endinpos = ((const char *)e) - starts;
5929 break;
5930 case 2:
5931 errmsg = "illegal encoding";
5932 startinpos = ((const char *)q) - 2 - starts;
5933 endinpos = startinpos + 2;
5934 break;
5935 case 3:
5936 errmsg = "illegal UTF-16 surrogate";
5937 startinpos = ((const char *)q) - 4 - starts;
5938 endinpos = startinpos + 2;
5939 break;
5940 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005941 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005942 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 continue;
5944 }
5945
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005946 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005947 errors,
5948 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005949 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005950 &starts,
5951 (const char **)&e,
5952 &startinpos,
5953 &endinpos,
5954 &exc,
5955 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005956 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 }
5959
Antoine Pitrou63065d72012-05-15 23:48:04 +02005960End:
Walter Dörwald69652032004-09-07 20:24:22 +00005961 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005963
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005964 Py_XDECREF(errorHandler);
5965 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005966 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005969 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005970 Py_XDECREF(errorHandler);
5971 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 return NULL;
5973}
5974
Tim Peters772747b2001-08-09 22:21:55 +00005975PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005976_PyUnicode_EncodeUTF16(PyObject *str,
5977 const char *errors,
5978 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005980 enum PyUnicode_Kind kind;
5981 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005982 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005983 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005984 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005985 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005986#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005987 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005988#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005989 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005990#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005991 const char *encoding;
5992 Py_ssize_t nsize, pos;
5993 PyObject *errorHandler = NULL;
5994 PyObject *exc = NULL;
5995 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005996
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005997 if (!PyUnicode_Check(str)) {
5998 PyErr_BadArgument();
5999 return NULL;
6000 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006001 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006002 return NULL;
6003 kind = PyUnicode_KIND(str);
6004 data = PyUnicode_DATA(str);
6005 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006006
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006007 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006008 if (kind == PyUnicode_4BYTE_KIND) {
6009 const Py_UCS4 *in = (const Py_UCS4 *)data;
6010 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006011 while (in < end) {
6012 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006013 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006014 }
6015 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006016 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006017 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006019 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006020 nsize = len + pairs + (byteorder == 0);
6021 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006022 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006026 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006027 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006028 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006029 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006030 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006031 }
6032 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006033 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006034 }
Tim Peters772747b2001-08-09 22:21:55 +00006035
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006036 if (kind == PyUnicode_1BYTE_KIND) {
6037 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6038 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006039 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006040
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006041 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006042 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006043 }
6044 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006045 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006046 }
6047 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006048 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006049 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006050
6051 pos = 0;
6052 while (pos < len) {
6053 Py_ssize_t repsize, moreunits;
6054
6055 if (kind == PyUnicode_2BYTE_KIND) {
6056 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6057 &out, native_ordering);
6058 }
6059 else {
6060 assert(kind == PyUnicode_4BYTE_KIND);
6061 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6062 &out, native_ordering);
6063 }
6064 if (pos == len)
6065 break;
6066
6067 rep = unicode_encode_call_errorhandler(
6068 errors, &errorHandler,
6069 encoding, "surrogates not allowed",
6070 str, &exc, pos, pos + 1, &pos);
6071 if (!rep)
6072 goto error;
6073
6074 if (PyBytes_Check(rep)) {
6075 repsize = PyBytes_GET_SIZE(rep);
6076 if (repsize & 1) {
6077 raise_encode_exception(&exc, encoding,
6078 str, pos - 1, pos,
6079 "surrogates not allowed");
6080 goto error;
6081 }
6082 moreunits = repsize / 2;
6083 }
6084 else {
6085 assert(PyUnicode_Check(rep));
6086 if (PyUnicode_READY(rep) < 0)
6087 goto error;
6088 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6089 if (!PyUnicode_IS_ASCII(rep)) {
6090 raise_encode_exception(&exc, encoding,
6091 str, pos - 1, pos,
6092 "surrogates not allowed");
6093 goto error;
6094 }
6095 }
6096
6097 /* two bytes are reserved for each surrogate */
6098 if (moreunits > 1) {
6099 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006100 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006101 /* integer overflow */
6102 PyErr_NoMemory();
6103 goto error;
6104 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006105 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006106 goto error;
6107 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6108 }
6109
6110 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006111 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006112 out += moreunits;
6113 } else /* rep is unicode */ {
6114 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6115 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6116 &out, native_ordering);
6117 }
6118
6119 Py_CLEAR(rep);
6120 }
6121
6122 /* Cut back to size actually needed. This is necessary for, for example,
6123 encoding of a string containing isolated surrogates and the 'ignore' handler
6124 is used. */
6125 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6126 if (nsize != PyBytes_GET_SIZE(v))
6127 _PyBytes_Resize(&v, nsize);
6128 Py_XDECREF(errorHandler);
6129 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006130 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006131 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006132 error:
6133 Py_XDECREF(rep);
6134 Py_XDECREF(errorHandler);
6135 Py_XDECREF(exc);
6136 Py_XDECREF(v);
6137 return NULL;
6138#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139}
6140
Alexander Belopolsky40018472011-02-26 01:02:56 +00006141PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6143 Py_ssize_t size,
6144 const char *errors,
6145 int byteorder)
6146{
6147 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006148 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006149 if (tmp == NULL)
6150 return NULL;
6151 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6152 Py_DECREF(tmp);
6153 return result;
6154}
6155
6156PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006157PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160}
6161
6162/* --- Unicode Escape Codec ----------------------------------------------- */
6163
Fredrik Lundh06d12682001-01-24 07:59:11 +00006164static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006165
Alexander Belopolsky40018472011-02-26 01:02:56 +00006166PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006167_PyUnicode_DecodeUnicodeEscape(const char *s,
6168 Py_ssize_t size,
6169 const char *errors,
6170 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006172 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006173 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006175 PyObject *errorHandler = NULL;
6176 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006177
Eric V. Smith42454af2016-10-31 09:22:08 -04006178 // so we can remember if we've seen an invalid escape char or not
6179 *first_invalid_escape = NULL;
6180
Victor Stinner62ec3312016-09-06 17:04:34 -07006181 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006182 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006183 }
6184 /* Escaped strings will always be longer than the resulting
6185 Unicode string, so we start with size here and then reduce the
6186 length after conversion to the true value.
6187 (but if the error callback returns a long replacement string
6188 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006189 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006190 writer.min_length = size;
6191 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6192 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006193 }
6194
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 end = s + size;
6196 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 unsigned char c = (unsigned char) *s++;
6198 Py_UCS4 ch;
6199 int count;
6200 Py_ssize_t startinpos;
6201 Py_ssize_t endinpos;
6202 const char *message;
6203
6204#define WRITE_ASCII_CHAR(ch) \
6205 do { \
6206 assert(ch <= 127); \
6207 assert(writer.pos < writer.size); \
6208 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6209 } while(0)
6210
6211#define WRITE_CHAR(ch) \
6212 do { \
6213 if (ch <= writer.maxchar) { \
6214 assert(writer.pos < writer.size); \
6215 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6216 } \
6217 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6218 goto onError; \
6219 } \
6220 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221
6222 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006223 if (c != '\\') {
6224 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 continue;
6226 }
6227
Victor Stinner62ec3312016-09-06 17:04:34 -07006228 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 if (s >= end) {
6231 message = "\\ at end of string";
6232 goto error;
6233 }
6234 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006235
Victor Stinner62ec3312016-09-06 17:04:34 -07006236 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006237 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006240 case '\n': continue;
6241 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6242 case '\'': WRITE_ASCII_CHAR('\''); continue;
6243 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6244 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006245 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006246 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6247 case 't': WRITE_ASCII_CHAR('\t'); continue;
6248 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6249 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006250 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006252 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006253 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256 case '0': case '1': case '2': case '3':
6257 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006258 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006259 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006260 ch = (ch<<3) + *s++ - '0';
6261 if (s < end && '0' <= *s && *s <= '7') {
6262 ch = (ch<<3) + *s++ - '0';
6263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 WRITE_CHAR(ch);
6266 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 /* hex escapes */
6269 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006271 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006272 message = "truncated \\xXX escape";
6273 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006277 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006278 message = "truncated \\uXXXX escape";
6279 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006282 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006283 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006284 message = "truncated \\UXXXXXXXX escape";
6285 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006286 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006287 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006288 ch <<= 4;
6289 if (c >= '0' && c <= '9') {
6290 ch += c - '0';
6291 }
6292 else if (c >= 'a' && c <= 'f') {
6293 ch += c - ('a' - 10);
6294 }
6295 else if (c >= 'A' && c <= 'F') {
6296 ch += c - ('A' - 10);
6297 }
6298 else {
6299 break;
6300 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006301 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006302 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006303 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006304 }
6305
6306 /* when we get here, ch is a 32-bit unicode character */
6307 if (ch > MAX_UNICODE) {
6308 message = "illegal Unicode character";
6309 goto error;
6310 }
6311
6312 WRITE_CHAR(ch);
6313 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006314
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006316 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006317 if (ucnhash_CAPI == NULL) {
6318 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006319 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6320 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006321 if (ucnhash_CAPI == NULL) {
6322 PyErr_SetString(
6323 PyExc_UnicodeError,
6324 "\\N escapes not supported (can't load unicodedata module)"
6325 );
6326 goto onError;
6327 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006328 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006329
6330 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006331 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 const char *start = ++s;
6333 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006334 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006336 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006337 namelen = s - start;
6338 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006339 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006340 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006341 ch = 0xffffffff; /* in case 'getcode' messes up */
6342 if (namelen <= INT_MAX &&
6343 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6344 &ch, 0)) {
6345 assert(ch <= MAX_UNICODE);
6346 WRITE_CHAR(ch);
6347 continue;
6348 }
6349 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006350 }
6351 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006352 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006353
6354 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006355 if (*first_invalid_escape == NULL) {
6356 *first_invalid_escape = s-1; /* Back up one char, since we've
6357 already incremented s. */
6358 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006359 WRITE_ASCII_CHAR('\\');
6360 WRITE_CHAR(c);
6361 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006363
6364 error:
6365 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006366 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006367 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006368 errors, &errorHandler,
6369 "unicodeescape", message,
6370 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006371 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006372 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006373 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006374 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006375
6376#undef WRITE_ASCII_CHAR
6377#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006379
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006380 Py_XDECREF(errorHandler);
6381 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006382 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006383
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006385 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006386 Py_XDECREF(errorHandler);
6387 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 return NULL;
6389}
6390
Eric V. Smith42454af2016-10-31 09:22:08 -04006391PyObject *
6392PyUnicode_DecodeUnicodeEscape(const char *s,
6393 Py_ssize_t size,
6394 const char *errors)
6395{
6396 const char *first_invalid_escape;
6397 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6398 &first_invalid_escape);
6399 if (result == NULL)
6400 return NULL;
6401 if (first_invalid_escape != NULL) {
6402 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6403 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006404 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006405 Py_DECREF(result);
6406 return NULL;
6407 }
6408 }
6409 return result;
6410}
6411
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006412/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413
Alexander Belopolsky40018472011-02-26 01:02:56 +00006414PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006415PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006417 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006418 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006420 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006421 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006422 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423
Ezio Melottie7f90372012-10-05 03:33:31 +03006424 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006425 escape.
6426
Ezio Melottie7f90372012-10-05 03:33:31 +03006427 For UCS1 strings it's '\xxx', 4 bytes per source character.
6428 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6429 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006430 */
6431
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006432 if (!PyUnicode_Check(unicode)) {
6433 PyErr_BadArgument();
6434 return NULL;
6435 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006436 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006437 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006438 }
Victor Stinner358af132015-10-12 22:36:57 +02006439
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006440 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006441 if (len == 0) {
6442 return PyBytes_FromStringAndSize(NULL, 0);
6443 }
6444
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006445 kind = PyUnicode_KIND(unicode);
6446 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006447 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6448 bytes, and 1 byte characters 4. */
6449 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006450 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006451 return PyErr_NoMemory();
6452 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006453 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006454 if (repr == NULL) {
6455 return NULL;
6456 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006457
Victor Stinner62ec3312016-09-06 17:04:34 -07006458 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006459 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006460 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006461
Victor Stinner62ec3312016-09-06 17:04:34 -07006462 /* U+0000-U+00ff range */
6463 if (ch < 0x100) {
6464 if (ch >= ' ' && ch < 127) {
6465 if (ch != '\\') {
6466 /* Copy printable US ASCII as-is */
6467 *p++ = (char) ch;
6468 }
6469 /* Escape backslashes */
6470 else {
6471 *p++ = '\\';
6472 *p++ = '\\';
6473 }
6474 }
Victor Stinner358af132015-10-12 22:36:57 +02006475
Victor Stinner62ec3312016-09-06 17:04:34 -07006476 /* Map special whitespace to '\t', \n', '\r' */
6477 else if (ch == '\t') {
6478 *p++ = '\\';
6479 *p++ = 't';
6480 }
6481 else if (ch == '\n') {
6482 *p++ = '\\';
6483 *p++ = 'n';
6484 }
6485 else if (ch == '\r') {
6486 *p++ = '\\';
6487 *p++ = 'r';
6488 }
6489
6490 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6491 else {
6492 *p++ = '\\';
6493 *p++ = 'x';
6494 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6495 *p++ = Py_hexdigits[ch & 0x000F];
6496 }
Tim Petersced69f82003-09-16 20:30:58 +00006497 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006498 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 *p++ = '\\';
6501 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006502 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6503 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6504 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6505 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006507 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6508 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006509
Victor Stinner62ec3312016-09-06 17:04:34 -07006510 /* Make sure that the first two digits are zero */
6511 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006512 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006513 *p++ = 'U';
6514 *p++ = '0';
6515 *p++ = '0';
6516 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6517 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6518 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6519 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6520 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6521 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006522 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524
Victor Stinner62ec3312016-09-06 17:04:34 -07006525 assert(p - PyBytes_AS_STRING(repr) > 0);
6526 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6527 return NULL;
6528 }
6529 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530}
6531
Alexander Belopolsky40018472011-02-26 01:02:56 +00006532PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006533PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6534 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006536 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006537 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006538 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006540 }
6541
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006542 result = PyUnicode_AsUnicodeEscapeString(tmp);
6543 Py_DECREF(tmp);
6544 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545}
6546
6547/* --- Raw Unicode Escape Codec ------------------------------------------- */
6548
Alexander Belopolsky40018472011-02-26 01:02:56 +00006549PyObject *
6550PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006551 Py_ssize_t size,
6552 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006554 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006555 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006557 PyObject *errorHandler = NULL;
6558 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006559
Victor Stinner62ec3312016-09-06 17:04:34 -07006560 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006561 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006562 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006563
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 /* Escaped strings will always be longer than the resulting
6565 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006566 length after conversion to the true value. (But decoding error
6567 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006568 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006569 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006570 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6571 goto onError;
6572 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006573
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 end = s + size;
6575 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006576 unsigned char c = (unsigned char) *s++;
6577 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006578 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006579 Py_ssize_t startinpos;
6580 Py_ssize_t endinpos;
6581 const char *message;
6582
6583#define WRITE_CHAR(ch) \
6584 do { \
6585 if (ch <= writer.maxchar) { \
6586 assert(writer.pos < writer.size); \
6587 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6588 } \
6589 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6590 goto onError; \
6591 } \
6592 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006595 if (c != '\\' || s >= end) {
6596 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006598 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006599
Victor Stinner62ec3312016-09-06 17:04:34 -07006600 c = (unsigned char) *s++;
6601 if (c == 'u') {
6602 count = 4;
6603 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006605 else if (c == 'U') {
6606 count = 8;
6607 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006608 }
6609 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006610 assert(writer.pos < writer.size);
6611 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6612 WRITE_CHAR(c);
6613 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006614 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006615 startinpos = s - starts - 2;
6616
6617 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6618 for (ch = 0; count && s < end; ++s, --count) {
6619 c = (unsigned char)*s;
6620 ch <<= 4;
6621 if (c >= '0' && c <= '9') {
6622 ch += c - '0';
6623 }
6624 else if (c >= 'a' && c <= 'f') {
6625 ch += c - ('a' - 10);
6626 }
6627 else if (c >= 'A' && c <= 'F') {
6628 ch += c - ('A' - 10);
6629 }
6630 else {
6631 break;
6632 }
6633 }
6634 if (!count) {
6635 if (ch <= MAX_UNICODE) {
6636 WRITE_CHAR(ch);
6637 continue;
6638 }
6639 message = "\\Uxxxxxxxx out of range";
6640 }
6641
6642 endinpos = s-starts;
6643 writer.min_length = end - s + writer.pos;
6644 if (unicode_decode_call_errorhandler_writer(
6645 errors, &errorHandler,
6646 "rawunicodeescape", message,
6647 &starts, &end, &startinpos, &endinpos, &exc, &s,
6648 &writer)) {
6649 goto onError;
6650 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006651 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006652
6653#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006655 Py_XDECREF(errorHandler);
6656 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006657 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006658
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006660 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661 Py_XDECREF(errorHandler);
6662 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006664
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665}
6666
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006667
Alexander Belopolsky40018472011-02-26 01:02:56 +00006668PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006669PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670{
Victor Stinner62ec3312016-09-06 17:04:34 -07006671 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006673 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006674 int kind;
6675 void *data;
6676 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006678 if (!PyUnicode_Check(unicode)) {
6679 PyErr_BadArgument();
6680 return NULL;
6681 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006682 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006683 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006684 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006685 kind = PyUnicode_KIND(unicode);
6686 data = PyUnicode_DATA(unicode);
6687 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006688 if (kind == PyUnicode_1BYTE_KIND) {
6689 return PyBytes_FromStringAndSize(data, len);
6690 }
Victor Stinner0e368262011-11-10 20:12:49 +01006691
Victor Stinner62ec3312016-09-06 17:04:34 -07006692 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6693 bytes, and 1 byte characters 4. */
6694 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006695
Victor Stinner62ec3312016-09-06 17:04:34 -07006696 if (len > PY_SSIZE_T_MAX / expandsize) {
6697 return PyErr_NoMemory();
6698 }
6699 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6700 if (repr == NULL) {
6701 return NULL;
6702 }
6703 if (len == 0) {
6704 return repr;
6705 }
6706
6707 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006708 for (pos = 0; pos < len; pos++) {
6709 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006710
Victor Stinner62ec3312016-09-06 17:04:34 -07006711 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6712 if (ch < 0x100) {
6713 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006714 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006715 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006716 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717 *p++ = '\\';
6718 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006719 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6720 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6721 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6722 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006724 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6725 else {
6726 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6727 *p++ = '\\';
6728 *p++ = 'U';
6729 *p++ = '0';
6730 *p++ = '0';
6731 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6732 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6733 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6734 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6735 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6736 *p++ = Py_hexdigits[ch & 15];
6737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006739
Victor Stinner62ec3312016-09-06 17:04:34 -07006740 assert(p > PyBytes_AS_STRING(repr));
6741 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6742 return NULL;
6743 }
6744 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745}
6746
Alexander Belopolsky40018472011-02-26 01:02:56 +00006747PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006748PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6749 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006751 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006752 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006753 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006754 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006755 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6756 Py_DECREF(tmp);
6757 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758}
6759
6760/* --- Latin-1 Codec ------------------------------------------------------ */
6761
Alexander Belopolsky40018472011-02-26 01:02:56 +00006762PyObject *
6763PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006764 Py_ssize_t size,
6765 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006768 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769}
6770
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006771/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006772static void
6773make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006774 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006775 PyObject *unicode,
6776 Py_ssize_t startpos, Py_ssize_t endpos,
6777 const char *reason)
6778{
6779 if (*exceptionObject == NULL) {
6780 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006781 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006782 encoding, unicode, startpos, endpos, reason);
6783 }
6784 else {
6785 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6786 goto onError;
6787 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6788 goto onError;
6789 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6790 goto onError;
6791 return;
6792 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006793 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006794 }
6795}
6796
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006797/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006798static void
6799raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006800 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006801 PyObject *unicode,
6802 Py_ssize_t startpos, Py_ssize_t endpos,
6803 const char *reason)
6804{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006805 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006806 encoding, unicode, startpos, endpos, reason);
6807 if (*exceptionObject != NULL)
6808 PyCodec_StrictErrors(*exceptionObject);
6809}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006810
6811/* error handling callback helper:
6812 build arguments, call the callback and check the arguments,
6813 put the result into newpos and return the replacement string, which
6814 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006815static PyObject *
6816unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006817 PyObject **errorHandler,
6818 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006819 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006820 Py_ssize_t startpos, Py_ssize_t endpos,
6821 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006823 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006824 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006825 PyObject *restuple;
6826 PyObject *resunicode;
6827
6828 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006830 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006832 }
6833
Benjamin Petersonbac79492012-01-14 13:34:47 -05006834 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006835 return NULL;
6836 len = PyUnicode_GET_LENGTH(unicode);
6837
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006838 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006839 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006840 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006842
Petr Viktorinffd97532020-02-11 17:46:57 +01006843 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006846 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006847 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 Py_DECREF(restuple);
6849 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006850 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006851 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006852 &resunicode, newpos)) {
6853 Py_DECREF(restuple);
6854 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006855 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006856 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6857 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6858 Py_DECREF(restuple);
6859 return NULL;
6860 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006861 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006862 *newpos = len + *newpos;
6863 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006864 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 Py_DECREF(restuple);
6866 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006867 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868 Py_INCREF(resunicode);
6869 Py_DECREF(restuple);
6870 return resunicode;
6871}
6872
Alexander Belopolsky40018472011-02-26 01:02:56 +00006873static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006874unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006875 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006876 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006877{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006878 /* input state */
6879 Py_ssize_t pos=0, size;
6880 int kind;
6881 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006882 /* pointer into the output */
6883 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006884 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6885 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006886 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006887 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006888 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006889 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006890 /* output object */
6891 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006892
Benjamin Petersonbac79492012-01-14 13:34:47 -05006893 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006894 return NULL;
6895 size = PyUnicode_GET_LENGTH(unicode);
6896 kind = PyUnicode_KIND(unicode);
6897 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006898 /* allocate enough for a simple encoding without
6899 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006900 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006901 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006902
6903 _PyBytesWriter_Init(&writer);
6904 str = _PyBytesWriter_Alloc(&writer, size);
6905 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006906 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006907
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006908 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006909 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006910
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006912 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006913 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006914 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006915 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006916 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006918 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006920 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006921 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006923
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006924 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006926
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006927 /* Only overallocate the buffer if it's not the last write */
6928 writer.overallocate = (collend < size);
6929
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006931 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006932 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006933
6934 switch (error_handler) {
6935 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006936 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006938
6939 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006940 memset(str, '?', collend - collstart);
6941 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006942 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006943 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006944 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 break;
Victor Stinner50149202015-09-22 00:26:54 +02006946
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006947 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006948 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006949 writer.min_size -= (collend - collstart);
6950 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006951 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006952 if (str == NULL)
6953 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006954 pos = collend;
6955 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006956
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006957 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006958 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006959 writer.min_size -= (collend - collstart);
6960 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006961 unicode, collstart, collend);
6962 if (str == NULL)
6963 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006964 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 break;
Victor Stinner50149202015-09-22 00:26:54 +02006966
Victor Stinnerc3713e92015-09-29 12:32:13 +02006967 case _Py_ERROR_SURROGATEESCAPE:
6968 for (i = collstart; i < collend; ++i) {
6969 ch = PyUnicode_READ(kind, data, i);
6970 if (ch < 0xdc80 || 0xdcff < ch) {
6971 /* Not a UTF-8b surrogate */
6972 break;
6973 }
6974 *str++ = (char)(ch - 0xdc00);
6975 ++pos;
6976 }
6977 if (i >= collend)
6978 break;
6979 collstart = pos;
6980 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006981 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006982
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006984 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6985 encoding, reason, unicode, &exc,
6986 collstart, collend, &newpos);
6987 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006989
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006990 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006991 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006992
Victor Stinner6bd525b2015-10-09 13:10:05 +02006993 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006994 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006995 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006996 PyBytes_AS_STRING(rep),
6997 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006998 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006999 else {
7000 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007001
Victor Stinner6bd525b2015-10-09 13:10:05 +02007002 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007004
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007005 if (limit == 256 ?
7006 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7007 !PyUnicode_IS_ASCII(rep))
7008 {
7009 /* Not all characters are smaller than limit */
7010 raise_encode_exception(&exc, encoding, unicode,
7011 collstart, collend, reason);
7012 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007014 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7015 str = _PyBytesWriter_WriteBytes(&writer, str,
7016 PyUnicode_DATA(rep),
7017 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007018 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007019 if (str == NULL)
7020 goto onError;
7021
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007022 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007023 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007024 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007025
7026 /* If overallocation was disabled, ensure that it was the last
7027 write. Otherwise, we missed an optimization */
7028 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007029 }
7030 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007031
Victor Stinner50149202015-09-22 00:26:54 +02007032 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007033 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007034 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007035
7036 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007037 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007038 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007039 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007040 Py_XDECREF(exc);
7041 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007042}
7043
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007044/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007045PyObject *
7046PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007047 Py_ssize_t size,
7048 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007050 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007051 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007052 if (unicode == NULL)
7053 return NULL;
7054 result = unicode_encode_ucs1(unicode, errors, 256);
7055 Py_DECREF(unicode);
7056 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057}
7058
Alexander Belopolsky40018472011-02-26 01:02:56 +00007059PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007060_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061{
7062 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 PyErr_BadArgument();
7064 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007066 if (PyUnicode_READY(unicode) == -1)
7067 return NULL;
7068 /* Fast path: if it is a one-byte string, construct
7069 bytes object directly. */
7070 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7071 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7072 PyUnicode_GET_LENGTH(unicode));
7073 /* Non-Latin-1 characters present. Defer to above function to
7074 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007075 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007076}
7077
7078PyObject*
7079PyUnicode_AsLatin1String(PyObject *unicode)
7080{
7081 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082}
7083
7084/* --- 7-bit ASCII Codec -------------------------------------------------- */
7085
Alexander Belopolsky40018472011-02-26 01:02:56 +00007086PyObject *
7087PyUnicode_DecodeASCII(const char *s,
7088 Py_ssize_t size,
7089 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007091 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007092 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007093 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007094 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007095 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007096
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007098 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007099
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007101 if (size == 1 && (unsigned char)s[0] < 128)
7102 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007103
Inada Naoki770847a2019-06-24 12:30:24 +09007104 // Shortcut for simple case
7105 PyObject *u = PyUnicode_New(size, 127);
7106 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007107 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007108 }
7109 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7110 if (outpos == size) {
7111 return u;
7112 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007113
Inada Naoki770847a2019-06-24 12:30:24 +09007114 _PyUnicodeWriter writer;
7115 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007116 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007117
Inada Naoki770847a2019-06-24 12:30:24 +09007118 s += outpos;
7119 int kind = writer.kind;
7120 void *data = writer.data;
7121 Py_ssize_t startinpos, endinpos;
7122
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007123 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007124 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007126 PyUnicode_WRITE(kind, data, writer.pos, c);
7127 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007129 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007131
7132 /* byte outsize range 0x00..0x7f: call the error handler */
7133
7134 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007135 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007136
7137 switch (error_handler)
7138 {
7139 case _Py_ERROR_REPLACE:
7140 case _Py_ERROR_SURROGATEESCAPE:
7141 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007142 but we may switch to UCS2 at the first write */
7143 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7144 goto onError;
7145 kind = writer.kind;
7146 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007147
7148 if (error_handler == _Py_ERROR_REPLACE)
7149 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7150 else
7151 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7152 writer.pos++;
7153 ++s;
7154 break;
7155
7156 case _Py_ERROR_IGNORE:
7157 ++s;
7158 break;
7159
7160 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007161 startinpos = s-starts;
7162 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007163 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007164 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007165 "ascii", "ordinal not in range(128)",
7166 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007167 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007169 kind = writer.kind;
7170 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007171 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007173 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007174 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007175 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007176
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007178 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007179 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007180 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181 return NULL;
7182}
7183
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007184/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007185PyObject *
7186PyUnicode_EncodeASCII(const Py_UNICODE *p,
7187 Py_ssize_t size,
7188 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007190 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007191 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007192 if (unicode == NULL)
7193 return NULL;
7194 result = unicode_encode_ucs1(unicode, errors, 128);
7195 Py_DECREF(unicode);
7196 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197}
7198
Alexander Belopolsky40018472011-02-26 01:02:56 +00007199PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007200_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201{
7202 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007203 PyErr_BadArgument();
7204 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007206 if (PyUnicode_READY(unicode) == -1)
7207 return NULL;
7208 /* Fast path: if it is an ASCII-only string, construct bytes object
7209 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007210 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007211 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7212 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007213 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007214}
7215
7216PyObject *
7217PyUnicode_AsASCIIString(PyObject *unicode)
7218{
7219 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220}
7221
Steve Dowercc16be82016-09-08 10:35:16 -07007222#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007223
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007224/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007225
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007226#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007227#define NEED_RETRY
7228#endif
7229
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7234#define DECODING_CHUNK_SIZE (INT_MAX/4)
7235
Victor Stinner3a50e702011-10-18 21:21:00 +02007236#ifndef WC_ERR_INVALID_CHARS
7237# define WC_ERR_INVALID_CHARS 0x0080
7238#endif
7239
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007240static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007241code_page_name(UINT code_page, PyObject **obj)
7242{
7243 *obj = NULL;
7244 if (code_page == CP_ACP)
7245 return "mbcs";
7246 if (code_page == CP_UTF7)
7247 return "CP_UTF7";
7248 if (code_page == CP_UTF8)
7249 return "CP_UTF8";
7250
7251 *obj = PyBytes_FromFormat("cp%u", code_page);
7252 if (*obj == NULL)
7253 return NULL;
7254 return PyBytes_AS_STRING(*obj);
7255}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007256
Victor Stinner3a50e702011-10-18 21:21:00 +02007257static DWORD
7258decode_code_page_flags(UINT code_page)
7259{
7260 if (code_page == CP_UTF7) {
7261 /* The CP_UTF7 decoder only supports flags=0 */
7262 return 0;
7263 }
7264 else
7265 return MB_ERR_INVALID_CHARS;
7266}
7267
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007268/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 * Decode a byte string from a Windows code page into unicode object in strict
7270 * mode.
7271 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007272 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7273 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007274 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007275static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007276decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007277 wchar_t **buf,
7278 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007279 const char *in,
7280 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007281{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007282 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007283 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007284 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007285
7286 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007287 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007288 while ((outsize = MultiByteToWideChar(code_page, flags,
7289 in, insize, NULL, 0)) <= 0)
7290 {
7291 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7292 goto error;
7293 }
7294 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7295 flags = 0;
7296 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007297
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007298 /* Extend a wchar_t* buffer */
7299 Py_ssize_t n = *bufsize; /* Get the current length */
7300 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7301 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007302 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007303 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007304
7305 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007306 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7307 if (outsize <= 0)
7308 goto error;
7309 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007310
Victor Stinner3a50e702011-10-18 21:21:00 +02007311error:
7312 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7313 return -2;
7314 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007315 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007316}
7317
Victor Stinner3a50e702011-10-18 21:21:00 +02007318/*
7319 * Decode a byte string from a code page into unicode object with an error
7320 * handler.
7321 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007322 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007323 * UnicodeDecodeError exception and returns -1 on error.
7324 */
7325static int
7326decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007327 wchar_t **buf,
7328 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007329 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007330 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007331{
7332 const char *startin = in;
7333 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007334 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007335 /* Ideally, we should get reason from FormatMessage. This is the Windows
7336 2000 English version of the message. */
7337 const char *reason = "No mapping for the Unicode character exists "
7338 "in the target code page.";
7339 /* each step cannot decode more than 1 character, but a character can be
7340 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007341 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007342 int insize;
7343 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007344 PyObject *errorHandler = NULL;
7345 PyObject *exc = NULL;
7346 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007347 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 DWORD err;
7349 int ret = -1;
7350
7351 assert(size > 0);
7352
7353 encoding = code_page_name(code_page, &encoding_obj);
7354 if (encoding == NULL)
7355 return -1;
7356
Victor Stinner7d00cc12014-03-17 23:08:06 +01007357 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007358 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7359 UnicodeDecodeError. */
7360 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7361 if (exc != NULL) {
7362 PyCodec_StrictErrors(exc);
7363 Py_CLEAR(exc);
7364 }
7365 goto error;
7366 }
7367
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007368 /* Extend a wchar_t* buffer */
7369 Py_ssize_t n = *bufsize; /* Get the current length */
7370 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7371 PyErr_NoMemory();
7372 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007373 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007374 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7375 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007377 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007378
7379 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 while (in < endin)
7381 {
7382 /* Decode a character */
7383 insize = 1;
7384 do
7385 {
7386 outsize = MultiByteToWideChar(code_page, flags,
7387 in, insize,
7388 buffer, Py_ARRAY_LENGTH(buffer));
7389 if (outsize > 0)
7390 break;
7391 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007392 if (err == ERROR_INVALID_FLAGS && flags) {
7393 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7394 flags = 0;
7395 continue;
7396 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007397 if (err != ERROR_NO_UNICODE_TRANSLATION
7398 && err != ERROR_INSUFFICIENT_BUFFER)
7399 {
7400 PyErr_SetFromWindowsErr(0);
7401 goto error;
7402 }
7403 insize++;
7404 }
7405 /* 4=maximum length of a UTF-8 sequence */
7406 while (insize <= 4 && (in + insize) <= endin);
7407
7408 if (outsize <= 0) {
7409 Py_ssize_t startinpos, endinpos, outpos;
7410
Victor Stinner7d00cc12014-03-17 23:08:06 +01007411 /* last character in partial decode? */
7412 if (in + insize >= endin && !final)
7413 break;
7414
Victor Stinner3a50e702011-10-18 21:21:00 +02007415 startinpos = in - startin;
7416 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007417 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007418 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 errors, &errorHandler,
7420 encoding, reason,
7421 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007422 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007423 {
7424 goto error;
7425 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007426 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 }
7428 else {
7429 in += insize;
7430 memcpy(out, buffer, outsize * sizeof(wchar_t));
7431 out += outsize;
7432 }
7433 }
7434
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007435 /* Shrink the buffer */
7436 assert(out - *buf <= *bufsize);
7437 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007438 /* (in - startin) <= size and size is an int */
7439 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007440
7441error:
7442 Py_XDECREF(encoding_obj);
7443 Py_XDECREF(errorHandler);
7444 Py_XDECREF(exc);
7445 return ret;
7446}
7447
Victor Stinner3a50e702011-10-18 21:21:00 +02007448static PyObject *
7449decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007450 const char *s, Py_ssize_t size,
7451 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007452{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007453 wchar_t *buf = NULL;
7454 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007455 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007456
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 if (code_page < 0) {
7458 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7459 return NULL;
7460 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007461 if (size < 0) {
7462 PyErr_BadInternalCall();
7463 return NULL;
7464 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007465
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007466 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007468
Victor Stinner76a31a62011-11-04 00:05:13 +01007469 do
7470 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007471#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007472 if (size > DECODING_CHUNK_SIZE) {
7473 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007474 final = 0;
7475 done = 0;
7476 }
7477 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007478#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007479 {
7480 chunk_size = (int)size;
7481 final = (consumed == NULL);
7482 done = 1;
7483 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007484
Victor Stinner76a31a62011-11-04 00:05:13 +01007485 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007486 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007487 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007488 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007489 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007490
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007491 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007492 s, chunk_size);
7493 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007494 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007495 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007496 errors, final);
7497 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007498
7499 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007500 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007501 return NULL;
7502 }
7503
7504 if (consumed)
7505 *consumed += converted;
7506
7507 s += converted;
7508 size -= converted;
7509 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007510
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007511 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7512 PyMem_Free(buf);
7513 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007514}
7515
Alexander Belopolsky40018472011-02-26 01:02:56 +00007516PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007517PyUnicode_DecodeCodePageStateful(int code_page,
7518 const char *s,
7519 Py_ssize_t size,
7520 const char *errors,
7521 Py_ssize_t *consumed)
7522{
7523 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7524}
7525
7526PyObject *
7527PyUnicode_DecodeMBCSStateful(const char *s,
7528 Py_ssize_t size,
7529 const char *errors,
7530 Py_ssize_t *consumed)
7531{
7532 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7533}
7534
7535PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007536PyUnicode_DecodeMBCS(const char *s,
7537 Py_ssize_t size,
7538 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007539{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007540 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7541}
7542
Victor Stinner3a50e702011-10-18 21:21:00 +02007543static DWORD
7544encode_code_page_flags(UINT code_page, const char *errors)
7545{
7546 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007547 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007548 }
7549 else if (code_page == CP_UTF7) {
7550 /* CP_UTF7 only supports flags=0 */
7551 return 0;
7552 }
7553 else {
7554 if (errors != NULL && strcmp(errors, "replace") == 0)
7555 return 0;
7556 else
7557 return WC_NO_BEST_FIT_CHARS;
7558 }
7559}
7560
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007561/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007562 * Encode a Unicode string to a Windows code page into a byte string in strict
7563 * mode.
7564 *
7565 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007566 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007567 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007568static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007569encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007570 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007571 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007572{
Victor Stinner554f3f02010-06-16 23:33:54 +00007573 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007574 BOOL *pusedDefaultChar = &usedDefaultChar;
7575 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007576 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007577 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 const DWORD flags = encode_code_page_flags(code_page, NULL);
7579 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007580 /* Create a substring so that we can get the UTF-16 representation
7581 of just the slice under consideration. */
7582 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007583
Martin v. Löwis3d325192011-11-04 18:23:06 +01007584 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007585
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007587 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007588 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007589 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007590
Victor Stinner2fc507f2011-11-04 20:06:39 +01007591 substring = PyUnicode_Substring(unicode, offset, offset+len);
7592 if (substring == NULL)
7593 return -1;
7594 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7595 if (p == NULL) {
7596 Py_DECREF(substring);
7597 return -1;
7598 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007599 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007600
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007601 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007602 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007603 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 NULL, 0,
7605 NULL, pusedDefaultChar);
7606 if (outsize <= 0)
7607 goto error;
7608 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007609 if (pusedDefaultChar && *pusedDefaultChar) {
7610 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007612 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007613
Victor Stinner3a50e702011-10-18 21:21:00 +02007614 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007616 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007617 if (*outbytes == NULL) {
7618 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007620 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007622 }
7623 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007625 const Py_ssize_t n = PyBytes_Size(*outbytes);
7626 if (outsize > PY_SSIZE_T_MAX - n) {
7627 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007628 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007630 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007631 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7632 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007633 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007634 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007635 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007636 }
7637
7638 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007639 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007640 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007641 out, outsize,
7642 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007643 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007644 if (outsize <= 0)
7645 goto error;
7646 if (pusedDefaultChar && *pusedDefaultChar)
7647 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007648 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007649
Victor Stinner3a50e702011-10-18 21:21:00 +02007650error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007651 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7653 return -2;
7654 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007655 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007656}
7657
Victor Stinner3a50e702011-10-18 21:21:00 +02007658/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007659 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007660 * error handler.
7661 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007662 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007663 * -1 on other error.
7664 */
7665static int
7666encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007667 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007668 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007669{
Victor Stinner3a50e702011-10-18 21:21:00 +02007670 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007671 Py_ssize_t pos = unicode_offset;
7672 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007673 /* Ideally, we should get reason from FormatMessage. This is the Windows
7674 2000 English version of the message. */
7675 const char *reason = "invalid character";
7676 /* 4=maximum length of a UTF-8 sequence */
7677 char buffer[4];
7678 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7679 Py_ssize_t outsize;
7680 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007681 PyObject *errorHandler = NULL;
7682 PyObject *exc = NULL;
7683 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007684 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007685 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007686 PyObject *rep;
7687 int ret = -1;
7688
7689 assert(insize > 0);
7690
7691 encoding = code_page_name(code_page, &encoding_obj);
7692 if (encoding == NULL)
7693 return -1;
7694
7695 if (errors == NULL || strcmp(errors, "strict") == 0) {
7696 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7697 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007698 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007699 if (exc != NULL) {
7700 PyCodec_StrictErrors(exc);
7701 Py_DECREF(exc);
7702 }
7703 Py_XDECREF(encoding_obj);
7704 return -1;
7705 }
7706
7707 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7708 pusedDefaultChar = &usedDefaultChar;
7709 else
7710 pusedDefaultChar = NULL;
7711
7712 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7713 PyErr_NoMemory();
7714 goto error;
7715 }
7716 outsize = insize * Py_ARRAY_LENGTH(buffer);
7717
7718 if (*outbytes == NULL) {
7719 /* Create string object */
7720 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7721 if (*outbytes == NULL)
7722 goto error;
7723 out = PyBytes_AS_STRING(*outbytes);
7724 }
7725 else {
7726 /* Extend string object */
7727 Py_ssize_t n = PyBytes_Size(*outbytes);
7728 if (n > PY_SSIZE_T_MAX - outsize) {
7729 PyErr_NoMemory();
7730 goto error;
7731 }
7732 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7733 goto error;
7734 out = PyBytes_AS_STRING(*outbytes) + n;
7735 }
7736
7737 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007738 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007739 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007740 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7741 wchar_t chars[2];
7742 int charsize;
7743 if (ch < 0x10000) {
7744 chars[0] = (wchar_t)ch;
7745 charsize = 1;
7746 }
7747 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007748 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7749 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007750 charsize = 2;
7751 }
7752
Victor Stinner3a50e702011-10-18 21:21:00 +02007753 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007754 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007755 buffer, Py_ARRAY_LENGTH(buffer),
7756 NULL, pusedDefaultChar);
7757 if (outsize > 0) {
7758 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7759 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007760 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007761 memcpy(out, buffer, outsize);
7762 out += outsize;
7763 continue;
7764 }
7765 }
7766 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7767 PyErr_SetFromWindowsErr(0);
7768 goto error;
7769 }
7770
Victor Stinner3a50e702011-10-18 21:21:00 +02007771 rep = unicode_encode_call_errorhandler(
7772 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007773 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007774 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007775 if (rep == NULL)
7776 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007777 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007778
7779 if (PyBytes_Check(rep)) {
7780 outsize = PyBytes_GET_SIZE(rep);
7781 if (outsize != 1) {
7782 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7783 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7784 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7785 Py_DECREF(rep);
7786 goto error;
7787 }
7788 out = PyBytes_AS_STRING(*outbytes) + offset;
7789 }
7790 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7791 out += outsize;
7792 }
7793 else {
7794 Py_ssize_t i;
7795 enum PyUnicode_Kind kind;
7796 void *data;
7797
Benjamin Petersonbac79492012-01-14 13:34:47 -05007798 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007799 Py_DECREF(rep);
7800 goto error;
7801 }
7802
7803 outsize = PyUnicode_GET_LENGTH(rep);
7804 if (outsize != 1) {
7805 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7806 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7807 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7808 Py_DECREF(rep);
7809 goto error;
7810 }
7811 out = PyBytes_AS_STRING(*outbytes) + offset;
7812 }
7813 kind = PyUnicode_KIND(rep);
7814 data = PyUnicode_DATA(rep);
7815 for (i=0; i < outsize; i++) {
7816 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7817 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007818 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007819 encoding, unicode,
7820 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007821 "unable to encode error handler result to ASCII");
7822 Py_DECREF(rep);
7823 goto error;
7824 }
7825 *out = (unsigned char)ch;
7826 out++;
7827 }
7828 }
7829 Py_DECREF(rep);
7830 }
7831 /* write a NUL byte */
7832 *out = 0;
7833 outsize = out - PyBytes_AS_STRING(*outbytes);
7834 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7835 if (_PyBytes_Resize(outbytes, outsize) < 0)
7836 goto error;
7837 ret = 0;
7838
7839error:
7840 Py_XDECREF(encoding_obj);
7841 Py_XDECREF(errorHandler);
7842 Py_XDECREF(exc);
7843 return ret;
7844}
7845
Victor Stinner3a50e702011-10-18 21:21:00 +02007846static PyObject *
7847encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007848 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007849 const char *errors)
7850{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007851 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007852 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007853 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007854 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007855
Victor Stinner29dacf22015-01-26 16:41:32 +01007856 if (!PyUnicode_Check(unicode)) {
7857 PyErr_BadArgument();
7858 return NULL;
7859 }
7860
Benjamin Petersonbac79492012-01-14 13:34:47 -05007861 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007862 return NULL;
7863 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007864
Victor Stinner3a50e702011-10-18 21:21:00 +02007865 if (code_page < 0) {
7866 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7867 return NULL;
7868 }
7869
Martin v. Löwis3d325192011-11-04 18:23:06 +01007870 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007871 return PyBytes_FromStringAndSize(NULL, 0);
7872
Victor Stinner7581cef2011-11-03 22:32:33 +01007873 offset = 0;
7874 do
7875 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007876#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007877 if (len > DECODING_CHUNK_SIZE) {
7878 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007879 done = 0;
7880 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007881 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007882#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007883 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007884 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007885 done = 1;
7886 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007887
Victor Stinner76a31a62011-11-04 00:05:13 +01007888 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007889 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007890 errors);
7891 if (ret == -2)
7892 ret = encode_code_page_errors(code_page, &outbytes,
7893 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007894 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007895 if (ret < 0) {
7896 Py_XDECREF(outbytes);
7897 return NULL;
7898 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007899
Victor Stinner7581cef2011-11-03 22:32:33 +01007900 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007901 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007902 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007903
Victor Stinner3a50e702011-10-18 21:21:00 +02007904 return outbytes;
7905}
7906
7907PyObject *
7908PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7909 Py_ssize_t size,
7910 const char *errors)
7911{
Victor Stinner7581cef2011-11-03 22:32:33 +01007912 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007913 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007914 if (unicode == NULL)
7915 return NULL;
7916 res = encode_code_page(CP_ACP, unicode, errors);
7917 Py_DECREF(unicode);
7918 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007919}
7920
7921PyObject *
7922PyUnicode_EncodeCodePage(int code_page,
7923 PyObject *unicode,
7924 const char *errors)
7925{
Victor Stinner7581cef2011-11-03 22:32:33 +01007926 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007927}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007928
Alexander Belopolsky40018472011-02-26 01:02:56 +00007929PyObject *
7930PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007931{
Victor Stinner7581cef2011-11-03 22:32:33 +01007932 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007933}
7934
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007935#undef NEED_RETRY
7936
Steve Dowercc16be82016-09-08 10:35:16 -07007937#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007938
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939/* --- Character Mapping Codec -------------------------------------------- */
7940
Victor Stinnerfb161b12013-04-18 01:44:27 +02007941static int
7942charmap_decode_string(const char *s,
7943 Py_ssize_t size,
7944 PyObject *mapping,
7945 const char *errors,
7946 _PyUnicodeWriter *writer)
7947{
7948 const char *starts = s;
7949 const char *e;
7950 Py_ssize_t startinpos, endinpos;
7951 PyObject *errorHandler = NULL, *exc = NULL;
7952 Py_ssize_t maplen;
7953 enum PyUnicode_Kind mapkind;
7954 void *mapdata;
7955 Py_UCS4 x;
7956 unsigned char ch;
7957
7958 if (PyUnicode_READY(mapping) == -1)
7959 return -1;
7960
7961 maplen = PyUnicode_GET_LENGTH(mapping);
7962 mapdata = PyUnicode_DATA(mapping);
7963 mapkind = PyUnicode_KIND(mapping);
7964
7965 e = s + size;
7966
7967 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7968 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7969 * is disabled in encoding aliases, latin1 is preferred because
7970 * its implementation is faster. */
7971 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7972 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7973 Py_UCS4 maxchar = writer->maxchar;
7974
7975 assert (writer->kind == PyUnicode_1BYTE_KIND);
7976 while (s < e) {
7977 ch = *s;
7978 x = mapdata_ucs1[ch];
7979 if (x > maxchar) {
7980 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7981 goto onError;
7982 maxchar = writer->maxchar;
7983 outdata = (Py_UCS1 *)writer->data;
7984 }
7985 outdata[writer->pos] = x;
7986 writer->pos++;
7987 ++s;
7988 }
7989 return 0;
7990 }
7991
7992 while (s < e) {
7993 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7994 enum PyUnicode_Kind outkind = writer->kind;
7995 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7996 if (outkind == PyUnicode_1BYTE_KIND) {
7997 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7998 Py_UCS4 maxchar = writer->maxchar;
7999 while (s < e) {
8000 ch = *s;
8001 x = mapdata_ucs2[ch];
8002 if (x > maxchar)
8003 goto Error;
8004 outdata[writer->pos] = x;
8005 writer->pos++;
8006 ++s;
8007 }
8008 break;
8009 }
8010 else if (outkind == PyUnicode_2BYTE_KIND) {
8011 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8012 while (s < e) {
8013 ch = *s;
8014 x = mapdata_ucs2[ch];
8015 if (x == 0xFFFE)
8016 goto Error;
8017 outdata[writer->pos] = x;
8018 writer->pos++;
8019 ++s;
8020 }
8021 break;
8022 }
8023 }
8024 ch = *s;
8025
8026 if (ch < maplen)
8027 x = PyUnicode_READ(mapkind, mapdata, ch);
8028 else
8029 x = 0xfffe; /* invalid value */
8030Error:
8031 if (x == 0xfffe)
8032 {
8033 /* undefined mapping */
8034 startinpos = s-starts;
8035 endinpos = startinpos+1;
8036 if (unicode_decode_call_errorhandler_writer(
8037 errors, &errorHandler,
8038 "charmap", "character maps to <undefined>",
8039 &starts, &e, &startinpos, &endinpos, &exc, &s,
8040 writer)) {
8041 goto onError;
8042 }
8043 continue;
8044 }
8045
8046 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8047 goto onError;
8048 ++s;
8049 }
8050 Py_XDECREF(errorHandler);
8051 Py_XDECREF(exc);
8052 return 0;
8053
8054onError:
8055 Py_XDECREF(errorHandler);
8056 Py_XDECREF(exc);
8057 return -1;
8058}
8059
8060static int
8061charmap_decode_mapping(const char *s,
8062 Py_ssize_t size,
8063 PyObject *mapping,
8064 const char *errors,
8065 _PyUnicodeWriter *writer)
8066{
8067 const char *starts = s;
8068 const char *e;
8069 Py_ssize_t startinpos, endinpos;
8070 PyObject *errorHandler = NULL, *exc = NULL;
8071 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008072 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008073
8074 e = s + size;
8075
8076 while (s < e) {
8077 ch = *s;
8078
8079 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8080 key = PyLong_FromLong((long)ch);
8081 if (key == NULL)
8082 goto onError;
8083
8084 item = PyObject_GetItem(mapping, key);
8085 Py_DECREF(key);
8086 if (item == NULL) {
8087 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8088 /* No mapping found means: mapping is undefined. */
8089 PyErr_Clear();
8090 goto Undefined;
8091 } else
8092 goto onError;
8093 }
8094
8095 /* Apply mapping */
8096 if (item == Py_None)
8097 goto Undefined;
8098 if (PyLong_Check(item)) {
8099 long value = PyLong_AS_LONG(item);
8100 if (value == 0xFFFE)
8101 goto Undefined;
8102 if (value < 0 || value > MAX_UNICODE) {
8103 PyErr_Format(PyExc_TypeError,
8104 "character mapping must be in range(0x%lx)",
8105 (unsigned long)MAX_UNICODE + 1);
8106 goto onError;
8107 }
8108
8109 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8110 goto onError;
8111 }
8112 else if (PyUnicode_Check(item)) {
8113 if (PyUnicode_READY(item) == -1)
8114 goto onError;
8115 if (PyUnicode_GET_LENGTH(item) == 1) {
8116 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8117 if (value == 0xFFFE)
8118 goto Undefined;
8119 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8120 goto onError;
8121 }
8122 else {
8123 writer->overallocate = 1;
8124 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8125 goto onError;
8126 }
8127 }
8128 else {
8129 /* wrong return value */
8130 PyErr_SetString(PyExc_TypeError,
8131 "character mapping must return integer, None or str");
8132 goto onError;
8133 }
8134 Py_CLEAR(item);
8135 ++s;
8136 continue;
8137
8138Undefined:
8139 /* undefined mapping */
8140 Py_CLEAR(item);
8141 startinpos = s-starts;
8142 endinpos = startinpos+1;
8143 if (unicode_decode_call_errorhandler_writer(
8144 errors, &errorHandler,
8145 "charmap", "character maps to <undefined>",
8146 &starts, &e, &startinpos, &endinpos, &exc, &s,
8147 writer)) {
8148 goto onError;
8149 }
8150 }
8151 Py_XDECREF(errorHandler);
8152 Py_XDECREF(exc);
8153 return 0;
8154
8155onError:
8156 Py_XDECREF(item);
8157 Py_XDECREF(errorHandler);
8158 Py_XDECREF(exc);
8159 return -1;
8160}
8161
Alexander Belopolsky40018472011-02-26 01:02:56 +00008162PyObject *
8163PyUnicode_DecodeCharmap(const char *s,
8164 Py_ssize_t size,
8165 PyObject *mapping,
8166 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008168 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008169
Guido van Rossumd57fd912000-03-10 22:53:23 +00008170 /* Default to Latin-1 */
8171 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008175 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008176 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008177 writer.min_length = size;
8178 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008180
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008181 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008182 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8183 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008184 }
8185 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008186 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8187 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008189 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008190
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008192 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008193 return NULL;
8194}
8195
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008196/* Charmap encoding: the lookup table */
8197
Alexander Belopolsky40018472011-02-26 01:02:56 +00008198struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 PyObject_HEAD
8200 unsigned char level1[32];
8201 int count2, count3;
8202 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008203};
8204
8205static PyObject*
8206encoding_map_size(PyObject *obj, PyObject* args)
8207{
8208 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008209 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008211}
8212
8213static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008214 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 PyDoc_STR("Return the size (in bytes) of this object") },
8216 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217};
8218
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008219static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008220 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 "EncodingMap", /*tp_name*/
8222 sizeof(struct encoding_map), /*tp_basicsize*/
8223 0, /*tp_itemsize*/
8224 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008225 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008226 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 0, /*tp_getattr*/
8228 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008229 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 0, /*tp_repr*/
8231 0, /*tp_as_number*/
8232 0, /*tp_as_sequence*/
8233 0, /*tp_as_mapping*/
8234 0, /*tp_hash*/
8235 0, /*tp_call*/
8236 0, /*tp_str*/
8237 0, /*tp_getattro*/
8238 0, /*tp_setattro*/
8239 0, /*tp_as_buffer*/
8240 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8241 0, /*tp_doc*/
8242 0, /*tp_traverse*/
8243 0, /*tp_clear*/
8244 0, /*tp_richcompare*/
8245 0, /*tp_weaklistoffset*/
8246 0, /*tp_iter*/
8247 0, /*tp_iternext*/
8248 encoding_map_methods, /*tp_methods*/
8249 0, /*tp_members*/
8250 0, /*tp_getset*/
8251 0, /*tp_base*/
8252 0, /*tp_dict*/
8253 0, /*tp_descr_get*/
8254 0, /*tp_descr_set*/
8255 0, /*tp_dictoffset*/
8256 0, /*tp_init*/
8257 0, /*tp_alloc*/
8258 0, /*tp_new*/
8259 0, /*tp_free*/
8260 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008261};
8262
8263PyObject*
8264PyUnicode_BuildEncodingMap(PyObject* string)
8265{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008266 PyObject *result;
8267 struct encoding_map *mresult;
8268 int i;
8269 int need_dict = 0;
8270 unsigned char level1[32];
8271 unsigned char level2[512];
8272 unsigned char *mlevel1, *mlevel2, *mlevel3;
8273 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274 int kind;
8275 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008276 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008277 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008278
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008279 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008280 PyErr_BadArgument();
8281 return NULL;
8282 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008283 kind = PyUnicode_KIND(string);
8284 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008285 length = PyUnicode_GET_LENGTH(string);
8286 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008287 memset(level1, 0xFF, sizeof level1);
8288 memset(level2, 0xFF, sizeof level2);
8289
8290 /* If there isn't a one-to-one mapping of NULL to \0,
8291 or if there are non-BMP characters, we need to use
8292 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008293 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008294 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008295 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008296 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008297 ch = PyUnicode_READ(kind, data, i);
8298 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008299 need_dict = 1;
8300 break;
8301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008302 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008303 /* unmapped character */
8304 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008305 l1 = ch >> 11;
8306 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008307 if (level1[l1] == 0xFF)
8308 level1[l1] = count2++;
8309 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008310 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008311 }
8312
8313 if (count2 >= 0xFF || count3 >= 0xFF)
8314 need_dict = 1;
8315
8316 if (need_dict) {
8317 PyObject *result = PyDict_New();
8318 PyObject *key, *value;
8319 if (!result)
8320 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008321 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008322 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008323 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008324 if (!key || !value)
8325 goto failed1;
8326 if (PyDict_SetItem(result, key, value) == -1)
8327 goto failed1;
8328 Py_DECREF(key);
8329 Py_DECREF(value);
8330 }
8331 return result;
8332 failed1:
8333 Py_XDECREF(key);
8334 Py_XDECREF(value);
8335 Py_DECREF(result);
8336 return NULL;
8337 }
8338
8339 /* Create a three-level trie */
8340 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8341 16*count2 + 128*count3 - 1);
8342 if (!result)
8343 return PyErr_NoMemory();
8344 PyObject_Init(result, &EncodingMapType);
8345 mresult = (struct encoding_map*)result;
8346 mresult->count2 = count2;
8347 mresult->count3 = count3;
8348 mlevel1 = mresult->level1;
8349 mlevel2 = mresult->level23;
8350 mlevel3 = mresult->level23 + 16*count2;
8351 memcpy(mlevel1, level1, 32);
8352 memset(mlevel2, 0xFF, 16*count2);
8353 memset(mlevel3, 0, 128*count3);
8354 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008355 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008356 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008357 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8358 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008359 /* unmapped character */
8360 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008361 o1 = ch>>11;
8362 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008363 i2 = 16*mlevel1[o1] + o2;
8364 if (mlevel2[i2] == 0xFF)
8365 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008366 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008367 i3 = 128*mlevel2[i2] + o3;
8368 mlevel3[i3] = i;
8369 }
8370 return result;
8371}
8372
8373static int
Victor Stinner22168992011-11-20 17:09:18 +01008374encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008375{
8376 struct encoding_map *map = (struct encoding_map*)mapping;
8377 int l1 = c>>11;
8378 int l2 = (c>>7) & 0xF;
8379 int l3 = c & 0x7F;
8380 int i;
8381
Victor Stinner22168992011-11-20 17:09:18 +01008382 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008384 if (c == 0)
8385 return 0;
8386 /* level 1*/
8387 i = map->level1[l1];
8388 if (i == 0xFF) {
8389 return -1;
8390 }
8391 /* level 2*/
8392 i = map->level23[16*i+l2];
8393 if (i == 0xFF) {
8394 return -1;
8395 }
8396 /* level 3 */
8397 i = map->level23[16*map->count2 + 128*i + l3];
8398 if (i == 0) {
8399 return -1;
8400 }
8401 return i;
8402}
8403
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404/* Lookup the character ch in the mapping. If the character
8405 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008406 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008407static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008408charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409{
Christian Heimes217cfd12007-12-02 14:31:20 +00008410 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 PyObject *x;
8412
8413 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 x = PyObject_GetItem(mapping, w);
8416 Py_DECREF(w);
8417 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8419 /* No mapping found means: mapping is undefined. */
8420 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008421 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 } else
8423 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008425 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008427 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 long value = PyLong_AS_LONG(x);
8429 if (value < 0 || value > 255) {
8430 PyErr_SetString(PyExc_TypeError,
8431 "character mapping must be in range(256)");
8432 Py_DECREF(x);
8433 return NULL;
8434 }
8435 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008437 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 /* wrong return value */
8441 PyErr_Format(PyExc_TypeError,
8442 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008443 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 Py_DECREF(x);
8445 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 }
8447}
8448
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008449static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008450charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008451{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008452 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8453 /* exponentially overallocate to minimize reallocations */
8454 if (requiredsize < 2*outsize)
8455 requiredsize = 2*outsize;
8456 if (_PyBytes_Resize(outobj, requiredsize))
8457 return -1;
8458 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008459}
8460
/* Outcome of one charmapencode_output() call. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the mapping is undefined; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008465 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 space is available. Return a new reference to the object that
8467 was put in the output buffer, or Py_None, if the mapping was undefined
8468 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008469 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008470static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008471charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008472 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008474 PyObject *rep;
8475 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008476 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008477
Andy Lesterdffe4c02020-03-04 07:15:20 -06008478 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008479 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008481 if (res == -1)
8482 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 if (outsize<requiredsize)
8484 if (charmapencode_resize(outobj, outpos, requiredsize))
8485 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008486 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 outstart[(*outpos)++] = (char)res;
8488 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008489 }
8490
8491 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008494 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 Py_DECREF(rep);
8496 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008497 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 if (PyLong_Check(rep)) {
8499 Py_ssize_t requiredsize = *outpos+1;
8500 if (outsize<requiredsize)
8501 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8502 Py_DECREF(rep);
8503 return enc_EXCEPTION;
8504 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008505 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008507 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 else {
8509 const char *repchars = PyBytes_AS_STRING(rep);
8510 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8511 Py_ssize_t requiredsize = *outpos+repsize;
8512 if (outsize<requiredsize)
8513 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8514 Py_DECREF(rep);
8515 return enc_EXCEPTION;
8516 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008517 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 memcpy(outstart + *outpos, repchars, repsize);
8519 *outpos += repsize;
8520 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008522 Py_DECREF(rep);
8523 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524}
8525
8526/* handle an error in PyUnicode_EncodeCharmap
8527 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008528static int
8529charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008530 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008532 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008533 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534{
8535 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008536 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008537 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008538 enum PyUnicode_Kind kind;
8539 void *data;
8540 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008542 Py_ssize_t collstartpos = *inpos;
8543 Py_ssize_t collendpos = *inpos+1;
8544 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008545 const char *encoding = "charmap";
8546 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008547 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008548 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008549 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550
Benjamin Petersonbac79492012-01-14 13:34:47 -05008551 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008552 return -1;
8553 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554 /* find all unencodable characters */
8555 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008556 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008557 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008558 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008559 val = encoding_map_lookup(ch, mapping);
8560 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 break;
8562 ++collendpos;
8563 continue;
8564 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008565
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008566 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8567 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 if (rep==NULL)
8569 return -1;
8570 else if (rep!=Py_None) {
8571 Py_DECREF(rep);
8572 break;
8573 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008574 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008576 }
8577 /* cache callback name lookup
8578 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008579 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008580 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008581
8582 switch (*error_handler) {
8583 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008584 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008585 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008586
8587 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008588 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 x = charmapencode_output('?', mapping, res, respos);
8590 if (x==enc_EXCEPTION) {
8591 return -1;
8592 }
8593 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008594 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 return -1;
8596 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008597 }
8598 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008599 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008600 *inpos = collendpos;
8601 break;
Victor Stinner50149202015-09-22 00:26:54 +02008602
8603 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008604 /* generate replacement (temporarily (mis)uses p) */
8605 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 char buffer[2+29+1+1];
8607 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008608 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 for (cp = buffer; *cp; ++cp) {
8610 x = charmapencode_output(*cp, mapping, res, respos);
8611 if (x==enc_EXCEPTION)
8612 return -1;
8613 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008614 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 return -1;
8616 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008617 }
8618 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008619 *inpos = collendpos;
8620 break;
Victor Stinner50149202015-09-22 00:26:54 +02008621
Benjamin Peterson14339b62009-01-31 16:36:08 +00008622 default:
Victor Stinner50149202015-09-22 00:26:54 +02008623 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008624 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008626 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008628 if (PyBytes_Check(repunicode)) {
8629 /* Directly copy bytes result to output. */
8630 Py_ssize_t outsize = PyBytes_Size(*res);
8631 Py_ssize_t requiredsize;
8632 repsize = PyBytes_Size(repunicode);
8633 requiredsize = *respos + repsize;
8634 if (requiredsize > outsize)
8635 /* Make room for all additional bytes. */
8636 if (charmapencode_resize(res, respos, requiredsize)) {
8637 Py_DECREF(repunicode);
8638 return -1;
8639 }
8640 memcpy(PyBytes_AsString(*res) + *respos,
8641 PyBytes_AsString(repunicode), repsize);
8642 *respos += repsize;
8643 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008644 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008645 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008646 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008647 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008648 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008649 Py_DECREF(repunicode);
8650 return -1;
8651 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008652 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008653 data = PyUnicode_DATA(repunicode);
8654 kind = PyUnicode_KIND(repunicode);
8655 for (index = 0; index < repsize; index++) {
8656 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8657 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008659 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 return -1;
8661 }
8662 else if (x==enc_FAILED) {
8663 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008664 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 return -1;
8666 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008667 }
8668 *inpos = newpos;
8669 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 }
8671 return 0;
8672}
8673
Alexander Belopolsky40018472011-02-26 01:02:56 +00008674PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008675_PyUnicode_EncodeCharmap(PyObject *unicode,
8676 PyObject *mapping,
8677 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 /* output object */
8680 PyObject *res = NULL;
8681 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008682 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008683 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008685 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008686 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008688 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008689 void *data;
8690 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691
Benjamin Petersonbac79492012-01-14 13:34:47 -05008692 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008693 return NULL;
8694 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008695 data = PyUnicode_DATA(unicode);
8696 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008697
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 /* Default to Latin-1 */
8699 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008700 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 /* allocate enough for a simple encoding without
8703 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008704 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705 if (res == NULL)
8706 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008707 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008711 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008713 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 if (x==enc_EXCEPTION) /* error */
8715 goto onError;
8716 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008717 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008719 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 &res, &respos)) {
8721 goto onError;
8722 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008723 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 else
8725 /* done with this character => adjust input position */
8726 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008729 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008730 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008731 if (_PyBytes_Resize(&res, respos) < 0)
8732 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008733
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008734 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008735 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008736 return res;
8737
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008739 Py_XDECREF(res);
8740 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008741 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742 return NULL;
8743}
8744
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008745/* Deprecated */
8746PyObject *
8747PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8748 Py_ssize_t size,
8749 PyObject *mapping,
8750 const char *errors)
8751{
8752 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008753 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008754 if (unicode == NULL)
8755 return NULL;
8756 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8757 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008758 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008759}
8760
Alexander Belopolsky40018472011-02-26 01:02:56 +00008761PyObject *
8762PyUnicode_AsCharmapString(PyObject *unicode,
8763 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764{
8765 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 PyErr_BadArgument();
8767 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008769 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770}
8771
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008772/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008773static void
8774make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008776 Py_ssize_t startpos, Py_ssize_t endpos,
8777 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008779 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 *exceptionObject = _PyUnicodeTranslateError_Create(
8781 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 }
8783 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8785 goto onError;
8786 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8787 goto onError;
8788 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8789 goto onError;
8790 return;
8791 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008792 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793 }
8794}
8795
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008796/* error handling callback helper:
8797 build arguments, call the callback and check the arguments,
8798 put the result into newpos and return the replacement string, which
8799 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008800static PyObject *
8801unicode_translate_call_errorhandler(const char *errors,
8802 PyObject **errorHandler,
8803 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008805 Py_ssize_t startpos, Py_ssize_t endpos,
8806 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008807{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008808 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008809
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008810 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008811 PyObject *restuple;
8812 PyObject *resunicode;
8813
8814 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008817 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008818 }
8819
8820 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008822 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008824
Petr Viktorinffd97532020-02-11 17:46:57 +01008825 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008826 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008828 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008829 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 Py_DECREF(restuple);
8831 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008832 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008833 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 &resunicode, &i_newpos)) {
8835 Py_DECREF(restuple);
8836 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008837 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008838 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008840 else
8841 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008843 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 Py_DECREF(restuple);
8845 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008846 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008847 Py_INCREF(resunicode);
8848 Py_DECREF(restuple);
8849 return resunicode;
8850}
8851
8852/* Lookup the character ch in the mapping and put the result in result,
8853 which must be decrefed by the caller.
8854 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008855static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008857{
Christian Heimes217cfd12007-12-02 14:31:20 +00008858 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008859 PyObject *x;
8860
8861 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008862 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008863 x = PyObject_GetItem(mapping, w);
8864 Py_DECREF(w);
8865 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008866 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8867 /* No mapping found means: use 1:1 mapping. */
8868 PyErr_Clear();
8869 *result = NULL;
8870 return 0;
8871 } else
8872 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008873 }
8874 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 *result = x;
8876 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008877 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008878 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008880 if (value < 0 || value > MAX_UNICODE) {
8881 PyErr_Format(PyExc_ValueError,
8882 "character mapping must be in range(0x%x)",
8883 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 Py_DECREF(x);
8885 return -1;
8886 }
8887 *result = x;
8888 return 0;
8889 }
8890 else if (PyUnicode_Check(x)) {
8891 *result = x;
8892 return 0;
8893 }
8894 else {
8895 /* wrong return value */
8896 PyErr_SetString(PyExc_TypeError,
8897 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008898 Py_DECREF(x);
8899 return -1;
8900 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008901}
Victor Stinner1194ea02014-04-04 19:37:40 +02008902
8903/* lookup the character, write the result into the writer.
8904 Return 1 if the result was written into the writer, return 0 if the mapping
8905 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008906static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008907charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8908 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008909{
Victor Stinner1194ea02014-04-04 19:37:40 +02008910 PyObject *item;
8911
8912 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008913 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008914
8915 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008917 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008919 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008920 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008921 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008922
8923 if (item == Py_None) {
8924 Py_DECREF(item);
8925 return 0;
8926 }
8927
8928 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008929 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8930 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8931 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008932 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8933 Py_DECREF(item);
8934 return -1;
8935 }
8936 Py_DECREF(item);
8937 return 1;
8938 }
8939
8940 if (!PyUnicode_Check(item)) {
8941 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008943 }
8944
8945 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8946 Py_DECREF(item);
8947 return -1;
8948 }
8949
8950 Py_DECREF(item);
8951 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008952}
8953
Victor Stinner89a76ab2014-04-05 11:44:04 +02008954static int
8955unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8956 Py_UCS1 *translate)
8957{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008958 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008959 int ret = 0;
8960
Victor Stinner89a76ab2014-04-05 11:44:04 +02008961 if (charmaptranslate_lookup(ch, mapping, &item)) {
8962 return -1;
8963 }
8964
8965 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008966 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008967 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008968 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008969 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008970 /* not found => default to 1:1 mapping */
8971 translate[ch] = ch;
8972 return 1;
8973 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008974 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008975 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008976 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8977 used it */
8978 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008979 /* invalid character or character outside ASCII:
8980 skip the fast translate */
8981 goto exit;
8982 }
8983 translate[ch] = (Py_UCS1)replace;
8984 }
8985 else if (PyUnicode_Check(item)) {
8986 Py_UCS4 replace;
8987
8988 if (PyUnicode_READY(item) == -1) {
8989 Py_DECREF(item);
8990 return -1;
8991 }
8992 if (PyUnicode_GET_LENGTH(item) != 1)
8993 goto exit;
8994
8995 replace = PyUnicode_READ_CHAR(item, 0);
8996 if (replace > 127)
8997 goto exit;
8998 translate[ch] = (Py_UCS1)replace;
8999 }
9000 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009001 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009002 goto exit;
9003 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009004 ret = 1;
9005
Benjamin Peterson1365de72014-04-07 20:15:41 -04009006 exit:
9007 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009008 return ret;
9009}
9010
9011/* Fast path for ascii => ascii translation. Return 1 if the whole string
9012 was translated into writer, return 0 if the input string was partially
9013 translated into writer, raise an exception and return -1 on error. */
9014static int
9015unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009016 _PyUnicodeWriter *writer, int ignore,
9017 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009018{
Victor Stinner872b2912014-04-05 14:27:07 +02009019 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009020 Py_ssize_t len;
9021 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009022 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009023
Victor Stinner89a76ab2014-04-05 11:44:04 +02009024 len = PyUnicode_GET_LENGTH(input);
9025
Victor Stinner872b2912014-04-05 14:27:07 +02009026 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009027
9028 in = PyUnicode_1BYTE_DATA(input);
9029 end = in + len;
9030
9031 assert(PyUnicode_IS_ASCII(writer->buffer));
9032 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9033 out = PyUnicode_1BYTE_DATA(writer->buffer);
9034
Victor Stinner872b2912014-04-05 14:27:07 +02009035 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009036 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009037 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009038 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009039 int translate = unicode_fast_translate_lookup(mapping, ch,
9040 ascii_table);
9041 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009042 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009043 if (translate == 0)
9044 goto exit;
9045 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009046 }
Victor Stinner872b2912014-04-05 14:27:07 +02009047 if (ch2 == 0xfe) {
9048 if (ignore)
9049 continue;
9050 goto exit;
9051 }
9052 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009053 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009054 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009055 }
Victor Stinner872b2912014-04-05 14:27:07 +02009056 res = 1;
9057
9058exit:
9059 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009060 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009061 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009062}
9063
Victor Stinner3222da22015-10-01 22:07:32 +02009064static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009065_PyUnicode_TranslateCharmap(PyObject *input,
9066 PyObject *mapping,
9067 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02009070 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071 Py_ssize_t size, i;
9072 int kind;
9073 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009074 _PyUnicodeWriter writer;
9075 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009076 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009077 PyObject *errorHandler = NULL;
9078 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009079 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009080 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009081
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009083 PyErr_BadArgument();
9084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 if (PyUnicode_READY(input) == -1)
9088 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009089 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 kind = PyUnicode_KIND(input);
9091 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009093 if (size == 0)
9094 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009096 /* allocate enough for a simple 1:1 translation without
9097 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009098 _PyUnicodeWriter_Init(&writer);
9099 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009100 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101
Victor Stinner872b2912014-04-05 14:27:07 +02009102 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9103
Victor Stinner33798672016-03-01 21:59:58 +01009104 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009105 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009106 if (PyUnicode_IS_ASCII(input)) {
9107 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9108 if (res < 0) {
9109 _PyUnicodeWriter_Dealloc(&writer);
9110 return NULL;
9111 }
9112 if (res == 1)
9113 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009114 }
Victor Stinner33798672016-03-01 21:59:58 +01009115 else {
9116 i = 0;
9117 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009121 int translate;
9122 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9123 Py_ssize_t newpos;
9124 /* startpos for collecting untranslatable chars */
9125 Py_ssize_t collstart;
9126 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009127 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128
Victor Stinner1194ea02014-04-04 19:37:40 +02009129 ch = PyUnicode_READ(kind, data, i);
9130 translate = charmaptranslate_output(ch, mapping, &writer);
9131 if (translate < 0)
9132 goto onError;
9133
9134 if (translate != 0) {
9135 /* it worked => adjust input pointer */
9136 ++i;
9137 continue;
9138 }
9139
9140 /* untranslatable character */
9141 collstart = i;
9142 collend = i+1;
9143
9144 /* find all untranslatable characters */
9145 while (collend < size) {
9146 PyObject *x;
9147 ch = PyUnicode_READ(kind, data, collend);
9148 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009149 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009150 Py_XDECREF(x);
9151 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009153 ++collend;
9154 }
9155
9156 if (ignore) {
9157 i = collend;
9158 }
9159 else {
9160 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9161 reason, input, &exc,
9162 collstart, collend, &newpos);
9163 if (repunicode == NULL)
9164 goto onError;
9165 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009167 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009168 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009169 Py_DECREF(repunicode);
9170 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009171 }
9172 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009173 Py_XDECREF(exc);
9174 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009175 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009176
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009178 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009179 Py_XDECREF(exc);
9180 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181 return NULL;
9182}
9183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184/* Deprecated. Use PyUnicode_Translate instead. */
9185PyObject *
9186PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9187 Py_ssize_t size,
9188 PyObject *mapping,
9189 const char *errors)
9190{
Christian Heimes5f520f42012-09-11 14:03:25 +02009191 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009192 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 if (!unicode)
9194 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009195 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9196 Py_DECREF(unicode);
9197 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009198}
9199
Alexander Belopolsky40018472011-02-26 01:02:56 +00009200PyObject *
9201PyUnicode_Translate(PyObject *str,
9202 PyObject *mapping,
9203 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009204{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009205 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009206 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009207 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208}
Tim Petersced69f82003-09-16 20:30:58 +00009209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009210PyObject *
9211_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9212{
9213 if (!PyUnicode_Check(unicode)) {
9214 PyErr_BadInternalCall();
9215 return NULL;
9216 }
9217 if (PyUnicode_READY(unicode) == -1)
9218 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009219 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220 /* If the string is already ASCII, just return the same string */
9221 Py_INCREF(unicode);
9222 return unicode;
9223 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009224
9225 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9226 PyObject *result = PyUnicode_New(len, 127);
9227 if (result == NULL) {
9228 return NULL;
9229 }
9230
9231 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9232 int kind = PyUnicode_KIND(unicode);
9233 const void *data = PyUnicode_DATA(unicode);
9234 Py_ssize_t i;
9235 for (i = 0; i < len; ++i) {
9236 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9237 if (ch < 127) {
9238 out[i] = ch;
9239 }
9240 else if (Py_UNICODE_ISSPACE(ch)) {
9241 out[i] = ' ';
9242 }
9243 else {
9244 int decimal = Py_UNICODE_TODECIMAL(ch);
9245 if (decimal < 0) {
9246 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009247 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009248 _PyUnicode_LENGTH(result) = i + 1;
9249 break;
9250 }
9251 out[i] = '0' + decimal;
9252 }
9253 }
9254
INADA Naoki16dfca42018-07-14 12:06:43 +09009255 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009256 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257}
9258
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009259PyObject *
9260PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9261 Py_ssize_t length)
9262{
Victor Stinnerf0124502011-11-21 23:12:56 +01009263 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009264 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009265 Py_UCS4 maxchar;
9266 enum PyUnicode_Kind kind;
9267 void *data;
9268
Victor Stinner99d7ad02012-02-22 13:37:39 +01009269 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009270 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009271 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009272 if (ch > 127) {
9273 int decimal = Py_UNICODE_TODECIMAL(ch);
9274 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009275 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009276 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009277 }
9278 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009279
9280 /* Copy to a new string */
9281 decimal = PyUnicode_New(length, maxchar);
9282 if (decimal == NULL)
9283 return decimal;
9284 kind = PyUnicode_KIND(decimal);
9285 data = PyUnicode_DATA(decimal);
9286 /* Iterate over code points */
9287 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009288 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009289 if (ch > 127) {
9290 int decimal = Py_UNICODE_TODECIMAL(ch);
9291 if (decimal >= 0)
9292 ch = '0' + decimal;
9293 }
9294 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009296 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009297}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009298/* --- Decimal Encoder ---------------------------------------------------- */
9299
Alexander Belopolsky40018472011-02-26 01:02:56 +00009300int
9301PyUnicode_EncodeDecimal(Py_UNICODE *s,
9302 Py_ssize_t length,
9303 char *output,
9304 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009305{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009306 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009307 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009308 enum PyUnicode_Kind kind;
9309 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009310
9311 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009312 PyErr_BadArgument();
9313 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009314 }
9315
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009316 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009317 if (unicode == NULL)
9318 return -1;
9319
Victor Stinner42bf7752011-11-21 22:52:58 +01009320 kind = PyUnicode_KIND(unicode);
9321 data = PyUnicode_DATA(unicode);
9322
Victor Stinnerb84d7232011-11-22 01:50:07 +01009323 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009324 PyObject *exc;
9325 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009326 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009327 Py_ssize_t startpos;
9328
9329 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009330
Benjamin Peterson29060642009-01-31 22:14:21 +00009331 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009332 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009333 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009334 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009335 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009336 decimal = Py_UNICODE_TODECIMAL(ch);
9337 if (decimal >= 0) {
9338 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009339 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009340 continue;
9341 }
9342 if (0 < ch && ch < 256) {
9343 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009344 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009345 continue;
9346 }
Victor Stinner6345be92011-11-25 20:09:01 +01009347
Victor Stinner42bf7752011-11-21 22:52:58 +01009348 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009349 exc = NULL;
9350 raise_encode_exception(&exc, "decimal", unicode,
9351 startpos, startpos+1,
9352 "invalid decimal Unicode string");
9353 Py_XDECREF(exc);
9354 Py_DECREF(unicode);
9355 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009356 }
9357 /* 0-terminate the output string */
9358 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009359 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009360 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009361}
9362
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363/* --- Helpers ------------------------------------------------------------ */
9364
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and clipped to 0 if it is still negative.  `start` and `end` must
   be modifiable lvalues.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement, e.g. as the body of an unbraced if/else.  Invoke it with
   a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009381any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009383 Py_ssize_t end,
9384 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009386 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 void *buf1, *buf2;
9388 Py_ssize_t len1, len2, result;
9389
9390 kind1 = PyUnicode_KIND(s1);
9391 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009392 if (kind1 < kind2)
9393 return -1;
9394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 len1 = PyUnicode_GET_LENGTH(s1);
9396 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009397 ADJUST_INDICES(start, end, len1);
9398 if (end - start < len2)
9399 return -1;
9400
9401 buf1 = PyUnicode_DATA(s1);
9402 buf2 = PyUnicode_DATA(s2);
9403 if (len2 == 1) {
9404 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9405 result = findchar((const char *)buf1 + kind1*start,
9406 kind1, end - start, ch, direction);
9407 if (result == -1)
9408 return -1;
9409 else
9410 return start + result;
9411 }
9412
9413 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009414 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009415 if (!buf2)
9416 return -2;
9417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418
Victor Stinner794d5672011-10-10 03:21:36 +02009419 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009420 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009421 case PyUnicode_1BYTE_KIND:
9422 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9423 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9424 else
9425 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9426 break;
9427 case PyUnicode_2BYTE_KIND:
9428 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9429 break;
9430 case PyUnicode_4BYTE_KIND:
9431 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9432 break;
9433 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009434 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009435 }
9436 }
9437 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009438 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009439 case PyUnicode_1BYTE_KIND:
9440 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9441 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9442 else
9443 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9444 break;
9445 case PyUnicode_2BYTE_KIND:
9446 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9447 break;
9448 case PyUnicode_4BYTE_KIND:
9449 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9450 break;
9451 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009452 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 }
9455
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009456 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 PyMem_Free(buf2);
9458
9459 return result;
9460}
9461
Victor Stinner59423e32018-11-26 13:40:01 +01009462/* _PyUnicode_InsertThousandsGrouping() helper functions */
9463#include "stringlib/localeutil.h"
9464
9465/**
9466 * InsertThousandsGrouping:
9467 * @writer: Unicode writer.
9468 * @n_buffer: Number of characters in @buffer.
9469 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9470 * @d_pos: Start of digits string.
9471 * @n_digits: The number of digits in the string, in which we want
9472 * to put the grouping chars.
9473 * @min_width: The minimum width of the digits in the output string.
9474 * Output will be zero-padded on the left to fill.
9475 * @grouping: see definition in localeconv().
9476 * @thousands_sep: see definition in localeconv().
9477 *
9478 * There are 2 modes: counting and filling. If @writer is NULL,
9479 * we are in counting mode, else filling mode.
9480 * If counting, the required buffer size is returned.
9481 * If filling, we know the buffer will be large enough, so we don't
9482 * need to pass in the buffer size.
9483 * Inserts thousand grouping characters (as defined by grouping and
9484 * thousands_sep) into @writer.
9485 *
9486 * Return value: -1 on error, number of characters otherwise.
9487 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009489_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009490 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009491 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009492 PyObject *digits,
9493 Py_ssize_t d_pos,
9494 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009495 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009496 const char *grouping,
9497 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009498 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499{
Xtreak3f7983a2019-01-07 20:39:14 +05309500 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009501 if (writer) {
9502 assert(digits != NULL);
9503 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009504 }
9505 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009506 assert(digits == NULL);
9507 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009508 }
Victor Stinner59423e32018-11-26 13:40:01 +01009509 assert(0 <= d_pos);
9510 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009511 assert(grouping != NULL);
9512
9513 if (digits != NULL) {
9514 if (PyUnicode_READY(digits) == -1) {
9515 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009516 }
Victor Stinner59423e32018-11-26 13:40:01 +01009517 }
9518 if (PyUnicode_READY(thousands_sep) == -1) {
9519 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009520 }
9521
Victor Stinner59423e32018-11-26 13:40:01 +01009522 Py_ssize_t count = 0;
9523 Py_ssize_t n_zeros;
9524 int loop_broken = 0;
9525 int use_separator = 0; /* First time through, don't append the
9526 separator. They only go between
9527 groups. */
9528 Py_ssize_t buffer_pos;
9529 Py_ssize_t digits_pos;
9530 Py_ssize_t len;
9531 Py_ssize_t n_chars;
9532 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9533 be looked at */
9534 /* A generator that returns all of the grouping widths, until it
9535 returns 0. */
9536 GroupGenerator groupgen;
9537 GroupGenerator_init(&groupgen, grouping);
9538 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9539
9540 /* if digits are not grouped, thousands separator
9541 should be an empty string */
9542 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9543
9544 digits_pos = d_pos + n_digits;
9545 if (writer) {
9546 buffer_pos = writer->pos + n_buffer;
9547 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9548 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 }
Victor Stinner59423e32018-11-26 13:40:01 +01009550 else {
9551 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009552 }
Victor Stinner59423e32018-11-26 13:40:01 +01009553
9554 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009555 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009556 }
Victor Stinner59423e32018-11-26 13:40:01 +01009557
9558 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9559 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9560 n_zeros = Py_MAX(0, len - remaining);
9561 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9562
9563 /* Use n_zero zero's and n_chars chars */
9564
9565 /* Count only, don't do anything. */
9566 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9567
9568 /* Copy into the writer. */
9569 InsertThousandsGrouping_fill(writer, &buffer_pos,
9570 digits, &digits_pos,
9571 n_chars, n_zeros,
9572 use_separator ? thousands_sep : NULL,
9573 thousands_sep_len, maxchar);
9574
9575 /* Use a separator next time. */
9576 use_separator = 1;
9577
9578 remaining -= n_chars;
9579 min_width -= len;
9580
9581 if (remaining <= 0 && min_width <= 0) {
9582 loop_broken = 1;
9583 break;
9584 }
9585 min_width -= thousands_sep_len;
9586 }
9587 if (!loop_broken) {
9588 /* We left the loop without using a break statement. */
9589
9590 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9591 n_zeros = Py_MAX(0, len - remaining);
9592 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9593
9594 /* Use n_zero zero's and n_chars chars */
9595 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9596
9597 /* Copy into the writer. */
9598 InsertThousandsGrouping_fill(writer, &buffer_pos,
9599 digits, &digits_pos,
9600 n_chars, n_zeros,
9601 use_separator ? thousands_sep : NULL,
9602 thousands_sep_len, maxchar);
9603 }
9604 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605}
9606
9607
Alexander Belopolsky40018472011-02-26 01:02:56 +00009608Py_ssize_t
9609PyUnicode_Count(PyObject *str,
9610 PyObject *substr,
9611 Py_ssize_t start,
9612 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009614 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009615 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 void *buf1 = NULL, *buf2 = NULL;
9617 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009618
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009619 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009620 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009621
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009622 kind1 = PyUnicode_KIND(str);
9623 kind2 = PyUnicode_KIND(substr);
9624 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009625 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009626
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009627 len1 = PyUnicode_GET_LENGTH(str);
9628 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009630 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009631 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009632
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009633 buf1 = PyUnicode_DATA(str);
9634 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009635 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009636 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009637 if (!buf2)
9638 goto onError;
9639 }
9640
9641 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009643 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009644 result = asciilib_count(
9645 ((Py_UCS1*)buf1) + start, end - start,
9646 buf2, len2, PY_SSIZE_T_MAX
9647 );
9648 else
9649 result = ucs1lib_count(
9650 ((Py_UCS1*)buf1) + start, end - start,
9651 buf2, len2, PY_SSIZE_T_MAX
9652 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 break;
9654 case PyUnicode_2BYTE_KIND:
9655 result = ucs2lib_count(
9656 ((Py_UCS2*)buf1) + start, end - start,
9657 buf2, len2, PY_SSIZE_T_MAX
9658 );
9659 break;
9660 case PyUnicode_4BYTE_KIND:
9661 result = ucs4lib_count(
9662 ((Py_UCS4*)buf1) + start, end - start,
9663 buf2, len2, PY_SSIZE_T_MAX
9664 );
9665 break;
9666 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009667 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009669
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009670 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 PyMem_Free(buf2);
9672
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009675 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 PyMem_Free(buf2);
9677 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678}
9679
Alexander Belopolsky40018472011-02-26 01:02:56 +00009680Py_ssize_t
9681PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009682 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009683 Py_ssize_t start,
9684 Py_ssize_t end,
9685 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009687 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009688 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009689
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009690 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691}
9692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693Py_ssize_t
9694PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9695 Py_ssize_t start, Py_ssize_t end,
9696 int direction)
9697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009699 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 if (PyUnicode_READY(str) == -1)
9701 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009702 len = PyUnicode_GET_LENGTH(str);
9703 ADJUST_INDICES(start, end, len);
9704 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009705 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009707 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9708 kind, end-start, ch, direction);
9709 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009711 else
9712 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713}
9714
Alexander Belopolsky40018472011-02-26 01:02:56 +00009715static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009716tailmatch(PyObject *self,
9717 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009718 Py_ssize_t start,
9719 Py_ssize_t end,
9720 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 int kind_self;
9723 int kind_sub;
9724 void *data_self;
9725 void *data_sub;
9726 Py_ssize_t offset;
9727 Py_ssize_t i;
9728 Py_ssize_t end_sub;
9729
9730 if (PyUnicode_READY(self) == -1 ||
9731 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009732 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009734 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9735 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009737 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009739 if (PyUnicode_GET_LENGTH(substring) == 0)
9740 return 1;
9741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 kind_self = PyUnicode_KIND(self);
9743 data_self = PyUnicode_DATA(self);
9744 kind_sub = PyUnicode_KIND(substring);
9745 data_sub = PyUnicode_DATA(substring);
9746 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9747
9748 if (direction > 0)
9749 offset = end;
9750 else
9751 offset = start;
9752
9753 if (PyUnicode_READ(kind_self, data_self, offset) ==
9754 PyUnicode_READ(kind_sub, data_sub, 0) &&
9755 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9756 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9757 /* If both are of the same kind, memcmp is sufficient */
9758 if (kind_self == kind_sub) {
9759 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009760 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 data_sub,
9762 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009763 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009765 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009766 else {
9767 /* We do not need to compare 0 and len(substring)-1 because
9768 the if statement above ensured already that they are equal
9769 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 for (i = 1; i < end_sub; ++i) {
9771 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9772 PyUnicode_READ(kind_sub, data_sub, i))
9773 return 0;
9774 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009775 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009777 }
9778
9779 return 0;
9780}
9781
Alexander Belopolsky40018472011-02-26 01:02:56 +00009782Py_ssize_t
9783PyUnicode_Tailmatch(PyObject *str,
9784 PyObject *substr,
9785 Py_ssize_t start,
9786 Py_ssize_t end,
9787 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009789 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009790 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009791
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009792 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793}
9794
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009795static PyObject *
9796ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009798 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9799 char *resdata, *data = PyUnicode_DATA(self);
9800 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009801
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009802 res = PyUnicode_New(len, 127);
9803 if (res == NULL)
9804 return NULL;
9805 resdata = PyUnicode_DATA(res);
9806 if (lower)
9807 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009809 _Py_bytes_upper(resdata, data, len);
9810 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811}
9812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009814handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009816 Py_ssize_t j;
9817 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009818 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009819 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009820
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009821 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9822
9823 where ! is a negation and \p{xxx} is a character with property xxx.
9824 */
9825 for (j = i - 1; j >= 0; j--) {
9826 c = PyUnicode_READ(kind, data, j);
9827 if (!_PyUnicode_IsCaseIgnorable(c))
9828 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009830 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9831 if (final_sigma) {
9832 for (j = i + 1; j < length; j++) {
9833 c = PyUnicode_READ(kind, data, j);
9834 if (!_PyUnicode_IsCaseIgnorable(c))
9835 break;
9836 }
9837 final_sigma = j == length || !_PyUnicode_IsCased(c);
9838 }
9839 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009840}
9841
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009842static int
9843lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9844 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009846 /* Obscure special case. */
9847 if (c == 0x3A3) {
9848 mapped[0] = handle_capital_sigma(kind, data, length, i);
9849 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009851 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852}
9853
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009854static Py_ssize_t
9855do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009857 Py_ssize_t i, k = 0;
9858 int n_res, j;
9859 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009860
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009861 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009862 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009863 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009864 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009865 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009867 for (i = 1; i < length; i++) {
9868 c = PyUnicode_READ(kind, data, i);
9869 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9870 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009871 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009872 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009873 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009874 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009875 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009876}
9877
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009878static Py_ssize_t
9879do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9880 Py_ssize_t i, k = 0;
9881
9882 for (i = 0; i < length; i++) {
9883 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9884 int n_res, j;
9885 if (Py_UNICODE_ISUPPER(c)) {
9886 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9887 }
9888 else if (Py_UNICODE_ISLOWER(c)) {
9889 n_res = _PyUnicode_ToUpperFull(c, mapped);
9890 }
9891 else {
9892 n_res = 1;
9893 mapped[0] = c;
9894 }
9895 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009896 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009897 res[k++] = mapped[j];
9898 }
9899 }
9900 return k;
9901}
9902
9903static Py_ssize_t
9904do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9905 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009907 Py_ssize_t i, k = 0;
9908
9909 for (i = 0; i < length; i++) {
9910 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9911 int n_res, j;
9912 if (lower)
9913 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9914 else
9915 n_res = _PyUnicode_ToUpperFull(c, mapped);
9916 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009917 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009918 res[k++] = mapped[j];
9919 }
9920 }
9921 return k;
9922}
9923
9924static Py_ssize_t
9925do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9926{
9927 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9928}
9929
9930static Py_ssize_t
9931do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9932{
9933 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9934}
9935
Benjamin Petersone51757f2012-01-12 21:10:29 -05009936static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009937do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9938{
9939 Py_ssize_t i, k = 0;
9940
9941 for (i = 0; i < length; i++) {
9942 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9943 Py_UCS4 mapped[3];
9944 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9945 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009946 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009947 res[k++] = mapped[j];
9948 }
9949 }
9950 return k;
9951}
9952
9953static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009954do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9955{
9956 Py_ssize_t i, k = 0;
9957 int previous_is_cased;
9958
9959 previous_is_cased = 0;
9960 for (i = 0; i < length; i++) {
9961 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9962 Py_UCS4 mapped[3];
9963 int n_res, j;
9964
9965 if (previous_is_cased)
9966 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9967 else
9968 n_res = _PyUnicode_ToTitleFull(c, mapped);
9969
9970 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009971 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009972 res[k++] = mapped[j];
9973 }
9974
9975 previous_is_cased = _PyUnicode_IsCased(c);
9976 }
9977 return k;
9978}
9979
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009980static PyObject *
9981case_operation(PyObject *self,
9982 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9983{
9984 PyObject *res = NULL;
9985 Py_ssize_t length, newlength = 0;
9986 int kind, outkind;
9987 void *data, *outdata;
9988 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9989
Benjamin Petersoneea48462012-01-16 14:28:50 -05009990 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009991
9992 kind = PyUnicode_KIND(self);
9993 data = PyUnicode_DATA(self);
9994 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009995 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009996 PyErr_SetString(PyExc_OverflowError, "string is too long");
9997 return NULL;
9998 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009999 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010000 if (tmp == NULL)
10001 return PyErr_NoMemory();
10002 newlength = perform(kind, data, length, tmp, &maxchar);
10003 res = PyUnicode_New(newlength, maxchar);
10004 if (res == NULL)
10005 goto leave;
10006 tmpend = tmp + newlength;
10007 outdata = PyUnicode_DATA(res);
10008 outkind = PyUnicode_KIND(res);
10009 switch (outkind) {
10010 case PyUnicode_1BYTE_KIND:
10011 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10012 break;
10013 case PyUnicode_2BYTE_KIND:
10014 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10015 break;
10016 case PyUnicode_4BYTE_KIND:
10017 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10018 break;
10019 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010020 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010021 }
10022 leave:
10023 PyMem_FREE(tmp);
10024 return res;
10025}
10026
Tim Peters8ce9f162004-08-27 01:49:32 +000010027PyObject *
10028PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010030 PyObject *res;
10031 PyObject *fseq;
10032 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010033 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010035 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010036 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010037 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010038 }
10039
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010040 /* NOTE: the following code can't call back into Python code,
10041 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010042 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010043
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010044 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010045 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010046 res = _PyUnicode_JoinArray(separator, items, seqlen);
10047 Py_DECREF(fseq);
10048 return res;
10049}
10050
10051PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010052_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010053{
10054 PyObject *res = NULL; /* the result */
10055 PyObject *sep = NULL;
10056 Py_ssize_t seplen;
10057 PyObject *item;
10058 Py_ssize_t sz, i, res_offset;
10059 Py_UCS4 maxchar;
10060 Py_UCS4 item_maxchar;
10061 int use_memcpy;
10062 unsigned char *res_data = NULL, *sep_data = NULL;
10063 PyObject *last_obj;
10064 unsigned int kind = 0;
10065
Tim Peters05eba1f2004-08-27 21:32:02 +000010066 /* If empty sequence, return u"". */
10067 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010068 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010069 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010070
Tim Peters05eba1f2004-08-27 21:32:02 +000010071 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010072 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010073 if (seqlen == 1) {
10074 if (PyUnicode_CheckExact(items[0])) {
10075 res = items[0];
10076 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010077 return res;
10078 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010079 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010080 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010081 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010082 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010083 /* Set up sep and seplen */
10084 if (separator == NULL) {
10085 /* fall back to a blank space separator */
10086 sep = PyUnicode_FromOrdinal(' ');
10087 if (!sep)
10088 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010089 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010090 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010091 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010092 else {
10093 if (!PyUnicode_Check(separator)) {
10094 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010095 "separator: expected str instance,"
10096 " %.80s found",
10097 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010098 goto onError;
10099 }
10100 if (PyUnicode_READY(separator))
10101 goto onError;
10102 sep = separator;
10103 seplen = PyUnicode_GET_LENGTH(separator);
10104 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10105 /* inc refcount to keep this code path symmetric with the
10106 above case of a blank separator */
10107 Py_INCREF(sep);
10108 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010109 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010110 }
10111
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010112 /* There are at least two things to join, or else we have a subclass
10113 * of str in the sequence.
10114 * Do a pre-pass to figure out the total amount of space we'll
10115 * need (sz), and see whether all argument are strings.
10116 */
10117 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010118#ifdef Py_DEBUG
10119 use_memcpy = 0;
10120#else
10121 use_memcpy = 1;
10122#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010123 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010124 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010125 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010126 if (!PyUnicode_Check(item)) {
10127 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010128 "sequence item %zd: expected str instance,"
10129 " %.80s found",
10130 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010131 goto onError;
10132 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 if (PyUnicode_READY(item) == -1)
10134 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010135 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010137 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010138 if (i != 0) {
10139 add_sz += seplen;
10140 }
10141 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010142 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010143 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010144 goto onError;
10145 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010146 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010147 if (use_memcpy && last_obj != NULL) {
10148 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10149 use_memcpy = 0;
10150 }
10151 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010152 }
Tim Petersced69f82003-09-16 20:30:58 +000010153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010155 if (res == NULL)
10156 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010157
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010158 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010159#ifdef Py_DEBUG
10160 use_memcpy = 0;
10161#else
10162 if (use_memcpy) {
10163 res_data = PyUnicode_1BYTE_DATA(res);
10164 kind = PyUnicode_KIND(res);
10165 if (seplen != 0)
10166 sep_data = PyUnicode_1BYTE_DATA(sep);
10167 }
10168#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010169 if (use_memcpy) {
10170 for (i = 0; i < seqlen; ++i) {
10171 Py_ssize_t itemlen;
10172 item = items[i];
10173
10174 /* Copy item, and maybe the separator. */
10175 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010176 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010177 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010178 kind * seplen);
10179 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010180 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010181
10182 itemlen = PyUnicode_GET_LENGTH(item);
10183 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010184 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010185 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010186 kind * itemlen);
10187 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010188 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010189 }
10190 assert(res_data == PyUnicode_1BYTE_DATA(res)
10191 + kind * PyUnicode_GET_LENGTH(res));
10192 }
10193 else {
10194 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10195 Py_ssize_t itemlen;
10196 item = items[i];
10197
10198 /* Copy item, and maybe the separator. */
10199 if (i && seplen != 0) {
10200 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10201 res_offset += seplen;
10202 }
10203
10204 itemlen = PyUnicode_GET_LENGTH(item);
10205 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010206 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010207 res_offset += itemlen;
10208 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010209 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010210 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010211 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010214 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216
Benjamin Peterson29060642009-01-31 22:14:21 +000010217 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010219 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220 return NULL;
10221}
10222
Victor Stinnerd3f08822012-05-29 12:57:52 +020010223void
10224_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10225 Py_UCS4 fill_char)
10226{
10227 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010228 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010229 assert(PyUnicode_IS_READY(unicode));
10230 assert(unicode_modifiable(unicode));
10231 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10232 assert(start >= 0);
10233 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010234 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010235}
10236
Victor Stinner3fe55312012-01-04 00:33:50 +010010237Py_ssize_t
10238PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10239 Py_UCS4 fill_char)
10240{
10241 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010242
10243 if (!PyUnicode_Check(unicode)) {
10244 PyErr_BadInternalCall();
10245 return -1;
10246 }
10247 if (PyUnicode_READY(unicode) == -1)
10248 return -1;
10249 if (unicode_check_modifiable(unicode))
10250 return -1;
10251
Victor Stinnerd3f08822012-05-29 12:57:52 +020010252 if (start < 0) {
10253 PyErr_SetString(PyExc_IndexError, "string index out of range");
10254 return -1;
10255 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010256 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10257 PyErr_SetString(PyExc_ValueError,
10258 "fill character is bigger than "
10259 "the string maximum character");
10260 return -1;
10261 }
10262
10263 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10264 length = Py_MIN(maxlen, length);
10265 if (length <= 0)
10266 return 0;
10267
Victor Stinnerd3f08822012-05-29 12:57:52 +020010268 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010269 return length;
10270}
10271
Victor Stinner9310abb2011-10-05 00:59:23 +020010272static PyObject *
10273pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010274 Py_ssize_t left,
10275 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 PyObject *u;
10279 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010280 int kind;
10281 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010282
10283 if (left < 0)
10284 left = 0;
10285 if (right < 0)
10286 right = 0;
10287
Victor Stinnerc4b49542011-12-11 22:44:26 +010010288 if (left == 0 && right == 0)
10289 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10292 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010293 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10294 return NULL;
10295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010297 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010299 if (!u)
10300 return NULL;
10301
10302 kind = PyUnicode_KIND(u);
10303 data = PyUnicode_DATA(u);
10304 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010305 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010306 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010307 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010308 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010309 assert(_PyUnicode_CheckConsistency(u, 1));
10310 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311}
10312
Alexander Belopolsky40018472011-02-26 01:02:56 +000010313PyObject *
10314PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010318 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010319 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320
Benjamin Petersonead6b532011-12-20 17:23:42 -060010321 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010323 if (PyUnicode_IS_ASCII(string))
10324 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010325 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010326 PyUnicode_GET_LENGTH(string), keepends);
10327 else
10328 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010329 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010330 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 break;
10332 case PyUnicode_2BYTE_KIND:
10333 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010334 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 PyUnicode_GET_LENGTH(string), keepends);
10336 break;
10337 case PyUnicode_4BYTE_KIND:
10338 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010339 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 PyUnicode_GET_LENGTH(string), keepends);
10341 break;
10342 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010343 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346}
10347
Alexander Belopolsky40018472011-02-26 01:02:56 +000010348static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010349split(PyObject *self,
10350 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010351 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010353 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 void *buf1, *buf2;
10355 Py_ssize_t len1, len2;
10356 PyObject* out;
10357
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010359 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 if (PyUnicode_READY(self) == -1)
10362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010365 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010367 if (PyUnicode_IS_ASCII(self))
10368 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010369 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010370 PyUnicode_GET_LENGTH(self), maxcount
10371 );
10372 else
10373 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010374 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010375 PyUnicode_GET_LENGTH(self), maxcount
10376 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 case PyUnicode_2BYTE_KIND:
10378 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010379 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 PyUnicode_GET_LENGTH(self), maxcount
10381 );
10382 case PyUnicode_4BYTE_KIND:
10383 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010384 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 PyUnicode_GET_LENGTH(self), maxcount
10386 );
10387 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010388 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 }
10390
10391 if (PyUnicode_READY(substring) == -1)
10392 return NULL;
10393
10394 kind1 = PyUnicode_KIND(self);
10395 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 len1 = PyUnicode_GET_LENGTH(self);
10397 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010398 if (kind1 < kind2 || len1 < len2) {
10399 out = PyList_New(1);
10400 if (out == NULL)
10401 return NULL;
10402 Py_INCREF(self);
10403 PyList_SET_ITEM(out, 0, self);
10404 return out;
10405 }
10406 buf1 = PyUnicode_DATA(self);
10407 buf2 = PyUnicode_DATA(substring);
10408 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010409 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010410 if (!buf2)
10411 return NULL;
10412 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010414 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010416 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10417 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010418 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010419 else
10420 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010421 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422 break;
10423 case PyUnicode_2BYTE_KIND:
10424 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010425 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 break;
10427 case PyUnicode_4BYTE_KIND:
10428 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010429 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 break;
10431 default:
10432 out = NULL;
10433 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010434 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 PyMem_Free(buf2);
10436 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010437}
10438
Alexander Belopolsky40018472011-02-26 01:02:56 +000010439static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010440rsplit(PyObject *self,
10441 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010442 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010443{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010444 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 void *buf1, *buf2;
10446 Py_ssize_t len1, len2;
10447 PyObject* out;
10448
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010449 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010450 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (PyUnicode_READY(self) == -1)
10453 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010456 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010458 if (PyUnicode_IS_ASCII(self))
10459 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010460 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010461 PyUnicode_GET_LENGTH(self), maxcount
10462 );
10463 else
10464 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010465 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010466 PyUnicode_GET_LENGTH(self), maxcount
10467 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 case PyUnicode_2BYTE_KIND:
10469 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010470 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 PyUnicode_GET_LENGTH(self), maxcount
10472 );
10473 case PyUnicode_4BYTE_KIND:
10474 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010475 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 PyUnicode_GET_LENGTH(self), maxcount
10477 );
10478 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010479 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 }
10481
10482 if (PyUnicode_READY(substring) == -1)
10483 return NULL;
10484
10485 kind1 = PyUnicode_KIND(self);
10486 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 len1 = PyUnicode_GET_LENGTH(self);
10488 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010489 if (kind1 < kind2 || len1 < len2) {
10490 out = PyList_New(1);
10491 if (out == NULL)
10492 return NULL;
10493 Py_INCREF(self);
10494 PyList_SET_ITEM(out, 0, self);
10495 return out;
10496 }
10497 buf1 = PyUnicode_DATA(self);
10498 buf2 = PyUnicode_DATA(substring);
10499 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010500 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010501 if (!buf2)
10502 return NULL;
10503 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010505 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010507 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10508 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010509 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010510 else
10511 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010512 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 break;
10514 case PyUnicode_2BYTE_KIND:
10515 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010516 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 break;
10518 case PyUnicode_4BYTE_KIND:
10519 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010520 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 break;
10522 default:
10523 out = NULL;
10524 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010525 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 PyMem_Free(buf2);
10527 return out;
10528}
10529
10530static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010531anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10532 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010534 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010536 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10537 return asciilib_find(buf1, len1, buf2, len2, offset);
10538 else
10539 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 case PyUnicode_2BYTE_KIND:
10541 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10542 case PyUnicode_4BYTE_KIND:
10543 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10544 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010545 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546}
10547
10548static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010549anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10550 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010552 switch (kind) {
10553 case PyUnicode_1BYTE_KIND:
10554 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10555 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10556 else
10557 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10558 case PyUnicode_2BYTE_KIND:
10559 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10560 case PyUnicode_4BYTE_KIND:
10561 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10562 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010563 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010564}
10565
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010566static void
10567replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10568 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10569{
10570 int kind = PyUnicode_KIND(u);
10571 void *data = PyUnicode_DATA(u);
10572 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10573 if (kind == PyUnicode_1BYTE_KIND) {
10574 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10575 (Py_UCS1 *)data + len,
10576 u1, u2, maxcount);
10577 }
10578 else if (kind == PyUnicode_2BYTE_KIND) {
10579 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10580 (Py_UCS2 *)data + len,
10581 u1, u2, maxcount);
10582 }
10583 else {
10584 assert(kind == PyUnicode_4BYTE_KIND);
10585 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10586 (Py_UCS4 *)data + len,
10587 u1, u2, maxcount);
10588 }
10589}
10590
Alexander Belopolsky40018472011-02-26 01:02:56 +000010591static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592replace(PyObject *self, PyObject *str1,
10593 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 PyObject *u;
10596 char *sbuf = PyUnicode_DATA(self);
10597 char *buf1 = PyUnicode_DATA(str1);
10598 char *buf2 = PyUnicode_DATA(str2);
10599 int srelease = 0, release1 = 0, release2 = 0;
10600 int skind = PyUnicode_KIND(self);
10601 int kind1 = PyUnicode_KIND(str1);
10602 int kind2 = PyUnicode_KIND(str2);
10603 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10604 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10605 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010606 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010607 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010609 if (slen < len1)
10610 goto nothing;
10611
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010613 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010614 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010615 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616
Victor Stinner59de0ee2011-10-07 10:01:28 +020010617 if (str1 == str2)
10618 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619
Victor Stinner49a0a212011-10-12 23:46:10 +020010620 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010621 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10622 if (maxchar < maxchar_str1)
10623 /* substring too wide to be present */
10624 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010625 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10626 /* Replacing str1 with str2 may cause a maxchar reduction in the
10627 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010628 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010629 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010632 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010634 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010636 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010637 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010638 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010639
Victor Stinner69ed0f42013-04-09 21:48:24 +020010640 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010641 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010642 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010643 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010644 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010646 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010648
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010649 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10650 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010651 }
10652 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 int rkind = skind;
10654 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010655 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 if (kind1 < rkind) {
10658 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010659 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 if (!buf1) goto error;
10661 release1 = 1;
10662 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010663 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010664 if (i < 0)
10665 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 if (rkind > kind2) {
10667 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010668 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 if (!buf2) goto error;
10670 release2 = 1;
10671 }
10672 else if (rkind < kind2) {
10673 /* widen self and buf1 */
10674 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010675 if (release1) {
10676 PyMem_Free(buf1);
10677 buf1 = PyUnicode_DATA(str1);
10678 release1 = 0;
10679 }
10680 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 if (!sbuf) goto error;
10682 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010683 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 if (!buf1) goto error;
10685 release1 = 1;
10686 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010687 u = PyUnicode_New(slen, maxchar);
10688 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010690 assert(PyUnicode_KIND(u) == rkind);
10691 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010692
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010693 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010694 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010695 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010697 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010699
10700 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010701 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010702 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010703 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010704 if (i == -1)
10705 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010706 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010708 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010712 }
10713 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010715 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 int rkind = skind;
10717 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010720 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010721 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 if (!buf1) goto error;
10723 release1 = 1;
10724 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010725 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010726 if (n == 0)
10727 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010729 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010730 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 if (!buf2) goto error;
10732 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010735 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010737 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 if (!sbuf) goto error;
10739 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010740 if (release1) {
10741 PyMem_Free(buf1);
10742 buf1 = PyUnicode_DATA(str1);
10743 release1 = 0;
10744 }
10745 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746 if (!buf1) goto error;
10747 release1 = 1;
10748 }
10749 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10750 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010751 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 PyErr_SetString(PyExc_OverflowError,
10753 "replace string is too long");
10754 goto error;
10755 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010756 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010757 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010758 _Py_INCREF_UNICODE_EMPTY();
10759 if (!unicode_empty)
10760 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010761 u = unicode_empty;
10762 goto done;
10763 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010764 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 PyErr_SetString(PyExc_OverflowError,
10766 "replace string is too long");
10767 goto error;
10768 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010769 u = PyUnicode_New(new_size, maxchar);
10770 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010772 assert(PyUnicode_KIND(u) == rkind);
10773 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 ires = i = 0;
10775 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010776 while (n-- > 0) {
10777 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010778 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010779 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010780 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010781 if (j == -1)
10782 break;
10783 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010784 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010785 memcpy(res + rkind * ires,
10786 sbuf + rkind * i,
10787 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010789 }
10790 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010792 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010794 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010798 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010800 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010801 memcpy(res + rkind * ires,
10802 sbuf + rkind * i,
10803 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010804 }
10805 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010806 /* interleave */
10807 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010808 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010810 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010812 if (--n <= 0)
10813 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010814 memcpy(res + rkind * ires,
10815 sbuf + rkind * i,
10816 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 ires++;
10818 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010819 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010820 memcpy(res + rkind * ires,
10821 sbuf + rkind * i,
10822 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010823 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010824 }
10825
10826 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010827 unicode_adjust_maxchar(&u);
10828 if (u == NULL)
10829 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010831
10832 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 if (srelease)
10834 PyMem_FREE(sbuf);
10835 if (release1)
10836 PyMem_FREE(buf1);
10837 if (release2)
10838 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010839 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010841
Benjamin Peterson29060642009-01-31 22:14:21 +000010842 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010843 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 if (srelease)
10845 PyMem_FREE(sbuf);
10846 if (release1)
10847 PyMem_FREE(buf1);
10848 if (release2)
10849 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010850 return unicode_result_unchanged(self);
10851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010852 error:
10853 if (srelease && sbuf)
10854 PyMem_FREE(sbuf);
10855 if (release1 && buf1)
10856 PyMem_FREE(buf1);
10857 if (release2 && buf2)
10858 PyMem_FREE(buf2);
10859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860}
10861
10862/* --- Unicode Object Methods --------------------------------------------- */
10863
INADA Naoki3ae20562017-01-16 20:41:20 +090010864/*[clinic input]
10865str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866
INADA Naoki3ae20562017-01-16 20:41:20 +090010867Return a version of the string where each word is titlecased.
10868
10869More specifically, words start with uppercased characters and all remaining
10870cased characters have lower case.
10871[clinic start generated code]*/
10872
10873static PyObject *
10874unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010875/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010877 if (PyUnicode_READY(self) == -1)
10878 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010879 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880}
10881
INADA Naoki3ae20562017-01-16 20:41:20 +090010882/*[clinic input]
10883str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010884
INADA Naoki3ae20562017-01-16 20:41:20 +090010885Return a capitalized version of the string.
10886
10887More specifically, make the first character have upper case and the rest lower
10888case.
10889[clinic start generated code]*/
10890
10891static PyObject *
10892unicode_capitalize_impl(PyObject *self)
10893/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010895 if (PyUnicode_READY(self) == -1)
10896 return NULL;
10897 if (PyUnicode_GET_LENGTH(self) == 0)
10898 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010899 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010900}
10901
INADA Naoki3ae20562017-01-16 20:41:20 +090010902/*[clinic input]
10903str.casefold as unicode_casefold
10904
10905Return a version of the string suitable for caseless comparisons.
10906[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010907
10908static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010909unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010910/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010911{
10912 if (PyUnicode_READY(self) == -1)
10913 return NULL;
10914 if (PyUnicode_IS_ASCII(self))
10915 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010916 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010917}
10918
10919
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010920/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010921
10922static int
10923convert_uc(PyObject *obj, void *addr)
10924{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010926
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010927 if (!PyUnicode_Check(obj)) {
10928 PyErr_Format(PyExc_TypeError,
10929 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010930 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010931 return 0;
10932 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010933 if (PyUnicode_READY(obj) < 0)
10934 return 0;
10935 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010936 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010937 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010938 return 0;
10939 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010940 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010941 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010942}
10943
INADA Naoki3ae20562017-01-16 20:41:20 +090010944/*[clinic input]
10945str.center as unicode_center
10946
10947 width: Py_ssize_t
10948 fillchar: Py_UCS4 = ' '
10949 /
10950
10951Return a centered string of length width.
10952
10953Padding is done using the specified fill character (default is a space).
10954[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955
10956static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010957unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10958/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010960 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961
Benjamin Petersonbac79492012-01-14 13:34:47 -050010962 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963 return NULL;
10964
Victor Stinnerc4b49542011-12-11 22:44:26 +010010965 if (PyUnicode_GET_LENGTH(self) >= width)
10966 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967
Victor Stinnerc4b49542011-12-11 22:44:26 +010010968 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969 left = marg / 2 + (marg & width & 1);
10970
Victor Stinner9310abb2011-10-05 00:59:23 +020010971 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972}
10973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974/* This function assumes that str1 and str2 are readied by the caller. */
10975
Marc-André Lemburge5034372000-08-08 08:04:29 +000010976static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010977unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010978{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010979#define COMPARE(TYPE1, TYPE2) \
10980 do { \
10981 TYPE1* p1 = (TYPE1 *)data1; \
10982 TYPE2* p2 = (TYPE2 *)data2; \
10983 TYPE1* end = p1 + len; \
10984 Py_UCS4 c1, c2; \
10985 for (; p1 != end; p1++, p2++) { \
10986 c1 = *p1; \
10987 c2 = *p2; \
10988 if (c1 != c2) \
10989 return (c1 < c2) ? -1 : 1; \
10990 } \
10991 } \
10992 while (0)
10993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 int kind1, kind2;
10995 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010996 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 kind1 = PyUnicode_KIND(str1);
10999 kind2 = PyUnicode_KIND(str2);
11000 data1 = PyUnicode_DATA(str1);
11001 data2 = PyUnicode_DATA(str2);
11002 len1 = PyUnicode_GET_LENGTH(str1);
11003 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011004 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011005
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011006 switch(kind1) {
11007 case PyUnicode_1BYTE_KIND:
11008 {
11009 switch(kind2) {
11010 case PyUnicode_1BYTE_KIND:
11011 {
11012 int cmp = memcmp(data1, data2, len);
11013 /* normalize result of memcmp() into the range [-1; 1] */
11014 if (cmp < 0)
11015 return -1;
11016 if (cmp > 0)
11017 return 1;
11018 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011019 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011020 case PyUnicode_2BYTE_KIND:
11021 COMPARE(Py_UCS1, Py_UCS2);
11022 break;
11023 case PyUnicode_4BYTE_KIND:
11024 COMPARE(Py_UCS1, Py_UCS4);
11025 break;
11026 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011027 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011028 }
11029 break;
11030 }
11031 case PyUnicode_2BYTE_KIND:
11032 {
11033 switch(kind2) {
11034 case PyUnicode_1BYTE_KIND:
11035 COMPARE(Py_UCS2, Py_UCS1);
11036 break;
11037 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011038 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011039 COMPARE(Py_UCS2, Py_UCS2);
11040 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011041 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011042 case PyUnicode_4BYTE_KIND:
11043 COMPARE(Py_UCS2, Py_UCS4);
11044 break;
11045 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011046 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011047 }
11048 break;
11049 }
11050 case PyUnicode_4BYTE_KIND:
11051 {
11052 switch(kind2) {
11053 case PyUnicode_1BYTE_KIND:
11054 COMPARE(Py_UCS4, Py_UCS1);
11055 break;
11056 case PyUnicode_2BYTE_KIND:
11057 COMPARE(Py_UCS4, Py_UCS2);
11058 break;
11059 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011060 {
11061#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11062 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11063 /* normalize result of wmemcmp() into the range [-1; 1] */
11064 if (cmp < 0)
11065 return -1;
11066 if (cmp > 0)
11067 return 1;
11068#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011069 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011070#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011071 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011072 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011073 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011074 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011075 }
11076 break;
11077 }
11078 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011079 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011080 }
11081
Victor Stinner770e19e2012-10-04 22:59:45 +020011082 if (len1 == len2)
11083 return 0;
11084 if (len1 < len2)
11085 return -1;
11086 else
11087 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011088
11089#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011090}
11091
Benjamin Peterson621b4302016-09-09 13:54:34 -070011092static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011093unicode_compare_eq(PyObject *str1, PyObject *str2)
11094{
11095 int kind;
11096 void *data1, *data2;
11097 Py_ssize_t len;
11098 int cmp;
11099
Victor Stinnere5567ad2012-10-23 02:48:49 +020011100 len = PyUnicode_GET_LENGTH(str1);
11101 if (PyUnicode_GET_LENGTH(str2) != len)
11102 return 0;
11103 kind = PyUnicode_KIND(str1);
11104 if (PyUnicode_KIND(str2) != kind)
11105 return 0;
11106 data1 = PyUnicode_DATA(str1);
11107 data2 = PyUnicode_DATA(str2);
11108
11109 cmp = memcmp(data1, data2, len * kind);
11110 return (cmp == 0);
11111}
11112
11113
Alexander Belopolsky40018472011-02-26 01:02:56 +000011114int
11115PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011117 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11118 if (PyUnicode_READY(left) == -1 ||
11119 PyUnicode_READY(right) == -1)
11120 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011121
11122 /* a string is equal to itself */
11123 if (left == right)
11124 return 0;
11125
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011126 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011127 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011128 PyErr_Format(PyExc_TypeError,
11129 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011130 Py_TYPE(left)->tp_name,
11131 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132 return -1;
11133}
11134
Martin v. Löwis5b222132007-06-10 09:51:05 +000011135int
11136PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 Py_ssize_t i;
11139 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011141 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142
Victor Stinner910337b2011-10-03 03:20:16 +020011143 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011144 if (!PyUnicode_IS_READY(uni)) {
11145 const wchar_t *ws = _PyUnicode_WSTR(uni);
11146 /* Compare Unicode string and source character set string */
11147 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11148 if (chr != ustr[i])
11149 return (chr < ustr[i]) ? -1 : 1;
11150 }
11151 /* This check keeps Python strings that end in '\0' from comparing equal
11152 to C strings identical up to that point. */
11153 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11154 return 1; /* uni is longer */
11155 if (ustr[i])
11156 return -1; /* str is longer */
11157 return 0;
11158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011160 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011161 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011162 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011163 size_t len, len2 = strlen(str);
11164 int cmp;
11165
11166 len = Py_MIN(len1, len2);
11167 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011168 if (cmp != 0) {
11169 if (cmp < 0)
11170 return -1;
11171 else
11172 return 1;
11173 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011174 if (len1 > len2)
11175 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011176 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011177 return -1; /* str is longer */
11178 return 0;
11179 }
11180 else {
11181 void *data = PyUnicode_DATA(uni);
11182 /* Compare Unicode string and source character set string */
11183 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011184 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011185 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11186 /* This check keeps Python strings that end in '\0' from comparing equal
11187 to C strings identical up to that point. */
11188 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11189 return 1; /* uni is longer */
11190 if (str[i])
11191 return -1; /* str is longer */
11192 return 0;
11193 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011194}
11195
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011196static int
11197non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11198{
11199 size_t i, len;
11200 const wchar_t *p;
11201 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11202 if (strlen(str) != len)
11203 return 0;
11204 p = _PyUnicode_WSTR(unicode);
11205 assert(p);
11206 for (i = 0; i < len; i++) {
11207 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011208 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011209 return 0;
11210 }
11211 return 1;
11212}
11213
11214int
11215_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11216{
11217 size_t len;
11218 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011219 assert(str);
11220#ifndef NDEBUG
11221 for (const char *p = str; *p; p++) {
11222 assert((unsigned char)*p < 128);
11223 }
11224#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011225 if (PyUnicode_READY(unicode) == -1) {
11226 /* Memory error or bad data */
11227 PyErr_Clear();
11228 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11229 }
11230 if (!PyUnicode_IS_ASCII(unicode))
11231 return 0;
11232 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11233 return strlen(str) == len &&
11234 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11235}
11236
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011237int
11238_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11239{
11240 PyObject *right_uni;
11241 Py_hash_t hash;
11242
11243 assert(_PyUnicode_CHECK(left));
11244 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011245#ifndef NDEBUG
11246 for (const char *p = right->string; *p; p++) {
11247 assert((unsigned char)*p < 128);
11248 }
11249#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011250
11251 if (PyUnicode_READY(left) == -1) {
11252 /* memory error or bad data */
11253 PyErr_Clear();
11254 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11255 }
11256
11257 if (!PyUnicode_IS_ASCII(left))
11258 return 0;
11259
11260 right_uni = _PyUnicode_FromId(right); /* borrowed */
11261 if (right_uni == NULL) {
11262 /* memory error or bad data */
11263 PyErr_Clear();
11264 return _PyUnicode_EqualToASCIIString(left, right->string);
11265 }
11266
11267 if (left == right_uni)
11268 return 1;
11269
11270 if (PyUnicode_CHECK_INTERNED(left))
11271 return 0;
11272
INADA Naoki7cc95f52018-01-28 02:07:09 +090011273 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011274 hash = _PyUnicode_HASH(left);
11275 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11276 return 0;
11277
11278 return unicode_compare_eq(left, right_uni);
11279}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011280
Alexander Belopolsky40018472011-02-26 01:02:56 +000011281PyObject *
11282PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011283{
11284 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011285
Victor Stinnere5567ad2012-10-23 02:48:49 +020011286 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11287 Py_RETURN_NOTIMPLEMENTED;
11288
11289 if (PyUnicode_READY(left) == -1 ||
11290 PyUnicode_READY(right) == -1)
11291 return NULL;
11292
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011293 if (left == right) {
11294 switch (op) {
11295 case Py_EQ:
11296 case Py_LE:
11297 case Py_GE:
11298 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011299 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011300 case Py_NE:
11301 case Py_LT:
11302 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011303 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011304 default:
11305 PyErr_BadArgument();
11306 return NULL;
11307 }
11308 }
11309 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011310 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011311 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011312 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011313 }
11314 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011315 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011316 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011317 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011318}
11319
Alexander Belopolsky40018472011-02-26 01:02:56 +000011320int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011321_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11322{
11323 return unicode_eq(aa, bb);
11324}
11325
11326int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011327PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011328{
Victor Stinner77282cb2013-04-14 19:22:47 +020011329 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 void *buf1, *buf2;
11331 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011332 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011333
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011334 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011335 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011336 "'in <string>' requires string as left operand, not %.100s",
11337 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011338 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011339 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011340 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011341 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011342 if (ensure_unicode(str) < 0)
11343 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011346 kind2 = PyUnicode_KIND(substr);
11347 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011348 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011350 len2 = PyUnicode_GET_LENGTH(substr);
11351 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011352 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011353 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011354 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011355 if (len2 == 1) {
11356 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11357 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011358 return result;
11359 }
11360 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011361 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011362 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011363 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011364 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365
Victor Stinner77282cb2013-04-14 19:22:47 +020011366 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 case PyUnicode_1BYTE_KIND:
11368 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11369 break;
11370 case PyUnicode_2BYTE_KIND:
11371 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11372 break;
11373 case PyUnicode_4BYTE_KIND:
11374 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11375 break;
11376 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011377 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011379
Victor Stinner77282cb2013-04-14 19:22:47 +020011380 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 PyMem_Free(buf2);
11382
Guido van Rossum403d68b2000-03-13 15:55:09 +000011383 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011384}
11385
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386/* Concat to string or Unicode object giving a new Unicode object. */
11387
Alexander Belopolsky40018472011-02-26 01:02:56 +000011388PyObject *
11389PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011391 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011392 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011393 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011395 if (ensure_unicode(left) < 0)
11396 return NULL;
11397
11398 if (!PyUnicode_Check(right)) {
11399 PyErr_Format(PyExc_TypeError,
11400 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011401 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011402 return NULL;
11403 }
11404 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406
11407 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011408 if (left == unicode_empty)
11409 return PyUnicode_FromObject(right);
11410 if (right == unicode_empty)
11411 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011413 left_len = PyUnicode_GET_LENGTH(left);
11414 right_len = PyUnicode_GET_LENGTH(right);
11415 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011416 PyErr_SetString(PyExc_OverflowError,
11417 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011418 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011419 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011420 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011421
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011422 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11423 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011424 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011427 result = PyUnicode_New(new_len, maxchar);
11428 if (result == NULL)
11429 return NULL;
11430 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11431 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11432 assert(_PyUnicode_CheckConsistency(result, 1));
11433 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434}
11435
Walter Dörwald1ab83302007-05-18 17:15:44 +000011436void
Victor Stinner23e56682011-10-03 03:54:37 +020011437PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011438{
Victor Stinner23e56682011-10-03 03:54:37 +020011439 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011440 Py_UCS4 maxchar, maxchar2;
11441 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011442
11443 if (p_left == NULL) {
11444 if (!PyErr_Occurred())
11445 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011446 return;
11447 }
Victor Stinner23e56682011-10-03 03:54:37 +020011448 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011449 if (right == NULL || left == NULL
11450 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011451 if (!PyErr_Occurred())
11452 PyErr_BadInternalCall();
11453 goto error;
11454 }
11455
Benjamin Petersonbac79492012-01-14 13:34:47 -050011456 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011457 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011458 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011459 goto error;
11460
Victor Stinner488fa492011-12-12 00:01:39 +010011461 /* Shortcuts */
11462 if (left == unicode_empty) {
11463 Py_DECREF(left);
11464 Py_INCREF(right);
11465 *p_left = right;
11466 return;
11467 }
11468 if (right == unicode_empty)
11469 return;
11470
11471 left_len = PyUnicode_GET_LENGTH(left);
11472 right_len = PyUnicode_GET_LENGTH(right);
11473 if (left_len > PY_SSIZE_T_MAX - right_len) {
11474 PyErr_SetString(PyExc_OverflowError,
11475 "strings are too large to concat");
11476 goto error;
11477 }
11478 new_len = left_len + right_len;
11479
11480 if (unicode_modifiable(left)
11481 && PyUnicode_CheckExact(right)
11482 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011483 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11484 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011485 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011486 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011487 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11488 {
11489 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011490 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011491 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011492
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011493 /* copy 'right' into the newly allocated area of 'left' */
11494 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011495 }
Victor Stinner488fa492011-12-12 00:01:39 +010011496 else {
11497 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11498 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011499 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011500
Victor Stinner488fa492011-12-12 00:01:39 +010011501 /* Concat the two Unicode strings */
11502 res = PyUnicode_New(new_len, maxchar);
11503 if (res == NULL)
11504 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011505 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11506 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011507 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011508 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011509 }
11510 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011511 return;
11512
11513error:
Victor Stinner488fa492011-12-12 00:01:39 +010011514 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011515}
11516
11517void
11518PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11519{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011520 PyUnicode_Append(pleft, right);
11521 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011522}
11523
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011524/*
11525Wraps stringlib_parse_args_finds() and additionally ensures that the
11526first argument is a unicode object.
11527*/
11528
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011529static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011530parse_args_finds_unicode(const char * function_name, PyObject *args,
11531 PyObject **substring,
11532 Py_ssize_t *start, Py_ssize_t *end)
11533{
11534 if(stringlib_parse_args_finds(function_name, args, substring,
11535 start, end)) {
11536 if (ensure_unicode(*substring) < 0)
11537 return 0;
11538 return 1;
11539 }
11540 return 0;
11541}
11542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011543PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011544 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011546Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011547string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011548interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549
11550static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011551unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011553 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011554 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011555 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011557 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 void *buf1, *buf2;
11559 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011561 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 kind1 = PyUnicode_KIND(self);
11565 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011566 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011567 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 len1 = PyUnicode_GET_LENGTH(self);
11570 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011572 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011573 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011574
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011575 buf1 = PyUnicode_DATA(self);
11576 buf2 = PyUnicode_DATA(substring);
11577 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011578 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011579 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011580 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011581 }
11582 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 case PyUnicode_1BYTE_KIND:
11584 iresult = ucs1lib_count(
11585 ((Py_UCS1*)buf1) + start, end - start,
11586 buf2, len2, PY_SSIZE_T_MAX
11587 );
11588 break;
11589 case PyUnicode_2BYTE_KIND:
11590 iresult = ucs2lib_count(
11591 ((Py_UCS2*)buf1) + start, end - start,
11592 buf2, len2, PY_SSIZE_T_MAX
11593 );
11594 break;
11595 case PyUnicode_4BYTE_KIND:
11596 iresult = ucs4lib_count(
11597 ((Py_UCS4*)buf1) + start, end - start,
11598 buf2, len2, PY_SSIZE_T_MAX
11599 );
11600 break;
11601 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011602 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603 }
11604
11605 result = PyLong_FromSsize_t(iresult);
11606
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011607 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610 return result;
11611}
11612
INADA Naoki3ae20562017-01-16 20:41:20 +090011613/*[clinic input]
11614str.encode as unicode_encode
11615
11616 encoding: str(c_default="NULL") = 'utf-8'
11617 The encoding in which to encode the string.
11618 errors: str(c_default="NULL") = 'strict'
11619 The error handling scheme to use for encoding errors.
11620 The default is 'strict' meaning that encoding errors raise a
11621 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11622 'xmlcharrefreplace' as well as any other name registered with
11623 codecs.register_error that can handle UnicodeEncodeErrors.
11624
11625Encode the string using the codec registered for encoding.
11626[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627
11628static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011629unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011630/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011632 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011633}
11634
INADA Naoki3ae20562017-01-16 20:41:20 +090011635/*[clinic input]
11636str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637
INADA Naoki3ae20562017-01-16 20:41:20 +090011638 tabsize: int = 8
11639
11640Return a copy where all tab characters are expanded using spaces.
11641
11642If tabsize is not given, a tab size of 8 characters is assumed.
11643[clinic start generated code]*/
11644
11645static PyObject *
11646unicode_expandtabs_impl(PyObject *self, int tabsize)
11647/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011649 Py_ssize_t i, j, line_pos, src_len, incr;
11650 Py_UCS4 ch;
11651 PyObject *u;
11652 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011653 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011654 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655
Antoine Pitrou22425222011-10-04 19:10:51 +020011656 if (PyUnicode_READY(self) == -1)
11657 return NULL;
11658
Thomas Wouters7e474022000-07-16 12:04:32 +000011659 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011660 src_len = PyUnicode_GET_LENGTH(self);
11661 i = j = line_pos = 0;
11662 kind = PyUnicode_KIND(self);
11663 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011664 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011665 for (; i < src_len; i++) {
11666 ch = PyUnicode_READ(kind, src_data, i);
11667 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011668 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011669 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011670 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011671 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011672 goto overflow;
11673 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011674 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011675 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011676 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011678 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011679 goto overflow;
11680 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011682 if (ch == '\n' || ch == '\r')
11683 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011685 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011686 if (!found)
11687 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011688
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011690 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691 if (!u)
11692 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011693 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694
Antoine Pitroue71d5742011-10-04 15:55:09 +020011695 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696
Antoine Pitroue71d5742011-10-04 15:55:09 +020011697 for (; i < src_len; i++) {
11698 ch = PyUnicode_READ(kind, src_data, i);
11699 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011700 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011701 incr = tabsize - (line_pos % tabsize);
11702 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011703 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011704 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011705 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011706 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011707 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011708 line_pos++;
11709 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011710 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011711 if (ch == '\n' || ch == '\r')
11712 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011714 }
11715 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011716 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011717
Antoine Pitroue71d5742011-10-04 15:55:09 +020011718 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011719 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11720 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721}
11722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011723PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725\n\
11726Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011727such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728arguments start and end are interpreted as in slice notation.\n\
11729\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011730Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731
11732static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011733unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011735 /* initialize variables to prevent gcc warning */
11736 PyObject *substring = NULL;
11737 Py_ssize_t start = 0;
11738 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011739 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011741 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011744 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011747 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 if (result == -2)
11750 return NULL;
11751
Christian Heimes217cfd12007-12-02 14:31:20 +000011752 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753}
11754
11755static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011756unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011758 void *data;
11759 enum PyUnicode_Kind kind;
11760 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011761
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011762 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011763 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011765 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011766 if (PyUnicode_READY(self) == -1) {
11767 return NULL;
11768 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011769 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11770 PyErr_SetString(PyExc_IndexError, "string index out of range");
11771 return NULL;
11772 }
11773 kind = PyUnicode_KIND(self);
11774 data = PyUnicode_DATA(self);
11775 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011776 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777}
11778
Guido van Rossumc2504932007-09-18 19:42:40 +000011779/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011780 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011781static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011782unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011784 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011785
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011786#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011787 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011788#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 if (_PyUnicode_HASH(self) != -1)
11790 return _PyUnicode_HASH(self);
11791 if (PyUnicode_READY(self) == -1)
11792 return -1;
animalizea1d14252019-01-02 20:16:06 +080011793
Christian Heimes985ecdc2013-11-20 11:46:18 +010011794 x = _Py_HashBytes(PyUnicode_DATA(self),
11795 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011797 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798}
11799
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011800PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011801 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802\n\
oldkaa0735f2018-02-02 16:52:55 +080011803Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011804such that sub is contained within S[start:end]. Optional\n\
11805arguments start and end are interpreted as in slice notation.\n\
11806\n\
11807Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
11809static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011812 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011813 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011814 PyObject *substring = NULL;
11815 Py_ssize_t start = 0;
11816 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011818 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011821 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011824 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 if (result == -2)
11827 return NULL;
11828
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829 if (result < 0) {
11830 PyErr_SetString(PyExc_ValueError, "substring not found");
11831 return NULL;
11832 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011833
Christian Heimes217cfd12007-12-02 14:31:20 +000011834 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835}
11836
INADA Naoki3ae20562017-01-16 20:41:20 +090011837/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011838str.isascii as unicode_isascii
11839
11840Return True if all characters in the string are ASCII, False otherwise.
11841
11842ASCII characters have code points in the range U+0000-U+007F.
11843Empty string is ASCII too.
11844[clinic start generated code]*/
11845
11846static PyObject *
11847unicode_isascii_impl(PyObject *self)
11848/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11849{
11850 if (PyUnicode_READY(self) == -1) {
11851 return NULL;
11852 }
11853 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11854}
11855
11856/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011857str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858
INADA Naoki3ae20562017-01-16 20:41:20 +090011859Return True if the string is a lowercase string, False otherwise.
11860
11861A string is lowercase if all cased characters in the string are lowercase and
11862there is at least one cased character in the string.
11863[clinic start generated code]*/
11864
11865static PyObject *
11866unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011867/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869 Py_ssize_t i, length;
11870 int kind;
11871 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872 int cased;
11873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 if (PyUnicode_READY(self) == -1)
11875 return NULL;
11876 length = PyUnicode_GET_LENGTH(self);
11877 kind = PyUnicode_KIND(self);
11878 data = PyUnicode_DATA(self);
11879
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 if (length == 1)
11882 return PyBool_FromLong(
11883 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011885 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011887 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011888
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 for (i = 0; i < length; i++) {
11891 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011892
Benjamin Peterson29060642009-01-31 22:14:21 +000011893 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011894 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011895 else if (!cased && Py_UNICODE_ISLOWER(ch))
11896 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011898 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899}
11900
INADA Naoki3ae20562017-01-16 20:41:20 +090011901/*[clinic input]
11902str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903
INADA Naoki3ae20562017-01-16 20:41:20 +090011904Return True if the string is an uppercase string, False otherwise.
11905
11906A string is uppercase if all cased characters in the string are uppercase and
11907there is at least one cased character in the string.
11908[clinic start generated code]*/
11909
11910static PyObject *
11911unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011912/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 Py_ssize_t i, length;
11915 int kind;
11916 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917 int cased;
11918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 if (PyUnicode_READY(self) == -1)
11920 return NULL;
11921 length = PyUnicode_GET_LENGTH(self);
11922 kind = PyUnicode_KIND(self);
11923 data = PyUnicode_DATA(self);
11924
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 if (length == 1)
11927 return PyBool_FromLong(
11928 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011930 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011932 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011933
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 for (i = 0; i < length; i++) {
11936 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011937
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011939 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 else if (!cased && Py_UNICODE_ISUPPER(ch))
11941 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011943 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944}
11945
INADA Naoki3ae20562017-01-16 20:41:20 +090011946/*[clinic input]
11947str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948
INADA Naoki3ae20562017-01-16 20:41:20 +090011949Return True if the string is a title-cased string, False otherwise.
11950
11951In a title-cased string, upper- and title-case characters may only
11952follow uncased characters and lowercase characters only cased ones.
11953[clinic start generated code]*/
11954
11955static PyObject *
11956unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011957/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 Py_ssize_t i, length;
11960 int kind;
11961 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962 int cased, previous_is_cased;
11963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 if (PyUnicode_READY(self) == -1)
11965 return NULL;
11966 length = PyUnicode_GET_LENGTH(self);
11967 kind = PyUnicode_KIND(self);
11968 data = PyUnicode_DATA(self);
11969
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 if (length == 1) {
11972 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11973 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11974 (Py_UNICODE_ISUPPER(ch) != 0));
11975 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011977 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011979 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011980
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981 cased = 0;
11982 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 for (i = 0; i < length; i++) {
11984 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011985
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11987 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011988 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 previous_is_cased = 1;
11990 cased = 1;
11991 }
11992 else if (Py_UNICODE_ISLOWER(ch)) {
11993 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011994 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 previous_is_cased = 1;
11996 cased = 1;
11997 }
11998 else
11999 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012001 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002}
12003
INADA Naoki3ae20562017-01-16 20:41:20 +090012004/*[clinic input]
12005str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006
INADA Naoki3ae20562017-01-16 20:41:20 +090012007Return True if the string is a whitespace string, False otherwise.
12008
12009A string is whitespace if all characters in the string are whitespace and there
12010is at least one character in the string.
12011[clinic start generated code]*/
12012
12013static PyObject *
12014unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012015/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 Py_ssize_t i, length;
12018 int kind;
12019 void *data;
12020
12021 if (PyUnicode_READY(self) == -1)
12022 return NULL;
12023 length = PyUnicode_GET_LENGTH(self);
12024 kind = PyUnicode_KIND(self);
12025 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 if (length == 1)
12029 return PyBool_FromLong(
12030 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012032 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012034 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 for (i = 0; i < length; i++) {
12037 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012038 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012039 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012041 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042}
12043
INADA Naoki3ae20562017-01-16 20:41:20 +090012044/*[clinic input]
12045str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012046
INADA Naoki3ae20562017-01-16 20:41:20 +090012047Return True if the string is an alphabetic string, False otherwise.
12048
12049A string is alphabetic if all characters in the string are alphabetic and there
12050is at least one character in the string.
12051[clinic start generated code]*/
12052
12053static PyObject *
12054unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012055/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012056{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057 Py_ssize_t i, length;
12058 int kind;
12059 void *data;
12060
12061 if (PyUnicode_READY(self) == -1)
12062 return NULL;
12063 length = PyUnicode_GET_LENGTH(self);
12064 kind = PyUnicode_KIND(self);
12065 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012066
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012067 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 if (length == 1)
12069 return PyBool_FromLong(
12070 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012071
12072 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012074 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 for (i = 0; i < length; i++) {
12077 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012078 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012079 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012080 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012081}
12082
INADA Naoki3ae20562017-01-16 20:41:20 +090012083/*[clinic input]
12084str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012085
INADA Naoki3ae20562017-01-16 20:41:20 +090012086Return True if the string is an alpha-numeric string, False otherwise.
12087
12088A string is alpha-numeric if all characters in the string are alpha-numeric and
12089there is at least one character in the string.
12090[clinic start generated code]*/
12091
12092static PyObject *
12093unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012094/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012095{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 int kind;
12097 void *data;
12098 Py_ssize_t len, i;
12099
12100 if (PyUnicode_READY(self) == -1)
12101 return NULL;
12102
12103 kind = PyUnicode_KIND(self);
12104 data = PyUnicode_DATA(self);
12105 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012106
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012107 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 if (len == 1) {
12109 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12110 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12111 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012112
12113 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012115 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 for (i = 0; i < len; i++) {
12118 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012119 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012120 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012121 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012122 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012123}
12124
INADA Naoki3ae20562017-01-16 20:41:20 +090012125/*[clinic input]
12126str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127
INADA Naoki3ae20562017-01-16 20:41:20 +090012128Return True if the string is a decimal string, False otherwise.
12129
12130A string is a decimal string if all characters in the string are decimal and
12131there is at least one character in the string.
12132[clinic start generated code]*/
12133
12134static PyObject *
12135unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012136/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 Py_ssize_t i, length;
12139 int kind;
12140 void *data;
12141
12142 if (PyUnicode_READY(self) == -1)
12143 return NULL;
12144 length = PyUnicode_GET_LENGTH(self);
12145 kind = PyUnicode_KIND(self);
12146 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 if (length == 1)
12150 return PyBool_FromLong(
12151 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012153 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012155 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157 for (i = 0; i < length; i++) {
12158 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012159 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012161 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162}
12163
INADA Naoki3ae20562017-01-16 20:41:20 +090012164/*[clinic input]
12165str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166
INADA Naoki3ae20562017-01-16 20:41:20 +090012167Return True if the string is a digit string, False otherwise.
12168
12169A string is a digit string if all characters in the string are digits and there
12170is at least one character in the string.
12171[clinic start generated code]*/
12172
12173static PyObject *
12174unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012175/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 Py_ssize_t i, length;
12178 int kind;
12179 void *data;
12180
12181 if (PyUnicode_READY(self) == -1)
12182 return NULL;
12183 length = PyUnicode_GET_LENGTH(self);
12184 kind = PyUnicode_KIND(self);
12185 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 if (length == 1) {
12189 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12190 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12191 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012193 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012195 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 for (i = 0; i < length; i++) {
12198 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012199 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012201 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202}
12203
INADA Naoki3ae20562017-01-16 20:41:20 +090012204/*[clinic input]
12205str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206
INADA Naoki3ae20562017-01-16 20:41:20 +090012207Return True if the string is a numeric string, False otherwise.
12208
12209A string is numeric if all characters in the string are numeric and there is at
12210least one character in the string.
12211[clinic start generated code]*/
12212
12213static PyObject *
12214unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012215/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 Py_ssize_t i, length;
12218 int kind;
12219 void *data;
12220
12221 if (PyUnicode_READY(self) == -1)
12222 return NULL;
12223 length = PyUnicode_GET_LENGTH(self);
12224 kind = PyUnicode_KIND(self);
12225 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 if (length == 1)
12229 return PyBool_FromLong(
12230 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012232 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012234 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 for (i = 0; i < length; i++) {
12237 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012238 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012240 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241}
12242
Martin v. Löwis47383402007-08-15 07:32:56 +000012243int
12244PyUnicode_IsIdentifier(PyObject *self)
12245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246 Py_ssize_t i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012247 int ready = PyUnicode_IS_READY(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012248
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012249 Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
12250 if (len == 0) {
12251 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 }
12254
Hai Shi3d235f52020-02-17 21:41:15 +080012255 int kind = 0;
12256 void *data = NULL;
Andy Lester933fc53f2020-02-20 22:51:47 -060012257 const wchar_t *wstr = NULL;
12258 Py_UCS4 ch;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012259 if (ready) {
12260 kind = PyUnicode_KIND(self);
12261 data = PyUnicode_DATA(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012262 ch = PyUnicode_READ(kind, data, 0);
12263 }
12264 else {
Andy Lester933fc53f2020-02-20 22:51:47 -060012265 wstr = _PyUnicode_WSTR(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012266 ch = wstr[0];
12267 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012268 /* PEP 3131 says that the first character must be in
12269 XID_Start and subsequent characters in XID_Continue,
12270 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012271 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012272 letters, digits, underscore). However, given the current
12273 definition of XID_Start and XID_Continue, it is sufficient
12274 to check just for these, except that _ must be allowed
12275 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012276 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012277 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012278 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012279
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012280 for (i = 1; i < len; i++) {
12281 if (ready) {
12282 ch = PyUnicode_READ(kind, data, i);
12283 }
12284 else {
12285 ch = wstr[i];
12286 }
12287 if (!_PyUnicode_IsXidContinue(ch)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012289 }
12290 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012291 return 1;
12292}
12293
INADA Naoki3ae20562017-01-16 20:41:20 +090012294/*[clinic input]
12295str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012296
INADA Naoki3ae20562017-01-16 20:41:20 +090012297Return True if the string is a valid Python identifier, False otherwise.
12298
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012299Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012300such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012301[clinic start generated code]*/
12302
12303static PyObject *
12304unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012305/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012306{
12307 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12308}
12309
INADA Naoki3ae20562017-01-16 20:41:20 +090012310/*[clinic input]
12311str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012312
INADA Naoki3ae20562017-01-16 20:41:20 +090012313Return True if the string is printable, False otherwise.
12314
12315A string is printable if all of its characters are considered printable in
12316repr() or if it is empty.
12317[clinic start generated code]*/
12318
12319static PyObject *
12320unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012321/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 Py_ssize_t i, length;
12324 int kind;
12325 void *data;
12326
12327 if (PyUnicode_READY(self) == -1)
12328 return NULL;
12329 length = PyUnicode_GET_LENGTH(self);
12330 kind = PyUnicode_KIND(self);
12331 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012332
12333 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 if (length == 1)
12335 return PyBool_FromLong(
12336 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 for (i = 0; i < length; i++) {
12339 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012340 Py_RETURN_FALSE;
12341 }
12342 }
12343 Py_RETURN_TRUE;
12344}
12345
INADA Naoki3ae20562017-01-16 20:41:20 +090012346/*[clinic input]
12347str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012348
INADA Naoki3ae20562017-01-16 20:41:20 +090012349 iterable: object
12350 /
12351
12352Concatenate any number of strings.
12353
Martin Panter91a88662017-01-24 00:30:06 +000012354The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012355The result is returned as a new string.
12356
12357Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12358[clinic start generated code]*/
12359
12360static PyObject *
12361unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012362/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363{
INADA Naoki3ae20562017-01-16 20:41:20 +090012364 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365}
12366
Martin v. Löwis18e16552006-02-15 17:27:45 +000012367static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012368unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 if (PyUnicode_READY(self) == -1)
12371 return -1;
12372 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373}
12374
INADA Naoki3ae20562017-01-16 20:41:20 +090012375/*[clinic input]
12376str.ljust as unicode_ljust
12377
12378 width: Py_ssize_t
12379 fillchar: Py_UCS4 = ' '
12380 /
12381
12382Return a left-justified string of length width.
12383
12384Padding is done using the specified fill character (default is a space).
12385[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386
12387static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012388unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12389/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012390{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012391 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393
Victor Stinnerc4b49542011-12-11 22:44:26 +010012394 if (PyUnicode_GET_LENGTH(self) >= width)
12395 return unicode_result_unchanged(self);
12396
12397 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012398}
12399
INADA Naoki3ae20562017-01-16 20:41:20 +090012400/*[clinic input]
12401str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012402
INADA Naoki3ae20562017-01-16 20:41:20 +090012403Return a copy of the string converted to lowercase.
12404[clinic start generated code]*/
12405
12406static PyObject *
12407unicode_lower_impl(PyObject *self)
12408/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012409{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012410 if (PyUnicode_READY(self) == -1)
12411 return NULL;
12412 if (PyUnicode_IS_ASCII(self))
12413 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012414 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415}
12416
/* Strip modes: they select which end(s) of the string are stripped and
   double as indexes into stripfuncnames[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  The table is only
   ever read, so both the strings and the pointers are const. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method corresponding to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012425
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012426/* externally visible for str.strip(unicode) */
12427PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012428_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 void *data;
12431 int kind;
12432 Py_ssize_t i, j, len;
12433 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012434 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12437 return NULL;
12438
12439 kind = PyUnicode_KIND(self);
12440 data = PyUnicode_DATA(self);
12441 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012442 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12444 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012445 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012446
Benjamin Peterson14339b62009-01-31 16:36:08 +000012447 i = 0;
12448 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012449 while (i < len) {
12450 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12451 if (!BLOOM(sepmask, ch))
12452 break;
12453 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12454 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012455 i++;
12456 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012457 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012458
Benjamin Peterson14339b62009-01-31 16:36:08 +000012459 j = len;
12460 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012461 j--;
12462 while (j >= i) {
12463 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12464 if (!BLOOM(sepmask, ch))
12465 break;
12466 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12467 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012468 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012469 }
12470
Benjamin Peterson29060642009-01-31 22:14:21 +000012471 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012472 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012473
Victor Stinner7931d9a2011-11-04 00:22:48 +010012474 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475}
12476
12477PyObject*
12478PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12479{
12480 unsigned char *data;
12481 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012482 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483
Victor Stinnerde636f32011-10-01 03:55:54 +020012484 if (PyUnicode_READY(self) == -1)
12485 return NULL;
12486
Victor Stinner684d5fd2012-05-03 02:32:34 +020012487 length = PyUnicode_GET_LENGTH(self);
12488 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012489
Victor Stinner684d5fd2012-05-03 02:32:34 +020012490 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012491 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492
Victor Stinnerde636f32011-10-01 03:55:54 +020012493 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012494 PyErr_SetString(PyExc_IndexError, "string index out of range");
12495 return NULL;
12496 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012497 if (start >= length || end < start)
12498 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012499
Victor Stinner684d5fd2012-05-03 02:32:34 +020012500 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012501 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012502 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012503 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012504 }
12505 else {
12506 kind = PyUnicode_KIND(self);
12507 data = PyUnicode_1BYTE_DATA(self);
12508 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012509 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012510 length);
12511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513
12514static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012515do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517 Py_ssize_t len, i, j;
12518
12519 if (PyUnicode_READY(self) == -1)
12520 return NULL;
12521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012523
Victor Stinnercc7af722013-04-09 22:39:24 +020012524 if (PyUnicode_IS_ASCII(self)) {
12525 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12526
12527 i = 0;
12528 if (striptype != RIGHTSTRIP) {
12529 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012530 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012531 if (!_Py_ascii_whitespace[ch])
12532 break;
12533 i++;
12534 }
12535 }
12536
12537 j = len;
12538 if (striptype != LEFTSTRIP) {
12539 j--;
12540 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012541 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012542 if (!_Py_ascii_whitespace[ch])
12543 break;
12544 j--;
12545 }
12546 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012547 }
12548 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012549 else {
12550 int kind = PyUnicode_KIND(self);
12551 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012552
Victor Stinnercc7af722013-04-09 22:39:24 +020012553 i = 0;
12554 if (striptype != RIGHTSTRIP) {
12555 while (i < len) {
12556 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12557 if (!Py_UNICODE_ISSPACE(ch))
12558 break;
12559 i++;
12560 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012561 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012562
12563 j = len;
12564 if (striptype != LEFTSTRIP) {
12565 j--;
12566 while (j >= i) {
12567 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12568 if (!Py_UNICODE_ISSPACE(ch))
12569 break;
12570 j--;
12571 }
12572 j++;
12573 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012574 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012575
Victor Stinner7931d9a2011-11-04 00:22:48 +010012576 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577}
12578
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012579
12580static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012581do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012582{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012583 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012584 if (PyUnicode_Check(sep))
12585 return _PyUnicode_XStrip(self, striptype, sep);
12586 else {
12587 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012588 "%s arg must be None or str",
12589 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012590 return NULL;
12591 }
12592 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012593
Benjamin Peterson14339b62009-01-31 16:36:08 +000012594 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012595}
12596
12597
INADA Naoki3ae20562017-01-16 20:41:20 +090012598/*[clinic input]
12599str.strip as unicode_strip
12600
12601 chars: object = None
12602 /
12603
Zachary Ware09895c22019-10-09 16:09:00 -050012604Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012605
12606If chars is given and not None, remove characters in chars instead.
12607[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012608
12609static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012610unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012611/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012612{
INADA Naoki3ae20562017-01-16 20:41:20 +090012613 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012614}
12615
12616
INADA Naoki3ae20562017-01-16 20:41:20 +090012617/*[clinic input]
12618str.lstrip as unicode_lstrip
12619
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012620 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012621 /
12622
12623Return a copy of the string with leading whitespace removed.
12624
12625If chars is given and not None, remove characters in chars instead.
12626[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012627
12628static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012629unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012630/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012631{
INADA Naoki3ae20562017-01-16 20:41:20 +090012632 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012633}
12634
12635
INADA Naoki3ae20562017-01-16 20:41:20 +090012636/*[clinic input]
12637str.rstrip as unicode_rstrip
12638
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012639 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012640 /
12641
12642Return a copy of the string with trailing whitespace removed.
12643
12644If chars is given and not None, remove characters in chars instead.
12645[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012646
12647static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012648unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012649/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012650{
INADA Naoki3ae20562017-01-16 20:41:20 +090012651 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012652}
12653
12654
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012656unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012658 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660
Serhiy Storchaka05997252013-01-26 12:14:02 +020012661 if (len < 1)
12662 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663
Victor Stinnerc4b49542011-12-11 22:44:26 +010012664 /* no repeat, return original string */
12665 if (len == 1)
12666 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012667
Benjamin Petersonbac79492012-01-14 13:34:47 -050012668 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 return NULL;
12670
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012671 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012672 PyErr_SetString(PyExc_OverflowError,
12673 "repeated string is too long");
12674 return NULL;
12675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012677
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012678 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679 if (!u)
12680 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012681 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 if (PyUnicode_GET_LENGTH(str) == 1) {
12684 const int kind = PyUnicode_KIND(str);
12685 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012686 if (kind == PyUnicode_1BYTE_KIND) {
12687 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012688 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012689 }
12690 else if (kind == PyUnicode_2BYTE_KIND) {
12691 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012692 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012693 ucs2[n] = fill_char;
12694 } else {
12695 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12696 assert(kind == PyUnicode_4BYTE_KIND);
12697 for (n = 0; n < len; ++n)
12698 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 }
12701 else {
12702 /* number of characters copied this far */
12703 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012704 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012706 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012708 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012710 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012711 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713 }
12714
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012715 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012716 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717}
12718
Alexander Belopolsky40018472011-02-26 01:02:56 +000012719PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012720PyUnicode_Replace(PyObject *str,
12721 PyObject *substr,
12722 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012723 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012725 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12726 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012727 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012728 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729}
12730
INADA Naoki3ae20562017-01-16 20:41:20 +090012731/*[clinic input]
12732str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733
INADA Naoki3ae20562017-01-16 20:41:20 +090012734 old: unicode
12735 new: unicode
12736 count: Py_ssize_t = -1
12737 Maximum number of occurrences to replace.
12738 -1 (the default value) means replace all occurrences.
12739 /
12740
12741Return a copy with all occurrences of substring old replaced by new.
12742
12743If the optional argument count is given, only the first count occurrences are
12744replaced.
12745[clinic start generated code]*/
12746
12747static PyObject *
12748unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12749 Py_ssize_t count)
12750/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012751{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012752 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012753 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012754 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012755}
12756
Alexander Belopolsky40018472011-02-26 01:02:56 +000012757static PyObject *
12758unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012760 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012761 Py_ssize_t isize;
12762 Py_ssize_t osize, squote, dquote, i, o;
12763 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012764 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012768 return NULL;
12769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012770 isize = PyUnicode_GET_LENGTH(unicode);
12771 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 /* Compute length of output, quote characters, and
12774 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012775 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 max = 127;
12777 squote = dquote = 0;
12778 ikind = PyUnicode_KIND(unicode);
12779 for (i = 0; i < isize; i++) {
12780 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012781 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012783 case '\'': squote++; break;
12784 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012786 incr = 2;
12787 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 default:
12789 /* Fast-path ASCII */
12790 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012791 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012793 ;
12794 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012797 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012799 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012801 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012803 if (osize > PY_SSIZE_T_MAX - incr) {
12804 PyErr_SetString(PyExc_OverflowError,
12805 "string is too long to generate repr");
12806 return NULL;
12807 }
12808 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809 }
12810
12811 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012812 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012814 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 if (dquote)
12816 /* Both squote and dquote present. Use squote,
12817 and escape them */
12818 osize += squote;
12819 else
12820 quote = '"';
12821 }
Victor Stinner55c08782013-04-14 18:45:39 +020012822 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823
12824 repr = PyUnicode_New(osize, max);
12825 if (repr == NULL)
12826 return NULL;
12827 okind = PyUnicode_KIND(repr);
12828 odata = PyUnicode_DATA(repr);
12829
12830 PyUnicode_WRITE(okind, odata, 0, quote);
12831 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012832 if (unchanged) {
12833 _PyUnicode_FastCopyCharacters(repr, 1,
12834 unicode, 0,
12835 isize);
12836 }
12837 else {
12838 for (i = 0, o = 1; i < isize; i++) {
12839 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012840
Victor Stinner55c08782013-04-14 18:45:39 +020012841 /* Escape quotes and backslashes */
12842 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012843 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012845 continue;
12846 }
12847
12848 /* Map special whitespace to '\t', \n', '\r' */
12849 if (ch == '\t') {
12850 PyUnicode_WRITE(okind, odata, o++, '\\');
12851 PyUnicode_WRITE(okind, odata, o++, 't');
12852 }
12853 else if (ch == '\n') {
12854 PyUnicode_WRITE(okind, odata, o++, '\\');
12855 PyUnicode_WRITE(okind, odata, o++, 'n');
12856 }
12857 else if (ch == '\r') {
12858 PyUnicode_WRITE(okind, odata, o++, '\\');
12859 PyUnicode_WRITE(okind, odata, o++, 'r');
12860 }
12861
12862 /* Map non-printable US ASCII to '\xhh' */
12863 else if (ch < ' ' || ch == 0x7F) {
12864 PyUnicode_WRITE(okind, odata, o++, '\\');
12865 PyUnicode_WRITE(okind, odata, o++, 'x');
12866 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12867 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12868 }
12869
12870 /* Copy ASCII characters as-is */
12871 else if (ch < 0x7F) {
12872 PyUnicode_WRITE(okind, odata, o++, ch);
12873 }
12874
12875 /* Non-ASCII characters */
12876 else {
12877 /* Map Unicode whitespace and control characters
12878 (categories Z* and C* except ASCII space)
12879 */
12880 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12881 PyUnicode_WRITE(okind, odata, o++, '\\');
12882 /* Map 8-bit characters to '\xhh' */
12883 if (ch <= 0xff) {
12884 PyUnicode_WRITE(okind, odata, o++, 'x');
12885 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12886 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12887 }
12888 /* Map 16-bit characters to '\uxxxx' */
12889 else if (ch <= 0xffff) {
12890 PyUnicode_WRITE(okind, odata, o++, 'u');
12891 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12892 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12893 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12894 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12895 }
12896 /* Map 21-bit characters to '\U00xxxxxx' */
12897 else {
12898 PyUnicode_WRITE(okind, odata, o++, 'U');
12899 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12900 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12901 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12902 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12903 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12904 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12905 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12906 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12907 }
12908 }
12909 /* Copy characters as-is */
12910 else {
12911 PyUnicode_WRITE(okind, odata, o++, ch);
12912 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012913 }
12914 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012915 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012917 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012918 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012919}
12920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012921PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012922 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012923\n\
12924Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012925such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012926arguments start and end are interpreted as in slice notation.\n\
12927\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012928Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012929
12930static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012933 /* initialize variables to prevent gcc warning */
12934 PyObject *substring = NULL;
12935 Py_ssize_t start = 0;
12936 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012937 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012939 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012940 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012941
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012942 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012943 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012944
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012945 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012947 if (result == -2)
12948 return NULL;
12949
Christian Heimes217cfd12007-12-02 14:31:20 +000012950 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951}
12952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012953PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012954 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012956Return the highest index in S where substring sub is found,\n\
12957such that sub is contained within S[start:end]. Optional\n\
12958arguments start and end are interpreted as in slice notation.\n\
12959\n\
12960Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961
12962static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012963unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012964{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012965 /* initialize variables to prevent gcc warning */
12966 PyObject *substring = NULL;
12967 Py_ssize_t start = 0;
12968 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012969 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012971 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012972 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012973
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012974 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012977 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 if (result == -2)
12980 return NULL;
12981
Guido van Rossumd57fd912000-03-10 22:53:23 +000012982 if (result < 0) {
12983 PyErr_SetString(PyExc_ValueError, "substring not found");
12984 return NULL;
12985 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986
Christian Heimes217cfd12007-12-02 14:31:20 +000012987 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012988}
12989
INADA Naoki3ae20562017-01-16 20:41:20 +090012990/*[clinic input]
12991str.rjust as unicode_rjust
12992
12993 width: Py_ssize_t
12994 fillchar: Py_UCS4 = ' '
12995 /
12996
12997Return a right-justified string of length width.
12998
12999Padding is done using the specified fill character (default is a space).
13000[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013001
13002static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013003unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13004/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013005{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013006 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013007 return NULL;
13008
Victor Stinnerc4b49542011-12-11 22:44:26 +010013009 if (PyUnicode_GET_LENGTH(self) >= width)
13010 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011
Victor Stinnerc4b49542011-12-11 22:44:26 +010013012 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013013}
13014
Alexander Belopolsky40018472011-02-26 01:02:56 +000013015PyObject *
13016PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013017{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013018 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013021 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022}
13023
INADA Naoki3ae20562017-01-16 20:41:20 +090013024/*[clinic input]
13025str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013026
INADA Naoki3ae20562017-01-16 20:41:20 +090013027 sep: object = None
13028 The delimiter according which to split the string.
13029 None (the default value) means split according to any whitespace,
13030 and discard empty strings from the result.
13031 maxsplit: Py_ssize_t = -1
13032 Maximum number of splits to do.
13033 -1 (the default value) means no limit.
13034
13035Return a list of the words in the string, using sep as the delimiter string.
13036[clinic start generated code]*/
13037
13038static PyObject *
13039unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13040/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013041{
INADA Naoki3ae20562017-01-16 20:41:20 +090013042 if (sep == Py_None)
13043 return split(self, NULL, maxsplit);
13044 if (PyUnicode_Check(sep))
13045 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013046
Victor Stinner998b8062018-09-12 00:23:25 +020013047 PyErr_Format(PyExc_TypeError,
13048 "must be str or None, not %.100s",
13049 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013050 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013051}
13052
Thomas Wouters477c8d52006-05-27 19:21:47 +000013053PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013054PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013055{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013056 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013057 int kind1, kind2;
13058 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013059 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013060
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013061 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013062 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013063
Victor Stinner14f8f022011-10-05 20:58:25 +020013064 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013065 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 len1 = PyUnicode_GET_LENGTH(str_obj);
13067 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013068 if (kind1 < kind2 || len1 < len2) {
13069 _Py_INCREF_UNICODE_EMPTY();
13070 if (!unicode_empty)
13071 out = NULL;
13072 else {
13073 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13074 Py_DECREF(unicode_empty);
13075 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013076 return out;
13077 }
13078 buf1 = PyUnicode_DATA(str_obj);
13079 buf2 = PyUnicode_DATA(sep_obj);
13080 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013081 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013082 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013083 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013086 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013087 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013088 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13089 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13090 else
13091 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092 break;
13093 case PyUnicode_2BYTE_KIND:
13094 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13095 break;
13096 case PyUnicode_4BYTE_KIND:
13097 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13098 break;
13099 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013100 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013101 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013102
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013103 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013104 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013105
13106 return out;
13107}
13108
13109
13110PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013111PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013112{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013113 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013114 int kind1, kind2;
13115 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013116 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013117
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013118 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013119 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013120
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013121 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 len1 = PyUnicode_GET_LENGTH(str_obj);
13124 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013125 if (kind1 < kind2 || len1 < len2) {
13126 _Py_INCREF_UNICODE_EMPTY();
13127 if (!unicode_empty)
13128 out = NULL;
13129 else {
13130 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13131 Py_DECREF(unicode_empty);
13132 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013133 return out;
13134 }
13135 buf1 = PyUnicode_DATA(str_obj);
13136 buf2 = PyUnicode_DATA(sep_obj);
13137 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013138 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013139 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013140 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013143 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013144 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013145 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13146 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13147 else
13148 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149 break;
13150 case PyUnicode_2BYTE_KIND:
13151 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13152 break;
13153 case PyUnicode_4BYTE_KIND:
13154 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13155 break;
13156 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013157 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013158 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013159
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013160 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013161 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013162
13163 return out;
13164}
13165
INADA Naoki3ae20562017-01-16 20:41:20 +090013166/*[clinic input]
13167str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013168
INADA Naoki3ae20562017-01-16 20:41:20 +090013169 sep: object
13170 /
13171
13172Partition the string into three parts using the given separator.
13173
13174This will search for the separator in the string. If the separator is found,
13175returns a 3-tuple containing the part before the separator, the separator
13176itself, and the part after it.
13177
13178If the separator is not found, returns a 3-tuple containing the original string
13179and two empty strings.
13180[clinic start generated code]*/
13181
13182static PyObject *
13183unicode_partition(PyObject *self, PyObject *sep)
13184/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013185{
INADA Naoki3ae20562017-01-16 20:41:20 +090013186 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013187}
13188
INADA Naoki3ae20562017-01-16 20:41:20 +090013189/*[clinic input]
13190str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013191
INADA Naoki3ae20562017-01-16 20:41:20 +090013192Partition the string into three parts using the given separator.
13193
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013194This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013195the separator is found, returns a 3-tuple containing the part before the
13196separator, the separator itself, and the part after it.
13197
13198If the separator is not found, returns a 3-tuple containing two empty strings
13199and the original string.
13200[clinic start generated code]*/
13201
13202static PyObject *
13203unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013204/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013205{
INADA Naoki3ae20562017-01-16 20:41:20 +090013206 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013207}
13208
Alexander Belopolsky40018472011-02-26 01:02:56 +000013209PyObject *
13210PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013211{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013212 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013213 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013214
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013215 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013216}
13217
INADA Naoki3ae20562017-01-16 20:41:20 +090013218/*[clinic input]
13219str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013220
INADA Naoki3ae20562017-01-16 20:41:20 +090013221Return a list of the words in the string, using sep as the delimiter string.
13222
13223Splits are done starting at the end of the string and working to the front.
13224[clinic start generated code]*/
13225
13226static PyObject *
13227unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13228/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013229{
INADA Naoki3ae20562017-01-16 20:41:20 +090013230 if (sep == Py_None)
13231 return rsplit(self, NULL, maxsplit);
13232 if (PyUnicode_Check(sep))
13233 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013234
Victor Stinner998b8062018-09-12 00:23:25 +020013235 PyErr_Format(PyExc_TypeError,
13236 "must be str or None, not %.100s",
13237 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013238 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013239}
13240
INADA Naoki3ae20562017-01-16 20:41:20 +090013241/*[clinic input]
13242str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013243
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013244 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013245
13246Return a list of the lines in the string, breaking at line boundaries.
13247
13248Line breaks are not included in the resulting list unless keepends is given and
13249true.
13250[clinic start generated code]*/
13251
13252static PyObject *
13253unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013254/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013256 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257}
13258
13259static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013260PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013261{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013262 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013263}
13264
INADA Naoki3ae20562017-01-16 20:41:20 +090013265/*[clinic input]
13266str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267
INADA Naoki3ae20562017-01-16 20:41:20 +090013268Convert uppercase characters to lowercase and lowercase characters to uppercase.
13269[clinic start generated code]*/
13270
13271static PyObject *
13272unicode_swapcase_impl(PyObject *self)
13273/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013275 if (PyUnicode_READY(self) == -1)
13276 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013277 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278}
13279
Larry Hastings61272b72014-01-07 12:41:53 -080013280/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013281
Larry Hastings31826802013-10-19 00:09:25 -070013282@staticmethod
13283str.maketrans as unicode_maketrans
13284
13285 x: object
13286
13287 y: unicode=NULL
13288
13289 z: unicode=NULL
13290
13291 /
13292
13293Return a translation table usable for str.translate().
13294
13295If there is only one argument, it must be a dictionary mapping Unicode
13296ordinals (integers) or characters to Unicode ordinals, strings or None.
13297Character keys will be then converted to ordinals.
13298If there are two arguments, they must be strings of equal length, and
13299in the resulting dictionary, each character in x will be mapped to the
13300character at the same position in y. If there is a third argument, it
13301must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013302[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013303
Larry Hastings31826802013-10-19 00:09:25 -070013304static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013305unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013306/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013307{
Georg Brandlceee0772007-11-27 23:48:05 +000013308 PyObject *new = NULL, *key, *value;
13309 Py_ssize_t i = 0;
13310 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013311
Georg Brandlceee0772007-11-27 23:48:05 +000013312 new = PyDict_New();
13313 if (!new)
13314 return NULL;
13315 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316 int x_kind, y_kind, z_kind;
13317 void *x_data, *y_data, *z_data;
13318
Georg Brandlceee0772007-11-27 23:48:05 +000013319 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013320 if (!PyUnicode_Check(x)) {
13321 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13322 "be a string if there is a second argument");
13323 goto err;
13324 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013325 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013326 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13327 "arguments must have equal length");
13328 goto err;
13329 }
13330 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 x_kind = PyUnicode_KIND(x);
13332 y_kind = PyUnicode_KIND(y);
13333 x_data = PyUnicode_DATA(x);
13334 y_data = PyUnicode_DATA(y);
13335 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13336 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013337 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013338 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013339 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013340 if (!value) {
13341 Py_DECREF(key);
13342 goto err;
13343 }
Georg Brandlceee0772007-11-27 23:48:05 +000013344 res = PyDict_SetItem(new, key, value);
13345 Py_DECREF(key);
13346 Py_DECREF(value);
13347 if (res < 0)
13348 goto err;
13349 }
13350 /* create entries for deleting chars in z */
13351 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013352 z_kind = PyUnicode_KIND(z);
13353 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013354 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013356 if (!key)
13357 goto err;
13358 res = PyDict_SetItem(new, key, Py_None);
13359 Py_DECREF(key);
13360 if (res < 0)
13361 goto err;
13362 }
13363 }
13364 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013365 int kind;
13366 void *data;
13367
Georg Brandlceee0772007-11-27 23:48:05 +000013368 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013369 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013370 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13371 "to maketrans it must be a dict");
13372 goto err;
13373 }
13374 /* copy entries into the new dict, converting string keys to int keys */
13375 while (PyDict_Next(x, &i, &key, &value)) {
13376 if (PyUnicode_Check(key)) {
13377 /* convert string keys to integer keys */
13378 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013379 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013380 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13381 "table must be of length 1");
13382 goto err;
13383 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013384 kind = PyUnicode_KIND(key);
13385 data = PyUnicode_DATA(key);
13386 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013387 if (!newkey)
13388 goto err;
13389 res = PyDict_SetItem(new, newkey, value);
13390 Py_DECREF(newkey);
13391 if (res < 0)
13392 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013393 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013394 /* just keep integer keys */
13395 if (PyDict_SetItem(new, key, value) < 0)
13396 goto err;
13397 } else {
13398 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13399 "be strings or integers");
13400 goto err;
13401 }
13402 }
13403 }
13404 return new;
13405 err:
13406 Py_DECREF(new);
13407 return NULL;
13408}
13409
INADA Naoki3ae20562017-01-16 20:41:20 +090013410/*[clinic input]
13411str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013412
INADA Naoki3ae20562017-01-16 20:41:20 +090013413 table: object
13414 Translation table, which must be a mapping of Unicode ordinals to
13415 Unicode ordinals, strings, or None.
13416 /
13417
13418Replace each character in the string using the given translation table.
13419
13420The table must implement lookup/indexing via __getitem__, for instance a
13421dictionary or list. If this operation raises LookupError, the character is
13422left untouched. Characters mapped to None are deleted.
13423[clinic start generated code]*/
13424
13425static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013426unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013427/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013429 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013430}
13431
INADA Naoki3ae20562017-01-16 20:41:20 +090013432/*[clinic input]
13433str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013434
INADA Naoki3ae20562017-01-16 20:41:20 +090013435Return a copy of the string converted to uppercase.
13436[clinic start generated code]*/
13437
13438static PyObject *
13439unicode_upper_impl(PyObject *self)
13440/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013441{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013442 if (PyUnicode_READY(self) == -1)
13443 return NULL;
13444 if (PyUnicode_IS_ASCII(self))
13445 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013446 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013447}
13448
INADA Naoki3ae20562017-01-16 20:41:20 +090013449/*[clinic input]
13450str.zfill as unicode_zfill
13451
13452 width: Py_ssize_t
13453 /
13454
13455Pad a numeric string with zeros on the left, to fill a field of the given width.
13456
13457The string is never truncated.
13458[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013459
13460static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013461unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013462/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013463{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013464 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013465 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013466 int kind;
13467 void *data;
13468 Py_UCS4 chr;
13469
Benjamin Petersonbac79492012-01-14 13:34:47 -050013470 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013472
Victor Stinnerc4b49542011-12-11 22:44:26 +010013473 if (PyUnicode_GET_LENGTH(self) >= width)
13474 return unicode_result_unchanged(self);
13475
13476 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013477
13478 u = pad(self, fill, 0, '0');
13479
Walter Dörwald068325e2002-04-15 13:36:47 +000013480 if (u == NULL)
13481 return NULL;
13482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013483 kind = PyUnicode_KIND(u);
13484 data = PyUnicode_DATA(u);
13485 chr = PyUnicode_READ(kind, data, fill);
13486
13487 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013488 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013489 PyUnicode_WRITE(kind, data, 0, chr);
13490 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013491 }
13492
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013493 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013494 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013495}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013496
#if 0
/* Compiled out by the surrounding #if 0: wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013505PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013507\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013508Return True if S starts with the specified prefix, False otherwise.\n\
13509With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013510With optional end, stop comparing S at that position.\n\
13511prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013512
13513static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013514unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013515 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013516{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013517 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013518 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013519 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013520 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013521 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013522
Jesus Ceaac451502011-04-20 17:09:23 +020013523 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013524 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013525 if (PyTuple_Check(subobj)) {
13526 Py_ssize_t i;
13527 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013528 substring = PyTuple_GET_ITEM(subobj, i);
13529 if (!PyUnicode_Check(substring)) {
13530 PyErr_Format(PyExc_TypeError,
13531 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013532 "not %.100s",
13533 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013534 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013535 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013536 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013537 if (result == -1)
13538 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013539 if (result) {
13540 Py_RETURN_TRUE;
13541 }
13542 }
13543 /* nothing matched */
13544 Py_RETURN_FALSE;
13545 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013546 if (!PyUnicode_Check(subobj)) {
13547 PyErr_Format(PyExc_TypeError,
13548 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013549 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013550 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013551 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013552 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013553 if (result == -1)
13554 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013555 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013556}
13557
13558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013559PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013560 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013561\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013562Return True if S ends with the specified suffix, False otherwise.\n\
13563With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013564With optional end, stop comparing S at that position.\n\
13565suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013566
13567static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013568unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013570{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013571 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013572 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013573 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013574 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013575 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013576
Jesus Ceaac451502011-04-20 17:09:23 +020013577 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013578 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013579 if (PyTuple_Check(subobj)) {
13580 Py_ssize_t i;
13581 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013582 substring = PyTuple_GET_ITEM(subobj, i);
13583 if (!PyUnicode_Check(substring)) {
13584 PyErr_Format(PyExc_TypeError,
13585 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013586 "not %.100s",
13587 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013588 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013589 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013590 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013591 if (result == -1)
13592 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013593 if (result) {
13594 Py_RETURN_TRUE;
13595 }
13596 }
13597 Py_RETURN_FALSE;
13598 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013599 if (!PyUnicode_Check(subobj)) {
13600 PyErr_Format(PyExc_TypeError,
13601 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013602 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013603 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013604 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013605 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013606 if (result == -1)
13607 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013608 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013609}
13610
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013611static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013612_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013613{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013614 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13615 writer->data = PyUnicode_DATA(writer->buffer);
13616
13617 if (!writer->readonly) {
13618 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013619 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013620 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013621 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013622 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13623 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13624 writer->kind = PyUnicode_WCHAR_KIND;
13625 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13626
Victor Stinner8f674cc2013-04-17 23:02:17 +020013627 /* Copy-on-write mode: set buffer size to 0 so
13628 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13629 * next write. */
13630 writer->size = 0;
13631 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013632}
13633
Victor Stinnerd3f08822012-05-29 12:57:52 +020013634void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013635_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013636{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013637 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013638
13639 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013640 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013641
13642 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13643 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13644 writer->kind = PyUnicode_WCHAR_KIND;
13645 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013646}
13647
Inada Naoki770847a2019-06-24 12:30:24 +090013648// Initialize _PyUnicodeWriter with initial buffer
13649static inline void
13650_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13651{
13652 memset(writer, 0, sizeof(*writer));
13653 writer->buffer = buffer;
13654 _PyUnicodeWriter_Update(writer);
13655 writer->min_length = writer->size;
13656}
13657
Victor Stinnerd3f08822012-05-29 12:57:52 +020013658int
13659_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13660 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013661{
13662 Py_ssize_t newlen;
13663 PyObject *newbuffer;
13664
Victor Stinner2740e462016-09-06 16:58:36 -070013665 assert(maxchar <= MAX_UNICODE);
13666
Victor Stinnerca9381e2015-09-22 00:58:32 +020013667 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013668 assert((maxchar > writer->maxchar && length >= 0)
13669 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013670
Victor Stinner202fdca2012-05-07 12:47:02 +020013671 if (length > PY_SSIZE_T_MAX - writer->pos) {
13672 PyErr_NoMemory();
13673 return -1;
13674 }
13675 newlen = writer->pos + length;
13676
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013677 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013678
Victor Stinnerd3f08822012-05-29 12:57:52 +020013679 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013680 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013681 if (writer->overallocate
13682 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13683 /* overallocate to limit the number of realloc() */
13684 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013685 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013686 if (newlen < writer->min_length)
13687 newlen = writer->min_length;
13688
Victor Stinnerd3f08822012-05-29 12:57:52 +020013689 writer->buffer = PyUnicode_New(newlen, maxchar);
13690 if (writer->buffer == NULL)
13691 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013692 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013693 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013694 if (writer->overallocate
13695 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13696 /* overallocate to limit the number of realloc() */
13697 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013698 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013699 if (newlen < writer->min_length)
13700 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013701
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013702 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013703 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013704 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013705 newbuffer = PyUnicode_New(newlen, maxchar);
13706 if (newbuffer == NULL)
13707 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013708 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13709 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013710 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013711 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013712 }
13713 else {
13714 newbuffer = resize_compact(writer->buffer, newlen);
13715 if (newbuffer == NULL)
13716 return -1;
13717 }
13718 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013719 }
13720 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013721 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013722 newbuffer = PyUnicode_New(writer->size, maxchar);
13723 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013724 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013725 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13726 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013727 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013728 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013729 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013730 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013731
13732#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013733}
13734
Victor Stinnerca9381e2015-09-22 00:58:32 +020013735int
13736_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13737 enum PyUnicode_Kind kind)
13738{
13739 Py_UCS4 maxchar;
13740
13741 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13742 assert(writer->kind < kind);
13743
13744 switch (kind)
13745 {
13746 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13747 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13748 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13749 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013750 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013751 }
13752
13753 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13754}
13755
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013756static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013757_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013758{
Victor Stinner2740e462016-09-06 16:58:36 -070013759 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013760 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13761 return -1;
13762 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13763 writer->pos++;
13764 return 0;
13765}
13766
13767int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013768_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13769{
13770 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13771}
13772
13773int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013774_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13775{
13776 Py_UCS4 maxchar;
13777 Py_ssize_t len;
13778
13779 if (PyUnicode_READY(str) == -1)
13780 return -1;
13781 len = PyUnicode_GET_LENGTH(str);
13782 if (len == 0)
13783 return 0;
13784 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13785 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013786 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013787 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013788 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013789 Py_INCREF(str);
13790 writer->buffer = str;
13791 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013792 writer->pos += len;
13793 return 0;
13794 }
13795 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13796 return -1;
13797 }
13798 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13799 str, 0, len);
13800 writer->pos += len;
13801 return 0;
13802}
13803
Victor Stinnere215d962012-10-06 23:03:36 +020013804int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013805_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13806 Py_ssize_t start, Py_ssize_t end)
13807{
13808 Py_UCS4 maxchar;
13809 Py_ssize_t len;
13810
13811 if (PyUnicode_READY(str) == -1)
13812 return -1;
13813
13814 assert(0 <= start);
13815 assert(end <= PyUnicode_GET_LENGTH(str));
13816 assert(start <= end);
13817
13818 if (end == 0)
13819 return 0;
13820
13821 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13822 return _PyUnicodeWriter_WriteStr(writer, str);
13823
13824 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13825 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13826 else
13827 maxchar = writer->maxchar;
13828 len = end - start;
13829
13830 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13831 return -1;
13832
13833 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13834 str, start, len);
13835 writer->pos += len;
13836 return 0;
13837}
13838
13839int
Victor Stinner4a587072013-11-19 12:54:53 +010013840_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13841 const char *ascii, Py_ssize_t len)
13842{
13843 if (len == -1)
13844 len = strlen(ascii);
13845
Andy Lestere6be9b52020-02-11 20:28:35 -060013846 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010013847
13848 if (writer->buffer == NULL && !writer->overallocate) {
13849 PyObject *str;
13850
13851 str = _PyUnicode_FromASCII(ascii, len);
13852 if (str == NULL)
13853 return -1;
13854
13855 writer->readonly = 1;
13856 writer->buffer = str;
13857 _PyUnicodeWriter_Update(writer);
13858 writer->pos += len;
13859 return 0;
13860 }
13861
13862 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13863 return -1;
13864
13865 switch (writer->kind)
13866 {
13867 case PyUnicode_1BYTE_KIND:
13868 {
13869 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13870 Py_UCS1 *data = writer->data;
13871
Christian Heimesf051e432016-09-13 20:22:02 +020013872 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013873 break;
13874 }
13875 case PyUnicode_2BYTE_KIND:
13876 {
13877 _PyUnicode_CONVERT_BYTES(
13878 Py_UCS1, Py_UCS2,
13879 ascii, ascii + len,
13880 (Py_UCS2 *)writer->data + writer->pos);
13881 break;
13882 }
13883 case PyUnicode_4BYTE_KIND:
13884 {
13885 _PyUnicode_CONVERT_BYTES(
13886 Py_UCS1, Py_UCS4,
13887 ascii, ascii + len,
13888 (Py_UCS4 *)writer->data + writer->pos);
13889 break;
13890 }
13891 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013892 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013893 }
13894
13895 writer->pos += len;
13896 return 0;
13897}
13898
13899int
13900_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13901 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013902{
13903 Py_UCS4 maxchar;
13904
Andy Lestere6be9b52020-02-11 20:28:35 -060013905 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020013906 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13907 return -1;
13908 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13909 writer->pos += len;
13910 return 0;
13911}
13912
Victor Stinnerd3f08822012-05-29 12:57:52 +020013913PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013914_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013915{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013916 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013917
Victor Stinnerd3f08822012-05-29 12:57:52 +020013918 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013919 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013920 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013921 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013922
13923 str = writer->buffer;
13924 writer->buffer = NULL;
13925
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013926 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013927 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13928 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013929 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013930
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013931 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13932 PyObject *str2;
13933 str2 = resize_compact(str, writer->pos);
13934 if (str2 == NULL) {
13935 Py_DECREF(str);
13936 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013937 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013938 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013939 }
13940
Victor Stinner15a0bd32013-07-08 22:29:55 +020013941 assert(_PyUnicode_CheckConsistency(str, 1));
13942 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013943}
13944
Victor Stinnerd3f08822012-05-29 12:57:52 +020013945void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013946_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013947{
13948 Py_CLEAR(writer->buffer);
13949}
13950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013951#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013952
13953PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013954 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013955\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013956Return a formatted version of S, using substitutions from args and kwargs.\n\
13957The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013958
Eric Smith27bbca62010-11-04 17:06:58 +000013959PyDoc_STRVAR(format_map__doc__,
13960 "S.format_map(mapping) -> str\n\
13961\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013962Return a formatted version of S, using substitutions from mapping.\n\
13963The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013964
INADA Naoki3ae20562017-01-16 20:41:20 +090013965/*[clinic input]
13966str.__format__ as unicode___format__
13967
13968 format_spec: unicode
13969 /
13970
13971Return a formatted version of the string as described by format_spec.
13972[clinic start generated code]*/
13973
Eric Smith4a7d76d2008-05-30 18:10:19 +000013974static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013975unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013976/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013977{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013978 _PyUnicodeWriter writer;
13979 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013980
Victor Stinnerd3f08822012-05-29 12:57:52 +020013981 if (PyUnicode_READY(self) == -1)
13982 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013983 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013984 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13985 self, format_spec, 0,
13986 PyUnicode_GET_LENGTH(format_spec));
13987 if (ret == -1) {
13988 _PyUnicodeWriter_Dealloc(&writer);
13989 return NULL;
13990 }
13991 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013992}
13993
INADA Naoki3ae20562017-01-16 20:41:20 +090013994/*[clinic input]
13995str.__sizeof__ as unicode_sizeof
13996
13997Return the size of the string in memory, in bytes.
13998[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013999
14000static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014001unicode_sizeof_impl(PyObject *self)
14002/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014003{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014004 Py_ssize_t size;
14005
14006 /* If it's a compact object, account for base structure +
14007 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014008 if (PyUnicode_IS_COMPACT_ASCII(self))
14009 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14010 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014011 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014012 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014013 else {
14014 /* If it is a two-block object, account for base object, and
14015 for character block if present. */
14016 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014017 if (_PyUnicode_DATA_ANY(self))
14018 size += (PyUnicode_GET_LENGTH(self) + 1) *
14019 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014020 }
14021 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014022 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014023 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14024 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14025 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14026 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014027
14028 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014029}
14030
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014031static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014032unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014033{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014034 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014035 if (!copy)
14036 return NULL;
14037 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014038}
14039
Guido van Rossumd57fd912000-03-10 22:53:23 +000014040static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014041 UNICODE_ENCODE_METHODDEF
14042 UNICODE_REPLACE_METHODDEF
14043 UNICODE_SPLIT_METHODDEF
14044 UNICODE_RSPLIT_METHODDEF
14045 UNICODE_JOIN_METHODDEF
14046 UNICODE_CAPITALIZE_METHODDEF
14047 UNICODE_CASEFOLD_METHODDEF
14048 UNICODE_TITLE_METHODDEF
14049 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014050 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014051 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014052 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014053 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014054 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014055 UNICODE_LJUST_METHODDEF
14056 UNICODE_LOWER_METHODDEF
14057 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014058 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14059 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014060 UNICODE_RJUST_METHODDEF
14061 UNICODE_RSTRIP_METHODDEF
14062 UNICODE_RPARTITION_METHODDEF
14063 UNICODE_SPLITLINES_METHODDEF
14064 UNICODE_STRIP_METHODDEF
14065 UNICODE_SWAPCASE_METHODDEF
14066 UNICODE_TRANSLATE_METHODDEF
14067 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014068 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14069 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090014070 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014071 UNICODE_ISLOWER_METHODDEF
14072 UNICODE_ISUPPER_METHODDEF
14073 UNICODE_ISTITLE_METHODDEF
14074 UNICODE_ISSPACE_METHODDEF
14075 UNICODE_ISDECIMAL_METHODDEF
14076 UNICODE_ISDIGIT_METHODDEF
14077 UNICODE_ISNUMERIC_METHODDEF
14078 UNICODE_ISALPHA_METHODDEF
14079 UNICODE_ISALNUM_METHODDEF
14080 UNICODE_ISIDENTIFIER_METHODDEF
14081 UNICODE_ISPRINTABLE_METHODDEF
14082 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014083 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014084 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014085 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014086 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014087 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014088#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014089 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014090 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014091#endif
14092
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014093 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014094 {NULL, NULL}
14095};
14096
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014097static PyObject *
14098unicode_mod(PyObject *v, PyObject *w)
14099{
Brian Curtindfc80e32011-08-10 20:28:54 -050014100 if (!PyUnicode_Check(v))
14101 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014102 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014103}
14104
14105static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014106 0, /*nb_add*/
14107 0, /*nb_subtract*/
14108 0, /*nb_multiply*/
14109 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014110};
14111
Guido van Rossumd57fd912000-03-10 22:53:23 +000014112static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014113 (lenfunc) unicode_length, /* sq_length */
14114 PyUnicode_Concat, /* sq_concat */
14115 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14116 (ssizeargfunc) unicode_getitem, /* sq_item */
14117 0, /* sq_slice */
14118 0, /* sq_ass_item */
14119 0, /* sq_ass_slice */
14120 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014121};
14122
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014123static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014124unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014125{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014126 if (PyUnicode_READY(self) == -1)
14127 return NULL;
14128
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014129 if (PyIndex_Check(item)) {
14130 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014131 if (i == -1 && PyErr_Occurred())
14132 return NULL;
14133 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014134 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014135 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014136 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014137 Py_ssize_t start, stop, step, slicelength, i;
14138 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014139 PyObject *result;
14140 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014141 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014142 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014143
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014144 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014145 return NULL;
14146 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014147 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14148 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014149
14150 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014151 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014152 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014153 slicelength == PyUnicode_GET_LENGTH(self)) {
14154 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014155 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014156 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014157 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014158 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014159 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014160 src_kind = PyUnicode_KIND(self);
14161 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014162 if (!PyUnicode_IS_ASCII(self)) {
14163 kind_limit = kind_maxchar_limit(src_kind);
14164 max_char = 0;
14165 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14166 ch = PyUnicode_READ(src_kind, src_data, cur);
14167 if (ch > max_char) {
14168 max_char = ch;
14169 if (max_char >= kind_limit)
14170 break;
14171 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014172 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014173 }
Victor Stinner55c99112011-10-13 01:17:06 +020014174 else
14175 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014176 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014177 if (result == NULL)
14178 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014179 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014180 dest_data = PyUnicode_DATA(result);
14181
14182 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014183 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14184 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014185 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014186 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014187 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014188 } else {
14189 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14190 return NULL;
14191 }
14192}
14193
14194static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014195 (lenfunc)unicode_length, /* mp_length */
14196 (binaryfunc)unicode_subscript, /* mp_subscript */
14197 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014198};
14199
Guido van Rossumd57fd912000-03-10 22:53:23 +000014200
Guido van Rossumd57fd912000-03-10 22:53:23 +000014201/* Helpers for PyUnicode_Format() */
14202
Victor Stinnera47082312012-10-04 02:19:54 +020014203struct unicode_formatter_t {
14204 PyObject *args;
14205 int args_owned;
14206 Py_ssize_t arglen, argidx;
14207 PyObject *dict;
14208
14209 enum PyUnicode_Kind fmtkind;
14210 Py_ssize_t fmtcnt, fmtpos;
14211 void *fmtdata;
14212 PyObject *fmtstr;
14213
14214 _PyUnicodeWriter writer;
14215};
14216
14217struct unicode_format_arg_t {
14218 Py_UCS4 ch;
14219 int flags;
14220 Py_ssize_t width;
14221 int prec;
14222 int sign;
14223};
14224
Guido van Rossumd57fd912000-03-10 22:53:23 +000014225static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014226unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014227{
Victor Stinnera47082312012-10-04 02:19:54 +020014228 Py_ssize_t argidx = ctx->argidx;
14229
14230 if (argidx < ctx->arglen) {
14231 ctx->argidx++;
14232 if (ctx->arglen < 0)
14233 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014234 else
Victor Stinnera47082312012-10-04 02:19:54 +020014235 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014236 }
14237 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014238 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014239 return NULL;
14240}
14241
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014242/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014243
Victor Stinnera47082312012-10-04 02:19:54 +020014244/* Format a float into the writer if the writer is not NULL, or into *p_output
14245 otherwise.
14246
14247 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014248static int
Victor Stinnera47082312012-10-04 02:19:54 +020014249formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14250 PyObject **p_output,
14251 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014252{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014253 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014254 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014255 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014256 int prec;
14257 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014258
Guido van Rossumd57fd912000-03-10 22:53:23 +000014259 x = PyFloat_AsDouble(v);
14260 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014261 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014262
Victor Stinnera47082312012-10-04 02:19:54 +020014263 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014264 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014265 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014266
Victor Stinnera47082312012-10-04 02:19:54 +020014267 if (arg->flags & F_ALT)
14268 dtoa_flags = Py_DTSF_ALT;
14269 else
14270 dtoa_flags = 0;
14271 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014272 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014273 return -1;
14274 len = strlen(p);
14275 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014276 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014277 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014278 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014279 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014280 }
14281 else
14282 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014283 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014284 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014285}
14286
Victor Stinnerd0880d52012-04-27 23:40:13 +020014287/* formatlong() emulates the format codes d, u, o, x and X, and
14288 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14289 * Python's regular ints.
14290 * Return value: a new PyUnicodeObject*, or NULL if error.
14291 * The output string is of the form
14292 * "-"? ("0x" | "0X")? digit+
14293 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14294 * set in flags. The case of hex digits will be correct,
14295 * There will be at least prec digits, zero-filled on the left if
14296 * necessary to get that many.
14297 * val object to be converted
14298 * flags bitmask of format flags; only F_ALT is looked at
14299 * prec minimum number of digits; 0-fill on left if needed
14300 * type a character in [duoxX]; u acts the same as d
14301 *
14302 * CAUTION: o, x and X conversions on regular ints can never
14303 * produce a '-' sign, but can for Python's unbounded ints.
14304 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014305PyObject *
14306_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014307{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014308 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014309 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014310 Py_ssize_t i;
14311 int sign; /* 1 if '-', else 0 */
14312 int len; /* number of characters */
14313 Py_ssize_t llen;
14314 int numdigits; /* len == numnondigits + numdigits */
14315 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014316
Victor Stinnerd0880d52012-04-27 23:40:13 +020014317 /* Avoid exceeding SSIZE_T_MAX */
14318 if (prec > INT_MAX-3) {
14319 PyErr_SetString(PyExc_OverflowError,
14320 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014321 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014322 }
14323
14324 assert(PyLong_Check(val));
14325
14326 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014327 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014328 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014329 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014330 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014331 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014332 /* int and int subclasses should print numerically when a numeric */
14333 /* format code is used (see issue18780) */
14334 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014335 break;
14336 case 'o':
14337 numnondigits = 2;
14338 result = PyNumber_ToBase(val, 8);
14339 break;
14340 case 'x':
14341 case 'X':
14342 numnondigits = 2;
14343 result = PyNumber_ToBase(val, 16);
14344 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014345 }
14346 if (!result)
14347 return NULL;
14348
14349 assert(unicode_modifiable(result));
14350 assert(PyUnicode_IS_READY(result));
14351 assert(PyUnicode_IS_ASCII(result));
14352
14353 /* To modify the string in-place, there can only be one reference. */
14354 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014355 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014356 PyErr_BadInternalCall();
14357 return NULL;
14358 }
14359 buf = PyUnicode_DATA(result);
14360 llen = PyUnicode_GET_LENGTH(result);
14361 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014362 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014363 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014364 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014365 return NULL;
14366 }
14367 len = (int)llen;
14368 sign = buf[0] == '-';
14369 numnondigits += sign;
14370 numdigits = len - numnondigits;
14371 assert(numdigits > 0);
14372
14373 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014374 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014375 (type == 'o' || type == 'x' || type == 'X'))) {
14376 assert(buf[sign] == '0');
14377 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14378 buf[sign+1] == 'o');
14379 numnondigits -= 2;
14380 buf += 2;
14381 len -= 2;
14382 if (sign)
14383 buf[0] = '-';
14384 assert(len == numnondigits + numdigits);
14385 assert(numdigits > 0);
14386 }
14387
14388 /* Fill with leading zeroes to meet minimum width. */
14389 if (prec > numdigits) {
14390 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14391 numnondigits + prec);
14392 char *b1;
14393 if (!r1) {
14394 Py_DECREF(result);
14395 return NULL;
14396 }
14397 b1 = PyBytes_AS_STRING(r1);
14398 for (i = 0; i < numnondigits; ++i)
14399 *b1++ = *buf++;
14400 for (i = 0; i < prec - numdigits; i++)
14401 *b1++ = '0';
14402 for (i = 0; i < numdigits; i++)
14403 *b1++ = *buf++;
14404 *b1 = '\0';
14405 Py_DECREF(result);
14406 result = r1;
14407 buf = PyBytes_AS_STRING(result);
14408 len = numnondigits + prec;
14409 }
14410
14411 /* Fix up case for hex conversions. */
14412 if (type == 'X') {
14413 /* Need to convert all lower case letters to upper case.
14414 and need to convert 0x to 0X (and -0x to -0X). */
14415 for (i = 0; i < len; i++)
14416 if (buf[i] >= 'a' && buf[i] <= 'x')
14417 buf[i] -= 'a'-'A';
14418 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014419 if (!PyUnicode_Check(result)
14420 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014421 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014422 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014423 Py_DECREF(result);
14424 result = unicode;
14425 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014426 else if (len != PyUnicode_GET_LENGTH(result)) {
14427 if (PyUnicode_Resize(&result, len) < 0)
14428 Py_CLEAR(result);
14429 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014430 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014431}
14432
Ethan Furmandf3ed242014-01-05 06:50:30 -080014433/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014434 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014435 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014436 * -1 and raise an exception on error */
14437static int
Victor Stinnera47082312012-10-04 02:19:54 +020014438mainformatlong(PyObject *v,
14439 struct unicode_format_arg_t *arg,
14440 PyObject **p_output,
14441 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014442{
14443 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014444 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014445
14446 if (!PyNumber_Check(v))
14447 goto wrongtype;
14448
Ethan Furman9ab74802014-03-21 06:38:46 -070014449 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014450 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014451 if (type == 'o' || type == 'x' || type == 'X') {
14452 iobj = PyNumber_Index(v);
14453 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014454 if (PyErr_ExceptionMatches(PyExc_TypeError))
14455 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014456 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014457 }
14458 }
14459 else {
14460 iobj = PyNumber_Long(v);
14461 if (iobj == NULL ) {
14462 if (PyErr_ExceptionMatches(PyExc_TypeError))
14463 goto wrongtype;
14464 return -1;
14465 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014466 }
14467 assert(PyLong_Check(iobj));
14468 }
14469 else {
14470 iobj = v;
14471 Py_INCREF(iobj);
14472 }
14473
14474 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014475 && arg->width == -1 && arg->prec == -1
14476 && !(arg->flags & (F_SIGN | F_BLANK))
14477 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014478 {
14479 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014480 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014481 int base;
14482
Victor Stinnera47082312012-10-04 02:19:54 +020014483 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014484 {
14485 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014486 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014487 case 'd':
14488 case 'i':
14489 case 'u':
14490 base = 10;
14491 break;
14492 case 'o':
14493 base = 8;
14494 break;
14495 case 'x':
14496 case 'X':
14497 base = 16;
14498 break;
14499 }
14500
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014501 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14502 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014503 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014504 }
14505 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014506 return 1;
14507 }
14508
Ethan Furmanb95b5612015-01-23 20:05:18 -080014509 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014510 Py_DECREF(iobj);
14511 if (res == NULL)
14512 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014513 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014514 return 0;
14515
14516wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014517 switch(type)
14518 {
14519 case 'o':
14520 case 'x':
14521 case 'X':
14522 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014523 "%%%c format: an integer is required, "
14524 "not %.200s",
14525 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014526 break;
14527 default:
14528 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014529 "%%%c format: a number is required, "
14530 "not %.200s",
14531 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014532 break;
14533 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014534 return -1;
14535}
14536
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014537static Py_UCS4
14538formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014539{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014540 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014541 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014542 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014543 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014544 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014545 goto onError;
14546 }
14547 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014548 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014549 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014550 /* make sure number is a type of integer */
14551 if (!PyLong_Check(v)) {
14552 iobj = PyNumber_Index(v);
14553 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014554 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014555 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014556 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014557 Py_DECREF(iobj);
14558 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014559 else {
14560 x = PyLong_AsLong(v);
14561 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014562 if (x == -1 && PyErr_Occurred())
14563 goto onError;
14564
Victor Stinner8faf8212011-12-08 22:14:11 +010014565 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014566 PyErr_SetString(PyExc_OverflowError,
14567 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014568 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014569 }
14570
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014571 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014572 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014573
Benjamin Peterson29060642009-01-31 22:14:21 +000014574 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014575 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014576 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014577 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014578}
14579
Victor Stinnera47082312012-10-04 02:19:54 +020014580/* Parse options of an argument: flags, width, precision.
14581 Handle also "%(name)" syntax.
14582
14583 Return 0 if the argument has been formatted into arg->str.
14584 Return 1 if the argument has been written into ctx->writer,
14585 Raise an exception and return -1 on error. */
14586static int
14587unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14588 struct unicode_format_arg_t *arg)
14589{
14590#define FORMAT_READ(ctx) \
14591 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14592
14593 PyObject *v;
14594
Victor Stinnera47082312012-10-04 02:19:54 +020014595 if (arg->ch == '(') {
14596 /* Get argument value from a dictionary. Example: "%(name)s". */
14597 Py_ssize_t keystart;
14598 Py_ssize_t keylen;
14599 PyObject *key;
14600 int pcount = 1;
14601
14602 if (ctx->dict == NULL) {
14603 PyErr_SetString(PyExc_TypeError,
14604 "format requires a mapping");
14605 return -1;
14606 }
14607 ++ctx->fmtpos;
14608 --ctx->fmtcnt;
14609 keystart = ctx->fmtpos;
14610 /* Skip over balanced parentheses */
14611 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14612 arg->ch = FORMAT_READ(ctx);
14613 if (arg->ch == ')')
14614 --pcount;
14615 else if (arg->ch == '(')
14616 ++pcount;
14617 ctx->fmtpos++;
14618 }
14619 keylen = ctx->fmtpos - keystart - 1;
14620 if (ctx->fmtcnt < 0 || pcount > 0) {
14621 PyErr_SetString(PyExc_ValueError,
14622 "incomplete format key");
14623 return -1;
14624 }
14625 key = PyUnicode_Substring(ctx->fmtstr,
14626 keystart, keystart + keylen);
14627 if (key == NULL)
14628 return -1;
14629 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014630 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014631 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014632 }
14633 ctx->args = PyObject_GetItem(ctx->dict, key);
14634 Py_DECREF(key);
14635 if (ctx->args == NULL)
14636 return -1;
14637 ctx->args_owned = 1;
14638 ctx->arglen = -1;
14639 ctx->argidx = -2;
14640 }
14641
14642 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014643 while (--ctx->fmtcnt >= 0) {
14644 arg->ch = FORMAT_READ(ctx);
14645 ctx->fmtpos++;
14646 switch (arg->ch) {
14647 case '-': arg->flags |= F_LJUST; continue;
14648 case '+': arg->flags |= F_SIGN; continue;
14649 case ' ': arg->flags |= F_BLANK; continue;
14650 case '#': arg->flags |= F_ALT; continue;
14651 case '0': arg->flags |= F_ZERO; continue;
14652 }
14653 break;
14654 }
14655
14656 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014657 if (arg->ch == '*') {
14658 v = unicode_format_getnextarg(ctx);
14659 if (v == NULL)
14660 return -1;
14661 if (!PyLong_Check(v)) {
14662 PyErr_SetString(PyExc_TypeError,
14663 "* wants int");
14664 return -1;
14665 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014666 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014667 if (arg->width == -1 && PyErr_Occurred())
14668 return -1;
14669 if (arg->width < 0) {
14670 arg->flags |= F_LJUST;
14671 arg->width = -arg->width;
14672 }
14673 if (--ctx->fmtcnt >= 0) {
14674 arg->ch = FORMAT_READ(ctx);
14675 ctx->fmtpos++;
14676 }
14677 }
14678 else if (arg->ch >= '0' && arg->ch <= '9') {
14679 arg->width = arg->ch - '0';
14680 while (--ctx->fmtcnt >= 0) {
14681 arg->ch = FORMAT_READ(ctx);
14682 ctx->fmtpos++;
14683 if (arg->ch < '0' || arg->ch > '9')
14684 break;
14685 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14686 mixing signed and unsigned comparison. Since arg->ch is between
14687 '0' and '9', casting to int is safe. */
14688 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14689 PyErr_SetString(PyExc_ValueError,
14690 "width too big");
14691 return -1;
14692 }
14693 arg->width = arg->width*10 + (arg->ch - '0');
14694 }
14695 }
14696
14697 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014698 if (arg->ch == '.') {
14699 arg->prec = 0;
14700 if (--ctx->fmtcnt >= 0) {
14701 arg->ch = FORMAT_READ(ctx);
14702 ctx->fmtpos++;
14703 }
14704 if (arg->ch == '*') {
14705 v = unicode_format_getnextarg(ctx);
14706 if (v == NULL)
14707 return -1;
14708 if (!PyLong_Check(v)) {
14709 PyErr_SetString(PyExc_TypeError,
14710 "* wants int");
14711 return -1;
14712 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014713 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014714 if (arg->prec == -1 && PyErr_Occurred())
14715 return -1;
14716 if (arg->prec < 0)
14717 arg->prec = 0;
14718 if (--ctx->fmtcnt >= 0) {
14719 arg->ch = FORMAT_READ(ctx);
14720 ctx->fmtpos++;
14721 }
14722 }
14723 else if (arg->ch >= '0' && arg->ch <= '9') {
14724 arg->prec = arg->ch - '0';
14725 while (--ctx->fmtcnt >= 0) {
14726 arg->ch = FORMAT_READ(ctx);
14727 ctx->fmtpos++;
14728 if (arg->ch < '0' || arg->ch > '9')
14729 break;
14730 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14731 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014732 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014733 return -1;
14734 }
14735 arg->prec = arg->prec*10 + (arg->ch - '0');
14736 }
14737 }
14738 }
14739
14740 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14741 if (ctx->fmtcnt >= 0) {
14742 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14743 if (--ctx->fmtcnt >= 0) {
14744 arg->ch = FORMAT_READ(ctx);
14745 ctx->fmtpos++;
14746 }
14747 }
14748 }
14749 if (ctx->fmtcnt < 0) {
14750 PyErr_SetString(PyExc_ValueError,
14751 "incomplete format");
14752 return -1;
14753 }
14754 return 0;
14755
14756#undef FORMAT_READ
14757}
14758
14759/* Format one argument. Supported conversion specifiers:
14760
14761 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014762 - "i", "d", "u": int or float
14763 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014764 - "e", "E", "f", "F", "g", "G": float
14765 - "c": int or str (1 character)
14766
Victor Stinner8dbd4212012-12-04 09:30:24 +010014767 When possible, the output is written directly into the Unicode writer
14768 (ctx->writer). A string is created when padding is required.
14769
Victor Stinnera47082312012-10-04 02:19:54 +020014770 Return 0 if the argument has been formatted into *p_str,
14771 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014772 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014773static int
14774unicode_format_arg_format(struct unicode_formatter_t *ctx,
14775 struct unicode_format_arg_t *arg,
14776 PyObject **p_str)
14777{
14778 PyObject *v;
14779 _PyUnicodeWriter *writer = &ctx->writer;
14780
14781 if (ctx->fmtcnt == 0)
14782 ctx->writer.overallocate = 0;
14783
Victor Stinnera47082312012-10-04 02:19:54 +020014784 v = unicode_format_getnextarg(ctx);
14785 if (v == NULL)
14786 return -1;
14787
Victor Stinnera47082312012-10-04 02:19:54 +020014788
14789 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014790 case 's':
14791 case 'r':
14792 case 'a':
14793 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14794 /* Fast path */
14795 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14796 return -1;
14797 return 1;
14798 }
14799
14800 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14801 *p_str = v;
14802 Py_INCREF(*p_str);
14803 }
14804 else {
14805 if (arg->ch == 's')
14806 *p_str = PyObject_Str(v);
14807 else if (arg->ch == 'r')
14808 *p_str = PyObject_Repr(v);
14809 else
14810 *p_str = PyObject_ASCII(v);
14811 }
14812 break;
14813
14814 case 'i':
14815 case 'd':
14816 case 'u':
14817 case 'o':
14818 case 'x':
14819 case 'X':
14820 {
14821 int ret = mainformatlong(v, arg, p_str, writer);
14822 if (ret != 0)
14823 return ret;
14824 arg->sign = 1;
14825 break;
14826 }
14827
14828 case 'e':
14829 case 'E':
14830 case 'f':
14831 case 'F':
14832 case 'g':
14833 case 'G':
14834 if (arg->width == -1 && arg->prec == -1
14835 && !(arg->flags & (F_SIGN | F_BLANK)))
14836 {
14837 /* Fast path */
14838 if (formatfloat(v, arg, NULL, writer) == -1)
14839 return -1;
14840 return 1;
14841 }
14842
14843 arg->sign = 1;
14844 if (formatfloat(v, arg, p_str, NULL) == -1)
14845 return -1;
14846 break;
14847
14848 case 'c':
14849 {
14850 Py_UCS4 ch = formatchar(v);
14851 if (ch == (Py_UCS4) -1)
14852 return -1;
14853 if (arg->width == -1 && arg->prec == -1) {
14854 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014855 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014856 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014857 return 1;
14858 }
14859 *p_str = PyUnicode_FromOrdinal(ch);
14860 break;
14861 }
14862
14863 default:
14864 PyErr_Format(PyExc_ValueError,
14865 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014866 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014867 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14868 (int)arg->ch,
14869 ctx->fmtpos - 1);
14870 return -1;
14871 }
14872 if (*p_str == NULL)
14873 return -1;
14874 assert (PyUnicode_Check(*p_str));
14875 return 0;
14876}
14877
14878static int
14879unicode_format_arg_output(struct unicode_formatter_t *ctx,
14880 struct unicode_format_arg_t *arg,
14881 PyObject *str)
14882{
14883 Py_ssize_t len;
14884 enum PyUnicode_Kind kind;
14885 void *pbuf;
14886 Py_ssize_t pindex;
14887 Py_UCS4 signchar;
14888 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014889 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014890 Py_ssize_t sublen;
14891 _PyUnicodeWriter *writer = &ctx->writer;
14892 Py_UCS4 fill;
14893
14894 fill = ' ';
14895 if (arg->sign && arg->flags & F_ZERO)
14896 fill = '0';
14897
14898 if (PyUnicode_READY(str) == -1)
14899 return -1;
14900
14901 len = PyUnicode_GET_LENGTH(str);
14902 if ((arg->width == -1 || arg->width <= len)
14903 && (arg->prec == -1 || arg->prec >= len)
14904 && !(arg->flags & (F_SIGN | F_BLANK)))
14905 {
14906 /* Fast path */
14907 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14908 return -1;
14909 return 0;
14910 }
14911
14912 /* Truncate the string for "s", "r" and "a" formats
14913 if the precision is set */
14914 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14915 if (arg->prec >= 0 && len > arg->prec)
14916 len = arg->prec;
14917 }
14918
14919 /* Adjust sign and width */
14920 kind = PyUnicode_KIND(str);
14921 pbuf = PyUnicode_DATA(str);
14922 pindex = 0;
14923 signchar = '\0';
14924 if (arg->sign) {
14925 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14926 if (ch == '-' || ch == '+') {
14927 signchar = ch;
14928 len--;
14929 pindex++;
14930 }
14931 else if (arg->flags & F_SIGN)
14932 signchar = '+';
14933 else if (arg->flags & F_BLANK)
14934 signchar = ' ';
14935 else
14936 arg->sign = 0;
14937 }
14938 if (arg->width < len)
14939 arg->width = len;
14940
14941 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014942 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014943 if (!(arg->flags & F_LJUST)) {
14944 if (arg->sign) {
14945 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014946 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014947 }
14948 else {
14949 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014950 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014951 }
14952 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014953 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14954 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014955 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014956 }
14957
Victor Stinnera47082312012-10-04 02:19:54 +020014958 buflen = arg->width;
14959 if (arg->sign && len == arg->width)
14960 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014961 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014962 return -1;
14963
14964 /* Write the sign if needed */
14965 if (arg->sign) {
14966 if (fill != ' ') {
14967 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14968 writer->pos += 1;
14969 }
14970 if (arg->width > len)
14971 arg->width--;
14972 }
14973
14974 /* Write the numeric prefix for "x", "X" and "o" formats
14975 if the alternate form is used.
14976 For example, write "0x" for the "%#x" format. */
14977 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14978 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14979 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14980 if (fill != ' ') {
14981 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14982 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14983 writer->pos += 2;
14984 pindex += 2;
14985 }
14986 arg->width -= 2;
14987 if (arg->width < 0)
14988 arg->width = 0;
14989 len -= 2;
14990 }
14991
14992 /* Pad left with the fill character if needed */
14993 if (arg->width > len && !(arg->flags & F_LJUST)) {
14994 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014995 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014996 writer->pos += sublen;
14997 arg->width = len;
14998 }
14999
15000 /* If padding with spaces: write sign if needed and/or numeric prefix if
15001 the alternate form is used */
15002 if (fill == ' ') {
15003 if (arg->sign) {
15004 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15005 writer->pos += 1;
15006 }
15007 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15008 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15009 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15010 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15011 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15012 writer->pos += 2;
15013 pindex += 2;
15014 }
15015 }
15016
15017 /* Write characters */
15018 if (len) {
15019 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15020 str, pindex, len);
15021 writer->pos += len;
15022 }
15023
15024 /* Pad right with the fill character if needed */
15025 if (arg->width > len) {
15026 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015027 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015028 writer->pos += sublen;
15029 }
15030 return 0;
15031}
15032
15033/* Helper of PyUnicode_Format(): format one arg.
15034 Return 0 on success, raise an exception and return -1 on error. */
15035static int
15036unicode_format_arg(struct unicode_formatter_t *ctx)
15037{
15038 struct unicode_format_arg_t arg;
15039 PyObject *str;
15040 int ret;
15041
Victor Stinner8dbd4212012-12-04 09:30:24 +010015042 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015043 if (arg.ch == '%') {
15044 ctx->fmtpos++;
15045 ctx->fmtcnt--;
15046 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15047 return -1;
15048 return 0;
15049 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015050 arg.flags = 0;
15051 arg.width = -1;
15052 arg.prec = -1;
15053 arg.sign = 0;
15054 str = NULL;
15055
Victor Stinnera47082312012-10-04 02:19:54 +020015056 ret = unicode_format_arg_parse(ctx, &arg);
15057 if (ret == -1)
15058 return -1;
15059
15060 ret = unicode_format_arg_format(ctx, &arg, &str);
15061 if (ret == -1)
15062 return -1;
15063
15064 if (ret != 1) {
15065 ret = unicode_format_arg_output(ctx, &arg, str);
15066 Py_DECREF(str);
15067 if (ret == -1)
15068 return -1;
15069 }
15070
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015071 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015072 PyErr_SetString(PyExc_TypeError,
15073 "not all arguments converted during string formatting");
15074 return -1;
15075 }
15076 return 0;
15077}
15078
Alexander Belopolsky40018472011-02-26 01:02:56 +000015079PyObject *
15080PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015081{
Victor Stinnera47082312012-10-04 02:19:54 +020015082 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015083
Guido van Rossumd57fd912000-03-10 22:53:23 +000015084 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015085 PyErr_BadInternalCall();
15086 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015087 }
Victor Stinnera47082312012-10-04 02:19:54 +020015088
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015089 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015090 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015091
15092 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015093 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15094 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15095 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15096 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015097
Victor Stinner8f674cc2013-04-17 23:02:17 +020015098 _PyUnicodeWriter_Init(&ctx.writer);
15099 ctx.writer.min_length = ctx.fmtcnt + 100;
15100 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015101
Guido van Rossumd57fd912000-03-10 22:53:23 +000015102 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015103 ctx.arglen = PyTuple_Size(args);
15104 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015105 }
15106 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015107 ctx.arglen = -1;
15108 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015109 }
Victor Stinnera47082312012-10-04 02:19:54 +020015110 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015111 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015112 ctx.dict = args;
15113 else
15114 ctx.dict = NULL;
15115 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015116
Victor Stinnera47082312012-10-04 02:19:54 +020015117 while (--ctx.fmtcnt >= 0) {
15118 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015119 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015120
15121 nonfmtpos = ctx.fmtpos++;
15122 while (ctx.fmtcnt >= 0 &&
15123 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15124 ctx.fmtpos++;
15125 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015126 }
Victor Stinnera47082312012-10-04 02:19:54 +020015127 if (ctx.fmtcnt < 0) {
15128 ctx.fmtpos--;
15129 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015130 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015131
Victor Stinnercfc4c132013-04-03 01:48:39 +020015132 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15133 nonfmtpos, ctx.fmtpos) < 0)
15134 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015135 }
15136 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015137 ctx.fmtpos++;
15138 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015139 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015140 }
15141 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015142
Victor Stinnera47082312012-10-04 02:19:54 +020015143 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015144 PyErr_SetString(PyExc_TypeError,
15145 "not all arguments converted during string formatting");
15146 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015147 }
15148
Victor Stinnera47082312012-10-04 02:19:54 +020015149 if (ctx.args_owned) {
15150 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015151 }
Victor Stinnera47082312012-10-04 02:19:54 +020015152 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015153
Benjamin Peterson29060642009-01-31 22:14:21 +000015154 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015155 _PyUnicodeWriter_Dealloc(&ctx.writer);
15156 if (ctx.args_owned) {
15157 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015158 }
15159 return NULL;
15160}
15161
Jeremy Hylton938ace62002-07-17 16:30:39 +000015162static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015163unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15164
Tim Peters6d6c1a32001-08-02 04:15:00 +000015165static PyObject *
15166unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15167{
Benjamin Peterson29060642009-01-31 22:14:21 +000015168 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015169 static char *kwlist[] = {"object", "encoding", "errors", 0};
15170 char *encoding = NULL;
15171 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015172
Benjamin Peterson14339b62009-01-31 16:36:08 +000015173 if (type != &PyUnicode_Type)
15174 return unicode_subtype_new(type, args, kwds);
15175 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015176 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015177 return NULL;
15178 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015179 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015180 if (encoding == NULL && errors == NULL)
15181 return PyObject_Str(x);
15182 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015183 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015184}
15185
Guido van Rossume023fe02001-08-30 03:12:59 +000015186static PyObject *
15187unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15188{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015189 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015190 Py_ssize_t length, char_size;
15191 int share_wstr, share_utf8;
15192 unsigned int kind;
15193 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015194
Benjamin Peterson14339b62009-01-31 16:36:08 +000015195 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015196
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015197 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015198 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015199 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015200 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015201 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015202 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015203 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015204 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015205
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015206 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015207 if (self == NULL) {
15208 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015209 return NULL;
15210 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015211 kind = PyUnicode_KIND(unicode);
15212 length = PyUnicode_GET_LENGTH(unicode);
15213
15214 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015215#ifdef Py_DEBUG
15216 _PyUnicode_HASH(self) = -1;
15217#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015218 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015219#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015220 _PyUnicode_STATE(self).interned = 0;
15221 _PyUnicode_STATE(self).kind = kind;
15222 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015223 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015224 _PyUnicode_STATE(self).ready = 1;
15225 _PyUnicode_WSTR(self) = NULL;
15226 _PyUnicode_UTF8_LENGTH(self) = 0;
15227 _PyUnicode_UTF8(self) = NULL;
15228 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015229 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015230
15231 share_utf8 = 0;
15232 share_wstr = 0;
15233 if (kind == PyUnicode_1BYTE_KIND) {
15234 char_size = 1;
15235 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15236 share_utf8 = 1;
15237 }
15238 else if (kind == PyUnicode_2BYTE_KIND) {
15239 char_size = 2;
15240 if (sizeof(wchar_t) == 2)
15241 share_wstr = 1;
15242 }
15243 else {
15244 assert(kind == PyUnicode_4BYTE_KIND);
15245 char_size = 4;
15246 if (sizeof(wchar_t) == 4)
15247 share_wstr = 1;
15248 }
15249
15250 /* Ensure we won't overflow the length. */
15251 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15252 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015253 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015254 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015255 data = PyObject_MALLOC((length + 1) * char_size);
15256 if (data == NULL) {
15257 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015258 goto onError;
15259 }
15260
Victor Stinnerc3c74152011-10-02 20:39:55 +020015261 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015262 if (share_utf8) {
15263 _PyUnicode_UTF8_LENGTH(self) = length;
15264 _PyUnicode_UTF8(self) = data;
15265 }
15266 if (share_wstr) {
15267 _PyUnicode_WSTR_LENGTH(self) = length;
15268 _PyUnicode_WSTR(self) = (wchar_t *)data;
15269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015270
Christian Heimesf051e432016-09-13 20:22:02 +020015271 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015272 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015273 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015274#ifdef Py_DEBUG
15275 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15276#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015277 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015278 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015279
15280onError:
15281 Py_DECREF(unicode);
15282 Py_DECREF(self);
15283 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015284}
15285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015286PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015287"str(object='') -> str\n\
15288str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015289\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015290Create a new string object from the given object. If encoding or\n\
15291errors is specified, then the object must expose a data buffer\n\
15292that will be decoded using the given encoding and error handler.\n\
15293Otherwise, returns the result of object.__str__() (if defined)\n\
15294or repr(object).\n\
15295encoding defaults to sys.getdefaultencoding().\n\
15296errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015297
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015298static PyObject *unicode_iter(PyObject *seq);
15299
Guido van Rossumd57fd912000-03-10 22:53:23 +000015300PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015301 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015302 "str", /* tp_name */
15303 sizeof(PyUnicodeObject), /* tp_basicsize */
15304 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015305 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015306 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015307 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015308 0, /* tp_getattr */
15309 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015310 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015311 unicode_repr, /* tp_repr */
15312 &unicode_as_number, /* tp_as_number */
15313 &unicode_as_sequence, /* tp_as_sequence */
15314 &unicode_as_mapping, /* tp_as_mapping */
15315 (hashfunc) unicode_hash, /* tp_hash*/
15316 0, /* tp_call*/
15317 (reprfunc) unicode_str, /* tp_str */
15318 PyObject_GenericGetAttr, /* tp_getattro */
15319 0, /* tp_setattro */
15320 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015321 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015322 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15323 unicode_doc, /* tp_doc */
15324 0, /* tp_traverse */
15325 0, /* tp_clear */
15326 PyUnicode_RichCompare, /* tp_richcompare */
15327 0, /* tp_weaklistoffset */
15328 unicode_iter, /* tp_iter */
15329 0, /* tp_iternext */
15330 unicode_methods, /* tp_methods */
15331 0, /* tp_members */
15332 0, /* tp_getset */
15333 &PyBaseObject_Type, /* tp_base */
15334 0, /* tp_dict */
15335 0, /* tp_descr_get */
15336 0, /* tp_descr_set */
15337 0, /* tp_dictoffset */
15338 0, /* tp_init */
15339 0, /* tp_alloc */
15340 unicode_new, /* tp_new */
15341 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015342};
15343
15344/* Initialize the Unicode implementation */
15345
Victor Stinner331a6a52019-05-27 16:39:22 +020015346PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015347_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015348{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015349 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015350 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015351 0x000A, /* LINE FEED */
15352 0x000D, /* CARRIAGE RETURN */
15353 0x001C, /* FILE SEPARATOR */
15354 0x001D, /* GROUP SEPARATOR */
15355 0x001E, /* RECORD SEPARATOR */
15356 0x0085, /* NEXT LINE */
15357 0x2028, /* LINE SEPARATOR */
15358 0x2029, /* PARAGRAPH SEPARATOR */
15359 };
15360
Fred Drakee4315f52000-05-09 19:53:39 +000015361 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015362 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015363 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015364 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015365 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015366 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015367
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015368 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015369 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015370 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015371
15372 /* initialize the linebreak bloom filter */
15373 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015374 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015375 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015376
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015377 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015378 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015379 }
15380 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015381 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015382 }
15383 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015384 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015385 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015386 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015387}
15388
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015389
Walter Dörwald16807132007-05-25 13:52:07 +000015390void
15391PyUnicode_InternInPlace(PyObject **p)
15392{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015393 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015394 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015395#ifdef Py_DEBUG
15396 assert(s != NULL);
15397 assert(_PyUnicode_CHECK(s));
15398#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015399 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015400 return;
15401#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015402 /* If it's a subclass, we don't really know what putting
15403 it in the interned dict might do. */
15404 if (!PyUnicode_CheckExact(s))
15405 return;
15406 if (PyUnicode_CHECK_INTERNED(s))
15407 return;
15408 if (interned == NULL) {
15409 interned = PyDict_New();
15410 if (interned == NULL) {
15411 PyErr_Clear(); /* Don't leave an exception */
15412 return;
15413 }
15414 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015415 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015416 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015417 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015418 if (t == NULL) {
15419 PyErr_Clear();
15420 return;
15421 }
15422 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015423 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015424 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015425 return;
15426 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015427 /* The two references in interned are not counted by refcnt.
15428 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015429 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015430 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015431}
15432
15433void
15434PyUnicode_InternImmortal(PyObject **p)
15435{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015436 PyUnicode_InternInPlace(p);
15437 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015438 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015439 Py_INCREF(*p);
15440 }
Walter Dörwald16807132007-05-25 13:52:07 +000015441}
15442
15443PyObject *
15444PyUnicode_InternFromString(const char *cp)
15445{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015446 PyObject *s = PyUnicode_FromString(cp);
15447 if (s == NULL)
15448 return NULL;
15449 PyUnicode_InternInPlace(&s);
15450 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015451}
15452
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015453
15454#if defined(WITH_VALGRIND) || defined(__INSURE__)
15455static void
15456unicode_release_interned(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015457{
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015458 if (interned == NULL || !PyDict_Check(interned)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015459 return;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015460 }
15461 PyObject *keys = PyDict_Keys(interned);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015462 if (keys == NULL || !PyList_Check(keys)) {
15463 PyErr_Clear();
15464 return;
15465 }
Walter Dörwald16807132007-05-25 13:52:07 +000015466
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015467 /* Since unicode_release_interned() is intended to help a leak
Benjamin Peterson14339b62009-01-31 16:36:08 +000015468 detector, interned unicode strings are not forcibly deallocated;
15469 rather, we give them their stolen references back, and then clear
15470 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015471
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015472 Py_ssize_t n = PyList_GET_SIZE(keys);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015473#ifdef INTERNED_STATS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015474 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015475 n);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015476
15477 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015478#endif
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015479 for (Py_ssize_t i = 0; i < n; i++) {
15480 PyObject *s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015481 if (PyUnicode_READY(s) == -1) {
Barry Warsawb2e57942017-09-14 18:13:16 -070015482 Py_UNREACHABLE();
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015484 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015485 case SSTATE_INTERNED_IMMORTAL:
15486 Py_REFCNT(s) += 1;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015487#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015488 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015489#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015490 break;
15491 case SSTATE_INTERNED_MORTAL:
15492 Py_REFCNT(s) += 2;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015493#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015494 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015495#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015496 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015497 case SSTATE_NOT_INTERNED:
15498 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015499 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015500 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015501 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015502 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015503 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015504#ifdef INTERNED_STATS
Benjamin Peterson14339b62009-01-31 16:36:08 +000015505 fprintf(stderr, "total size of all interned strings: "
15506 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15507 "mortal/immortal\n", mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015508#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015509 Py_DECREF(keys);
15510 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015511 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015512}
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015513#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015514
15515
15516/********************* Unicode Iterator **************************/
15517
15518typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015519 PyObject_HEAD
15520 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015521 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015522} unicodeiterobject;
15523
15524static void
15525unicodeiter_dealloc(unicodeiterobject *it)
15526{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015527 _PyObject_GC_UNTRACK(it);
15528 Py_XDECREF(it->it_seq);
15529 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015530}
15531
15532static int
15533unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15534{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015535 Py_VISIT(it->it_seq);
15536 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015537}
15538
15539static PyObject *
15540unicodeiter_next(unicodeiterobject *it)
15541{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015542 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015543
Benjamin Peterson14339b62009-01-31 16:36:08 +000015544 assert(it != NULL);
15545 seq = it->it_seq;
15546 if (seq == NULL)
15547 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015548 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015550 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15551 int kind = PyUnicode_KIND(seq);
15552 void *data = PyUnicode_DATA(seq);
15553 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15554 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015555 if (item != NULL)
15556 ++it->it_index;
15557 return item;
15558 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015559
Benjamin Peterson14339b62009-01-31 16:36:08 +000015560 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015561 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015562 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015563}
15564
15565static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015566unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015567{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015568 Py_ssize_t len = 0;
15569 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015570 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015571 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015572}
15573
15574PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15575
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015576static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015577unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015578{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015579 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015580 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015581 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015582 it->it_seq, it->it_index);
15583 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015584 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015585 if (u == NULL)
15586 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015587 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015588 }
15589}
15590
15591PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15592
15593static PyObject *
15594unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15595{
15596 Py_ssize_t index = PyLong_AsSsize_t(state);
15597 if (index == -1 && PyErr_Occurred())
15598 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015599 if (it->it_seq != NULL) {
15600 if (index < 0)
15601 index = 0;
15602 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15603 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15604 it->it_index = index;
15605 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015606 Py_RETURN_NONE;
15607}
15608
15609PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15610
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015611static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015612 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015613 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015614 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15615 reduce_doc},
15616 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15617 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015618 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015619};
15620
15621PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015622 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15623 "str_iterator", /* tp_name */
15624 sizeof(unicodeiterobject), /* tp_basicsize */
15625 0, /* tp_itemsize */
15626 /* methods */
15627 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015628 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015629 0, /* tp_getattr */
15630 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015631 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015632 0, /* tp_repr */
15633 0, /* tp_as_number */
15634 0, /* tp_as_sequence */
15635 0, /* tp_as_mapping */
15636 0, /* tp_hash */
15637 0, /* tp_call */
15638 0, /* tp_str */
15639 PyObject_GenericGetAttr, /* tp_getattro */
15640 0, /* tp_setattro */
15641 0, /* tp_as_buffer */
15642 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15643 0, /* tp_doc */
15644 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15645 0, /* tp_clear */
15646 0, /* tp_richcompare */
15647 0, /* tp_weaklistoffset */
15648 PyObject_SelfIter, /* tp_iter */
15649 (iternextfunc)unicodeiter_next, /* tp_iternext */
15650 unicodeiter_methods, /* tp_methods */
15651 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015652};
15653
15654static PyObject *
15655unicode_iter(PyObject *seq)
15656{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015657 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015658
Benjamin Peterson14339b62009-01-31 16:36:08 +000015659 if (!PyUnicode_Check(seq)) {
15660 PyErr_BadInternalCall();
15661 return NULL;
15662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015663 if (PyUnicode_READY(seq) == -1)
15664 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015665 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15666 if (it == NULL)
15667 return NULL;
15668 it->it_index = 0;
15669 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015670 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015671 _PyObject_GC_TRACK(it);
15672 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015673}
15674
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015675
15676size_t
15677Py_UNICODE_strlen(const Py_UNICODE *u)
15678{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015679 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015680}
15681
15682Py_UNICODE*
15683Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15684{
15685 Py_UNICODE *u = s1;
15686 while ((*u++ = *s2++));
15687 return s1;
15688}
15689
15690Py_UNICODE*
15691Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15692{
15693 Py_UNICODE *u = s1;
15694 while ((*u++ = *s2++))
15695 if (n-- == 0)
15696 break;
15697 return s1;
15698}
15699
15700Py_UNICODE*
15701Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15702{
15703 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015704 u1 += wcslen(u1);
15705 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015706 return s1;
15707}
15708
15709int
15710Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15711{
15712 while (*s1 && *s2 && *s1 == *s2)
15713 s1++, s2++;
15714 if (*s1 && *s2)
15715 return (*s1 < *s2) ? -1 : +1;
15716 if (*s1)
15717 return 1;
15718 if (*s2)
15719 return -1;
15720 return 0;
15721}
15722
15723int
15724Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15725{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015726 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015727 for (; n != 0; n--) {
15728 u1 = *s1;
15729 u2 = *s2;
15730 if (u1 != u2)
15731 return (u1 < u2) ? -1 : +1;
15732 if (u1 == '\0')
15733 return 0;
15734 s1++;
15735 s2++;
15736 }
15737 return 0;
15738}
15739
15740Py_UNICODE*
15741Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15742{
15743 const Py_UNICODE *p;
15744 for (p = s; *p; p++)
15745 if (*p == c)
15746 return (Py_UNICODE*)p;
15747 return NULL;
15748}
15749
15750Py_UNICODE*
15751Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15752{
15753 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015754 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015755 while (p != s) {
15756 p--;
15757 if (*p == c)
15758 return (Py_UNICODE*)p;
15759 }
15760 return NULL;
15761}
Victor Stinner331ea922010-08-10 16:37:20 +000015762
Victor Stinner71133ff2010-09-01 23:43:53 +000015763Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015764PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015765{
Victor Stinner577db2c2011-10-11 22:12:48 +020015766 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015767 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015769 if (!PyUnicode_Check(unicode)) {
15770 PyErr_BadArgument();
15771 return NULL;
15772 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015773 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015774 if (u == NULL)
15775 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015776 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015777 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015778 PyErr_NoMemory();
15779 return NULL;
15780 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015781 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015782 size *= sizeof(Py_UNICODE);
15783 copy = PyMem_Malloc(size);
15784 if (copy == NULL) {
15785 PyErr_NoMemory();
15786 return NULL;
15787 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015788 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015789 return copy;
15790}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015791
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015792
Victor Stinner709d23d2019-05-02 14:56:30 -040015793static int
15794encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015795{
Victor Stinner709d23d2019-05-02 14:56:30 -040015796 int res;
15797 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15798 if (res == -2) {
15799 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15800 return -1;
15801 }
15802 if (res < 0) {
15803 PyErr_NoMemory();
15804 return -1;
15805 }
15806 return 0;
15807}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015808
Victor Stinner709d23d2019-05-02 14:56:30 -040015809
15810static int
15811config_get_codec_name(wchar_t **config_encoding)
15812{
15813 char *encoding;
15814 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15815 return -1;
15816 }
15817
15818 PyObject *name_obj = NULL;
15819 PyObject *codec = _PyCodec_Lookup(encoding);
15820 PyMem_RawFree(encoding);
15821
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015822 if (!codec)
15823 goto error;
15824
15825 name_obj = PyObject_GetAttrString(codec, "name");
15826 Py_CLEAR(codec);
15827 if (!name_obj) {
15828 goto error;
15829 }
15830
Victor Stinner709d23d2019-05-02 14:56:30 -040015831 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15832 Py_DECREF(name_obj);
15833 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015834 goto error;
15835 }
15836
Victor Stinner709d23d2019-05-02 14:56:30 -040015837 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15838 if (raw_wname == NULL) {
15839 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015840 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015841 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015842 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015843
15844 PyMem_RawFree(*config_encoding);
15845 *config_encoding = raw_wname;
15846
15847 PyMem_Free(wname);
15848 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015849
15850error:
15851 Py_XDECREF(codec);
15852 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015853 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015854}
15855
15856
Victor Stinner331a6a52019-05-27 16:39:22 +020015857static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015858init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015859{
Victor Stinner709d23d2019-05-02 14:56:30 -040015860 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015861 PyConfig *config = &tstate->interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015862 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015863 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015864 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015865 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015866 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015867}
15868
15869
Victor Stinner709d23d2019-05-02 14:56:30 -040015870static int
15871init_fs_codec(PyInterpreterState *interp)
15872{
Victor Stinner331a6a52019-05-27 16:39:22 +020015873 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015874
15875 _Py_error_handler error_handler;
15876 error_handler = get_error_handler_wide(config->filesystem_errors);
15877 if (error_handler == _Py_ERROR_UNKNOWN) {
15878 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15879 return -1;
15880 }
15881
15882 char *encoding, *errors;
15883 if (encode_wstr_utf8(config->filesystem_encoding,
15884 &encoding,
15885 "filesystem_encoding") < 0) {
15886 return -1;
15887 }
15888
15889 if (encode_wstr_utf8(config->filesystem_errors,
15890 &errors,
15891 "filesystem_errors") < 0) {
15892 PyMem_RawFree(encoding);
15893 return -1;
15894 }
15895
15896 PyMem_RawFree(interp->fs_codec.encoding);
15897 interp->fs_codec.encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015898 /* encoding has been normalized by init_fs_encoding() */
15899 interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0);
Victor Stinner709d23d2019-05-02 14:56:30 -040015900 PyMem_RawFree(interp->fs_codec.errors);
15901 interp->fs_codec.errors = errors;
15902 interp->fs_codec.error_handler = error_handler;
15903
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015904#ifdef _Py_FORCE_UTF8_FS_ENCODING
15905 assert(interp->fs_codec.utf8 == 1);
15906#endif
15907
Victor Stinner709d23d2019-05-02 14:56:30 -040015908 /* At this point, PyUnicode_EncodeFSDefault() and
15909 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15910 the C implementation of the filesystem encoding. */
15911
15912 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15913 global configuration variables. */
15914 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15915 interp->fs_codec.errors) < 0) {
15916 PyErr_NoMemory();
15917 return -1;
15918 }
15919 return 0;
15920}
15921
15922
Victor Stinner331a6a52019-05-27 16:39:22 +020015923static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015924init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015925{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015926 PyInterpreterState *interp = tstate->interp;
15927
Victor Stinner709d23d2019-05-02 14:56:30 -040015928 /* Update the filesystem encoding to the normalized Python codec name.
15929 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15930 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015931 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015932 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015933 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015934 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015935 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015936 }
15937
Victor Stinner709d23d2019-05-02 14:56:30 -040015938 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015939 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015940 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015941 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015942}
15943
15944
Victor Stinner331a6a52019-05-27 16:39:22 +020015945PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015946_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015947{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015948 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015949 if (_PyStatus_EXCEPTION(status)) {
15950 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015951 }
15952
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015953 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015954}
15955
15956
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015957static void
15958_PyUnicode_FiniEncodings(PyThreadState *tstate)
15959{
15960 PyInterpreterState *interp = tstate->interp;
15961 PyMem_RawFree(interp->fs_codec.encoding);
15962 interp->fs_codec.encoding = NULL;
15963 interp->fs_codec.utf8 = 0;
15964 PyMem_RawFree(interp->fs_codec.errors);
15965 interp->fs_codec.errors = NULL;
15966 interp->fs_codec.error_handler = _Py_ERROR_UNKNOWN;
15967}
15968
15969
#ifdef MS_WINDOWS
/* Switch the filesystem encoding to mbcs with the "replace" error handler
   (PEP 529) and re-initialize the interpreter's filesystem codec.

   Return the result of init_fs_codec() on success, or -1 with MemoryError
   set if either configuration string cannot be allocated (in which case
   the configuration is left unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        /* Both copies exist: replace the old settings. */
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed; release whichever one succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
15995
15996
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015997void
Victor Stinner3d483342019-11-22 12:27:50 +010015998_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015999{
Victor Stinner3d483342019-11-22 12:27:50 +010016000 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016001#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016002 /* Insure++ is a memory analysis tool that aids in discovering
16003 * memory leaks and other memory problems. On Python exit, the
16004 * interned string dictionaries are flagged as being in use at exit
16005 * (which it is). Under normal circumstances, this is fine because
16006 * the memory will be automatically reclaimed by the system. Under
16007 * memory debugging, it's a huge source of useless noise, so we
16008 * trade off slower shutdown for less distraction in the memory
16009 * reports. -baw
16010 */
16011 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016012#endif /* __INSURE__ */
16013
Victor Stinner3d483342019-11-22 12:27:50 +010016014 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016015
Victor Stinner3d483342019-11-22 12:27:50 +010016016 for (Py_ssize_t i = 0; i < 256; i++) {
16017 Py_CLEAR(unicode_latin1[i]);
16018 }
16019 _PyUnicode_ClearStaticStrings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016020 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016021
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016022 _PyUnicode_FiniEncodings(tstate);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016023}
16024
16025
Georg Brandl66c221e2010-10-14 07:04:07 +000016026/* A _string module, to export formatter_parser and formatter_field_name_split
16027 to the string.Formatter class implemented in Python. */
16028
16029static PyMethodDef _string_methods[] = {
16030 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16031 METH_O, PyDoc_STR("split the argument as a field name")},
16032 {"formatter_parser", (PyCFunction) formatter_parser,
16033 METH_O, PyDoc_STR("parse the argument as a format string")},
16034 {NULL, NULL}
16035};
16036
16037static struct PyModuleDef _string_module = {
16038 PyModuleDef_HEAD_INIT,
16039 "_string",
16040 PyDoc_STR("string helper module"),
16041 0,
16042 _string_methods,
16043 NULL,
16044 NULL,
16045 NULL,
16046 NULL
16047};
16048
16049PyMODINIT_FUNC
16050PyInit__string(void)
16051{
16052 return PyModule_Create(&_string_module);
16053}
16054
16055
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016056#ifdef __cplusplus
16057}
16058#endif